Example #1
 def ParseFeedUrls(self):
     """ return list like [(section,title,url),..] """
     urls = []
     for feed in self.feeds:
         section, url = feed[0], feed[1]
         isfulltext = feed[2] if len(feed) > 2 else False
         timeout = CONNECTION_TIMEOUT+15 if isfulltext else CONNECTION_TIMEOUT
         opener = URLOpener(self.host, timeout=timeout)
         result = opener.open(url)
         if result.status_code == 200 and result.content:
             if self.feed_encoding:
                 feed = feedparser.parse(result.content.decode(self.feed_encoding))
             else:
                 feed = feedparser.parse(AutoDecoder().decode(result.content))
             
             urladded = [] # prevent some RSS feeds from producing duplicate articles
             for e in feed['entries'][:self.max_articles_per_feed]:
                 url = e.link
                 if url not in urladded:
                     if isfulltext:
                         desc = e.content[0].value if hasattr(e, 'content') and e.content[0].value else e.summary
                         urls.append((section, e.title, url, desc if desc else u'Has no summary, is it fulltext feed?'))
                     else:
                         urls.append((section, e.title, url, None))
                     urladded.append(url)
         else:
             self.log.warn('fetch rss failed(%d):%s'%(result.status_code,url))
     return urls
Example #2
 def ParseFeedUrls(self):
     """ return list like [(section,title,url,desc),..] """
     urls = []
     for feed in self.feeds:
         section, url = feed[0], feed[1]
         isfulltext = feed[2] if len(feed) > 2 else False
         timeout = self.timeout+10 if isfulltext else self.timeout
         opener = URLOpener(self.host, timeout=timeout)
         result = opener.open(url)
         if result.status_code == 200 and result.content:
             if self.feed_encoding:
                 feed = feedparser.parse(result.content.decode(self.feed_encoding))
             else:
                 feed = feedparser.parse(AutoDecoder().decode(result.content))
             
             urladded = set() # prevent some RSS feeds from producing duplicate articles
             for e in feed['entries'][:self.max_articles_per_feed]:
                 # support HTTPS
                 urlfeed = e.link.replace('http://','https://') if url.startswith('https://') else e.link
                 if urlfeed not in urladded:
                     desc = None
                     if isfulltext:
                         if hasattr(e, 'content') and e.content[0].value:
                             desc = e.content[0].value
                         elif hasattr(e, 'summary'):
                             desc = e.summary
                         else:
                            self.log.warn('feed item invalid, link to webpage for article. (%s)' % e.title)
                     urls.append((section, e.title, urlfeed, desc))
                     urladded.add(urlfeed)
         else:
             self.log.warn('fetch rss failed(%d):%s'%(result.status_code,url))
     return urls
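Examples #1 and #2 differ mainly in how they deduplicate entry links (a list vs. a set) and in upgrading links to HTTPS when the feed itself is HTTPS. A minimal standalone sketch of just that step, assuming only feedparser (the function and variable names are illustrative, not taken from the projects above):

import feedparser

def collect_entry_links(feed_url, xml_text, max_articles=20):
    """Return deduplicated (title, link) pairs, upgrading links to HTTPS
    when the feed itself was fetched over HTTPS."""
    parsed = feedparser.parse(xml_text)
    seen, items = set(), []
    for entry in parsed.entries[:max_articles]:
        link = entry.link
        if feed_url.startswith('https://'):
            link = link.replace('http://', 'https://', 1)
        if link in seen:
            continue
        seen.add(link)
        items.append((entry.title, link))
    return items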
Example #3
def updateFeed(feed):
    """
    function to update the content of a feed
    """
    
    feedData = feedparser.parse(feed.url, etag = feed.etag, modified = time.localtime(feed.lastModified))
    try:
        if feedData.status == 301:
            print "feed url modified. trying the new url..."
            feedData = feedparser.parse(feedData.url, etag = feed.etag, modified = time.localtime(feed.lastModified))
            
        if feedData.status == 304:
            print "No updates"
            
        else:
            print feedData.status
                    
            lastModified = time.localtime(feed.lastModified)
            feed.lastModified = time.mktime(feedData.modified)
            
            for item in feedData.entries:
                if item.updated_parsed > lastModified:
                    _addItem(feed, item)
                    print "Added %s to the database." % item.title #comment this later
    except AttributeError:
        print " Error fetching feeds, Network error???"
Example #4
 def parsefeed(self, feed, retries=1):
     """parse feed using feedparser"""
     try:  # fetch the feed, automatically trying to add or remove a trailing '/'
         feed_data = feedparser.parse(feed.encode('utf-8'))
         if not feed_data.feed.has_key('title'):
             if feed[-1] == '/':
                 feed_data = feedparser.parse(feed[0:-1].encode('utf-8'))
             elif feed[-1] != '/':
                 feed_data = feedparser.parse((feed + '/').encode('utf-8'))
             if not feed_data.feed.has_key('title'):
                 raise UserWarning("read error")
             else:
                 return feed_data
         else:
             return feed_data
     except UserWarning:
         logging.error("fail({}): {}".format(feed, "read error"))
         return None
     except Exception, e:
         if retries > 0:
             logging.error("error({}): {}, retry".format(feed, e))
             return self.parsefeed(feed, retries - 1)  # retry once on a read error
         else:
             logging.error("fail({}): {}".format(feed, e))
             return None
Example #5
 def parsefeed(self, feed, retries=1):
     """parse feed using feedparser"""
     try:  # fetch the feed, automatically trying to add or remove a trailing '/'
         feed_data = feedparser.parse(feed.encode('utf-8'))
         if not feed_data.feed.has_key('title'):
             if feed[-1] == '/':
                 feed_data = feedparser.parse(feed[0:-1].encode('utf-8'))
             elif feed[-1] != '/':
                 feed_data = feedparser.parse((feed + '/').encode('utf-8'))
             if not feed_data.feed.has_key('title'):
                 raise UserWarning("read error")
             else:
                 return feed_data
         else:
             return feed_data
     except UserWarning:
         logging.error("fail({}): {}".format(feed, "read error"))
         return None
     except Exception, e:
         if retries > 0:
             logging.error("error({}): {}, retry".format(feed, e))
             return self.parsefeed(feed, retries - 1)  # retry once on a read error
         else:
             logging.error("fail({}): {}".format(feed, e))
             return None
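Examples #4 and #5 retry by toggling a trailing '/' on the feed address; since feedparser.parse itself almost never raises for a bad URL, the check for a feed title is what actually detects failure. A Python 3 sketch of the same idea (standalone, not taken from the project above):

import logging
import feedparser

def parse_feed_with_slash_retry(url):
    """Try the URL as given, then with the trailing '/' toggled, and
    return the first result that actually carries a feed title."""
    alternate = url[:-1] if url.endswith('/') else url + '/'
    for candidate in (url, alternate):
        data = feedparser.parse(candidate)
        if 'title' in data.feed:
            return data
    logging.error("fail(%s): read error", url)
    return None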
Example #6
def updateFeed(feed):
    """
    function to update the content of a feed
    """

    feedData = feedparser.parse(feed.url, etag = feed.etag, modified = time.localtime(feed.lastModified))
    try:
        if feedData.status == 301:
            print "feed url modified. trying the new url..."
            feedData = feedparser.parse(feedData.url, etag = feed.etag, modified = time.localtime(feed.lastModified))

        if feedData.status == 304:
            print "No updates"

        else:
            print feedData.status

            lastModified = time.localtime(feed.lastModified)
            feed.lastModified = time.mktime(feedData.modified)

            for item in feedData.entries:
                if item.updated_parsed > lastModified:
                    _addItem(feed, item)
                    print "Added %s to the database." % item.title #comment this later
    except AttributeError:
        print " Error fetching feeds, Network error???"
Example #7
File: base.py Project: lovejoy/KindleEar
 def ParseFeedUrls(self):
     """ return list like [(section,title,url),..] """
     urls = []
     opener = URLOpener(self.host)
     decoder = AutoDecoder()
     for section, url in self.feeds:
         result = opener.open(url)
         if result.status_code == 200:
             if self.feed_encoding:
                 feed = feedparser.parse(result.content.decode(self.feed_encoding))
             else:
                 feed = feedparser.parse(decoder.decode(result.content))
             for e in feed['entries'][:self.max_articles_per_feed]:
                 urls.append((section,e.title,e.link))
     return urls
Example #8
def main():
    optp = OptionParser()

    optp.add_option("-a", "--auth", dest="auth", help="GMail http basic auth hash")
    optp.add_option("-b", "--botjid", dest="botJid", help="bot's jabber id")
    optp.add_option("-p", "--botpass", dest="botPass", help="bot's jabber password")
    optp.add_option("-t", "--targetjid", dest="targetJid", help="message recepient jid")

    opts, args = optp.parse_args()


    request = urllib2.Request("https://mail.google.com/mail/feed/atom")
    request.add_header("Authorization", "Basic " + opts.auth)
    result = urllib2.urlopen(request)

    rss = feedparser.parse(result.read())

    if (len(rss['entries']) > 0):

        f = open(os.path.dirname(__file__) + '/msgs.list', 'r+a')
        seens = f.read().split("\n")

        for item in rss['entries']:
            if (item['id'] not in seens and item['author'].find(opts.botJid) < 0):
                sendJabberMessage("*%s* \n%s\n\n%s" % (item['title'], item['author'], item['summary']), opts)
                f.write(item['id'] + "\n")
Example #9
File: work.py Project: tonychi/readLater
    def get(self):

        feed_id, feed_url = self.request.get('feed_id'), \
                self.request.get('feed_url')

        url_result = urllib2.urlopen(feed_url)
        feed_result = fd.parse(url_result)

        if feed_result.bozo == 1:
            logging.error('fetch error, id: %s, url: %s, error: %s', 
                    feed_id, feed_url, feed_result.bozo_exception)
            return

        feed_update_time = feed_result.get('updated', datetime.utcnow())
        has_update = True 

        f = Feed.get_by_id(int(feed_id))
        if f.is_allow_fetch(feed_update_time):
            for entry in feed_result.entries:
                if entry.published_parsed <= f.lastedPublishedTime:
                    logging.info('no updated, id: %s, url: %s', feed_id,
                            feed_url)
                    has_update = False
                    break

                e = Entry(title = entry.title,
                          url = entry.link,
                          author = entry.author,
                          content = entry.content,
                          publishedTime = entry.published_parsed)
                e.put()
                logging.debug('fetch entry, url: %s', entry.link)
Example #10
    def feed_segments(self):
        segment_props = []

        response = fetch_url("http://podcastrss.play.it/the-sports-junkies_mp3_128.xml")
        if response and response.status_code == 200:
            feed = feedparser.parse(response.content)
            feed_items = feed.get('items')
            feed_items.reverse()

            classic_segment_count = 30
            for item in feed_items:
                url = str(item['enclosures'][0]['href']).split('?')[0].replace('http://www.podtrac.com/pts/redirect.mp3/', 'http://')
                props = {
                    'description': item.get('description'),
                    'duration': int(item.get('itunes_duration', 0)),
                    'date': datetime.fromtimestamp(mktime(item['updated_parsed'])).date(),
                    'url': url,
                }
                if is_classic_segment(item):
                    props['is_classic'] = True
                    props['num'] = classic_segment_count
                    classic_segment_count += 1
                else:
                    props['is_classic'] = False
                    props['num'] = parse_segment_number(item['title'])

                printer(self.response.out, "%s\n" % props)
                segment_props.append(props)

        return segment_props
Example #11
def addFeed(feedUrl):
    """
    Function to add a new feed to the database.
    """
    try:
        feedData = feedparser.parse(feedUrl)
    except:
        #this never occurs since parser does not raise any exceptions when invalid url is sent
        print "Invalid feed Url!"

    else:
        try:
            newFeed = Feed(url=unicode(feedUrl),
                           title=feedData.feed.title,
                           lastModified=time.mktime(feedData.modified),
                           etag=unicode(feedData.etag))

            session.commit()

        except AttributeError:
            session.rollback()
            print "Error! Invalid feed URL"
        except:
            session.rollback()
            print "%s \t Feed already subscribed" % (feedData.feed.title)

        else:
            print "Subscribed to \t %s " % (feedData.feed.title)
            fetchFeeds(newFeed, feedData)
Example #12
 def get_messages(self, account, url):
     user = self.session.get_user()
     response, content = oauth.CybozuliveHandler.request(user, account, url)
     if response["status"] != "200":
         raise Exception(response["status"] +
                         " failed to get messages. : " + url)
     result = feedparser.parse(content)
     messages = []
     for entry in result.entries:
         messages.append({
             "id": entry.id,
             "title": entry.title,
             "link": entry.link,
             "author": entry.author,
             "summary": re.sub("\n", '<br/>', utils.escape_html(entry.summary))
                        if hasattr(entry, "summary") else "",
             "updated": datetime.datetime(*entry.updated_parsed[:6]).strftime("%a %b %d %H:%M:%S %Y")
         })
     template_values = {
         'service': 'cybozulive',
         "title": result.feed.title,
         "link": result.feed.link,
         "feed_url": url,
         'messages': messages
     }
     return template_values
Example #13
File: rss.py Project: packetlost/crowy
 def get(self, action=""):
     if action == "messages":
         url = self.request.get('type')
         d = memcache.get(url)
         if d is None:
             result = urllib.urlopen(url)
             d = feedparser.parse(result)
             memcache.set(url, d, 2*60) # cache for 2 minutes
             # in case the RSS format is non-standard
             #if d.bozo == 1:
             #    raise Exception("Can not parse given URL.")
         response = {
             "title": d.feed.get("title"),
             "link": d.feed.get("link"),
             "feed_url": url,
             "messages": []
         }
         for entry in d.entries:
             response["messages"].append({
                 "title": entry.get("title"),
                 "link": entry.get("link"),
                 "updated": datetime.datetime(*entry.updated_parsed[:6]).strftime("%a %b %d %H:%M:%S %Y")
             })
         feed_json = simplejson.dumps(response)
         self.response.headers["Cache-Control"] = "public, max-age=120"
         self.response.headers["Content-Type"] = "application/json"
         return self.response.out.write(feed_json)
     elif action == "add_column":
         tmpl = os.path.join(os.path.dirname(__file__), "../view/rss_add_column.html")
         return self.response.out.write(template.render(tmpl, {}))
     self.error(400)
Example #14
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        urls = []
        tnow = datetime.datetime.utcnow()
        urladded = set()

        for feed in self.feeds:
            section, url = feed[0], feed[1].replace('gzh', 'gzhjs')
            isfulltext = feed[2] if len(feed) > 2 else False
            timeout = self.timeout+10 if isfulltext else self.timeout
            opener = URLOpener(self.host, timeout=timeout)
            result = opener.open(url)
            if result.status_code == 200 and result.content:
                if self.feed_encoding:
                    try:
                        content = result.content.decode(self.feed_encoding)
                    except UnicodeDecodeError:
                        content = AutoDecoder(True).decode(result.content,opener.realurl,result.headers)
                else:
                    content = AutoDecoder(True).decode(result.content,opener.realurl,result.headers)
                content = content[content.index('{'):content.index('}')+1]
                content = json.loads(content)

                for e in content['items'][:self.max_articles_per_feed]:
                    e = feedparser.parse(e)['entries'][0]
                    updated = None
                    if hasattr(e, 'lastmodified') and e.lastmodified:
                        updated = float(e.lastmodified)

                    if self.oldest_article > 0 and updated:
                        updated = datetime.datetime.utcfromtimestamp(updated)
                        delta = tnow - updated
                        if self.oldest_article > 365:
                            threshold = self.oldest_article # in seconds
                        else:
                            threshold = 86400*self.oldest_article # in days

                        if delta.days*86400+delta.seconds > threshold:
                            self.log.info("Skip old article(%s): %s" % (updated.strftime('%Y-%m-%d %H:%M:%S'),e.href))
                            continue

                    # support HTTPS
                    if hasattr(e, 'href'):
                        if url.startswith('https://'):
                            urlfeed = e.href.replace('http://','https://')
                        else:
                            urlfeed = e.href

                        if urlfeed in urladded:
                            continue
                    else:
                        urlfeed = ''

                    desc = None
                    urls.append((section, e.title, urlfeed, desc))
                    urladded.add(urlfeed)
            else:
                self.log.warn('fetch rss failed(%d):%s'%(result.status_code,url))

        return urls
Example #15
File: views.py Project: ryuic/newsy
def blog_search(query):
    cache_key = "newsy-blog-" + sha.new(query).hexdigest()
    entries = cache.get(cache_key)

    if entries == None:
        url = "http://blogsearch.google.co.jp/blogsearch_feeds?"
        params = {
            'hl' : 'en', 
            'q' : query,
            'lr' : 'lang_en',
            'ie' : 'utf-8',
            'num'  : 25,
            'output' : 'rss'}

        d = feedparser.parse(url + urllib.urlencode(params))
        entries = []
        for e in d.entries:
            if 'summary' in e: summary = e.summary
            else: summary = e.description
            entries.append({
                'title' : e.title,
                'link' : e.link,
                'description' : summary
            })

    return entries
Example #16
def get_profiles(accounts):
    profiles = []
    max_results = 100
    for account in accounts:
        if account.service != "cybozulive" or account.access_token == None:
            continue
        # TODO: stop always refreshing the profile info
        start_index = 0
        while True:
            response, content = oauth.CybozuliveHandler.request_with_account(
                account,
                "https://api.cybozulive.com/api/group/V2?max-results=%s&start-index=%s"
                % (str(max_results), str(start_index)))
            account.account_info = unicode(content, 'utf-8')
            account.put()
            account_info = feedparser.parse(account.account_info)
            for group in account_info.entries:
                profiles.append({
                    "service": "cybozulive",
                    "account_name": account.account_name,
                    "url": "cybozulive/post/" + account.account_name + "/" + group.id.split(",")[1],
                    "name": group.title + "/" + account.account_name
                })
            if len(account_info.entries) < max_results:
                break
            start_index += max_results
    return profiles
Example #17
  def fetch(self):
    try:
      result = urlfetch.fetch(self.uri.encode('utf-8'))
    except:
      self.error = 'Can’t Fetch'
      return None
    if result.status_code != 200:
      self.error = 'Can’t Fetch (%d)' % result.status_code
      return None

    try:
      rss = feedparser.parse(result.content)
    except:
      self.error = 'Wrong RSS Format'
      return None

    if not rss or rss.bozo == 1:
      self.error = 'Wrong RSS Format'
      return None

    # extract only the URL, title, and date
    self.error = ''
    self.title = rss.channel.title
    self.entries = []
    for entry in rss.entries:
      e = Entry()
      e.title = entry.title
      e.link = entry.link
      e.updated = entry.updated
      self.entries.append(e)

    return self
Example #18
def addFeed(feedUrl):
    """
    Function to add a new feed to the database.
    """
    try:    
        feedData = feedparser.parse(feedUrl)
    except:
        #this never occurs since parser does not raise any exceptions when invalid url is sent
        print "Invalid feed Url!"
        
    else:
        try:
            newFeed = Feed(url = unicode(feedUrl), title = feedData.feed.title, 
                lastModified = time.mktime(feedData.modified), 
                etag = unicode(feedData.etag))
                
            session.commit()
            
        except AttributeError:
            session.rollback()
            print "Error! Invalid feed URL"
        except:
            session.rollback()
            print "%s \t Feed already subscribed" % (feedData.feed.title)
            
        else:
            print "Subscribed to \t %s " % (feedData.feed.title)
            fetchFeeds(newFeed, feedData)
Example #19
def import_posts(commit=True):
    ideas = Idea.all().fetch(1000)
    #ideas = [Idea.get_by_id(9)]
    print 'Importing posts for %s idea(s)...' % len(ideas)

    to_put = []
    for idea in ideas:
        soup = make_soup(idea.source_url)

        # We get the idea's actual body from the RSS feed
        rss = feedparser.parse(idea_feed_url(idea))
        body = rss.feed.subtitle.replace(
            '\nFeed Created by spigit.com feed manager.', '')
        idea.body = clean_body(body)
        to_put.append(idea)

        headers = soup.find('td', 'main')\
            .findAll('div', 'commentheader', recursive=False)
        for header in headers:
            content = header.findNextSiblings('div', limit=1)[0]
            post = make_post(idea, header, content, commit=False)
            to_put.extend(post)

    to_put = filter(None, to_put)

    if commit:
        db.put(to_put)

    return to_put
Example #20
    def get_rssfeed_parsed(self, rssfeed_data, cookies=None, cookie_header={}):
        """
        rssfeed_data: A dictionary containing rss feed data as stored in the YaRSS2 config.
        cookies: A dictionary of cookie values as stored in the YaRSS2 config. If given, the cookie_header parameter will not be used.
        cookie_header: A dictionary of cookie values as returned by yarss2.http.get_cookie_header.
        """
        return_dict = {}
        rssfeeds_dict = {}

        if cookies:
            cookie_header = http.get_cookie_header(cookies,
                                                   rssfeed_data["site"])

        self.log.info("Fetching RSS Feed: '%s' with Cookie: '%s'." %
                      (rssfeed_data["name"], cookie_header))

        # Will abort after 10 seconds if server doesn't answer
        try:
            parsed_feeds = feedparser.parse(rssfeed_data["url"],
                                            request_headers=cookie_header,
                                            agent=self.agent,
                                            timeout=10)
        except Exception, e:
            self.log.warn("Exception occured in feedparser:" + str(e))
            self.log.warn(
                "Feedparser was called with url: '%s' and header: '%s'" %
                (rssfeed_data["url"], cookie_header))
            self.log.warn("Stacktrace:" + common.get_exception_string())
            return None
Example #21
  def post(self):
    stream = FeedStream.get(db.Key(self.request.POST.get("key")))
    if stream is None:
      logging.warn("feedstream not found for subscription request")
      self.response.out.write("feedstream not found for subscription request")
      self.error(404)
      return

    feed = feedparser.parse(stream.url)
    if hasattr(feed, 'feed') and hasattr(feed.feed, 'links'):
      hub_url = find_feed_url('hub', feed.feed.links)
      if hub_url is None:
        logging.info("no hub found for: %s" % stream.url)
        self.response.out.write('no hub found')
        return
      else:
        logging.info("sending pshb subscription request for: %s" % stream.url)
        stream.pshb_hub_url = hub_url
        stream.put()
        self.subscribe_to_topic(stream, hub_url)
        self.response.out.write('sent subscription request')
        return

    logging.warn('could not parse feed; unable to initiate subscription')
    self.response.out.write('could not parse feed; unable to initiate subscription')
    self.error(400)
Example #22
def get_feed(channel_id):
       
    feedurl = 'https://www.youtube.com/feeds/videos.xml?channel_id=' + str(channel_id)
    try:
        d = feedparser.parse(feedurl)
    except Exception as e:
        logging.info('exception caught {}'.format(e))
        return None
    return d
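Note that feedparser.parse rarely raises for an unreachable or malformed feed; it returns a result with bozo set to 1 and the error in bozo_exception, so the except clause above will seldom fire. A sketch of the bozo-based check (standalone, not part of the original project):

import logging
import feedparser

def get_feed_checked(channel_id):
    feed_url = 'https://www.youtube.com/feeds/videos.xml?channel_id=' + str(channel_id)
    d = feedparser.parse(feed_url)
    if d.bozo:  # parse or network problem (may be non-fatal)
        logging.info('feed not parsed cleanly: {}'.format(d.bozo_exception))
        return None
    return d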
Example #23
def main(bot, user, target, msg):
    feed = feedparser.parse(bot.memory['feeds']['civfanatics']['url'])
    bot.message(
        user, target, feed.feed.title + ' :: ' + feed.entries[0].title + ': ' +
        feed.entries[0].summary_detail.value + ' - ' +
        feed.entries[0].updated + ' - ' +
        xrl.xrl_encoder(connection, event, feed.entries[0].link))
    bot.memory['feeds']['civfanatics']['last_title'] = feed.feed.title
Example #24
 def parse_feed(self, feed):
   """Helper method to handle conditional HTTP stuff"""
   try:
     logging.info("Requesting Feed for: %s" % feed.url)
     if feed.http_etag is not None and len(feed.http_etag) > 0 and feed.http_last_modified is not None:
       # give feedparser back what it pulled originally, a time.struct_time object
       return feedparser.parse(feed.url, etag=feed.http_etag, modified=feed.http_last_modified.timetuple())
     if feed.http_etag is not None and len(feed.http_etag) > 0:
       return feedparser.parse(feed.url, etag=feed.http_etag)
     if feed.http_last_modified is not None:
       # give feedparser back what it pulled originally, a time.struct_time object
       return feedparser.parse(feed.url, modified=feed.http_last_modified.timetuple())
     else:
       return feedparser.parse(feed.url)
   except UnicodeDecodeError:
       logging.error("Unicode error parsing feed: %s" % feed.url)
       return None
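Example #24 relies on feedparser's conditional-GET support: when the etag and/or modified values from a previous fetch are passed back and the server answers 304, the result has status == 304 and an empty entries list. A minimal sketch of that round trip, with an illustrative in-memory cache:

import feedparser

_conditional_cache = {}  # url -> (etag, modified); illustrative in-memory cache

def fetch_if_changed(url):
    etag, modified = _conditional_cache.get(url, (None, None))
    d = feedparser.parse(url, etag=etag, modified=modified)
    if getattr(d, 'status', None) == 304:
        return None  # unchanged since the last fetch
    _conditional_cache[url] = (getattr(d, 'etag', None), getattr(d, 'modified', None))
    return d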
Example #25
 def get_account_info(self):
     resp, content = self.oauth_request("https://api.cybozulive.com/api/group/V2")
     if resp["status"] == "200":
         content = unicode(content,'utf-8')
         d = feedparser.parse(content)
         return d.feed.author_detail.email, None, None, content
     else:
         raise Exception("failed to verify credentials")
Example #26
def GOODREADS(host=None, feednr=None, priority=0, dispname=None, test=False):
    """
    Goodreads RSS query function, return all the results in a list, can handle multiple wishlists
    but expects goodreads format (looks for goodreads category names)
    """
    results = []
    basehost = host
    if not str(host)[:4] == "http":
        host = 'http://' + host

    URL = host

    result, success = fetchURL(URL)

    if test:
        return success

    if success:
        data = feedparser.parse(result)
    else:
        logger.error('Error fetching data from %s: %s' % (host, result))
        BlockProvider(basehost, result)
        return []

    if data:
        logger.debug('Parsing results from %s' % URL)
        provider = data['feed']['link']
        if not dispname:
            dispname = provider
        logger.debug("RSS %s returned %i result%s" %
                     (provider, len(data.entries), plural(len(data.entries))))
        for post in data.entries:
            title = ''
            book_id = ''
            author_name = ''
            isbn = ''
            if 'title' in post:
                title = post.title
            if 'book_id' in post:
                book_id = post.book_id
            if 'author_name' in post:
                author_name = post.author_name
            if 'isbn' in post:
                isbn = post.isbn
            if title and author_name:
                results.append({
                    'rss_prov': provider,
                    'rss_feed': feednr,
                    'rss_title': title,
                    'rss_author': author_name,
                    'rss_bookid': book_id,
                    'rss_isbn': isbn,
                    'priority': priority,
                    'dispname': dispname
                })
    else:
        logger.debug('No data returned from %s' % host)
    return results
Example #27
def GOODREADS(host=None, feednr=None, priority=0, dispname=None, test=False):
    """
    Goodreads RSS query function, return all the results in a list, can handle multiple wishlists
    but expects goodreads format (looks for goodreads category names)
    """
    results = []
    basehost = host
    if not str(host)[:4] == "http":
        host = 'http://' + host

    URL = host

    result, success = fetchURL(URL)

    if test:
        return success

    if success:
        data = feedparser.parse(result)
    else:
        logger.error('Error fetching data from %s: %s' % (host, result))
        BlockProvider(basehost, result)
        return []

    if data:
        logger.debug('Parsing results from %s' % URL)
        provider = data['feed']['link']
        if not dispname:
            dispname = provider
        logger.debug("RSS %s returned %i result%s" % (provider, len(data.entries), plural(len(data.entries))))
        for post in data.entries:
            title = ''
            book_id = ''
            author_name = ''
            isbn = ''
            if 'title' in post:
                title = post.title
            if 'book_id' in post:
                book_id = post.book_id
            if 'author_name' in post:
                author_name = post.author_name
            if 'isbn' in post:
                isbn = post.isbn
            if title and author_name:
                results.append({
                    'rss_prov': provider,
                    'rss_feed': feednr,
                    'rss_title': title,
                    'rss_author': author_name,
                    'rss_bookid': book_id,
                    'rss_isbn': isbn,
                    'priority': priority,
                    'dispname': dispname
                })
    else:
        logger.debug('No data returned from %s' % host)
    return results
Example #28
def request_feed(url, **kwargs):
    """
    Wrapper for `request_response', which will return a feed object.
    """

    response = request_response(url, **kwargs)

    if response is not None:
        return feedparser.parse(response.content)
Example #29
def request_feed(url, **kwargs):
    """
    Wrapper for `request_response', which will return a feed object.
    """

    response = request_response(url, **kwargs)

    if response is not None:
        return feedparser.parse(response.content)
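Examples #28 and #29 depend on an external request_response helper. A self-contained variant using the requests library is sketched below (the use of requests and the 10-second timeout are assumptions, not part of the original code):

import feedparser
import requests

def request_feed_simple(url, **kwargs):
    """Fetch a URL with requests and return the parsed feed, or None on
    any network/HTTP error instead of raising."""
    try:
        response = requests.get(url, timeout=10, **kwargs)
        response.raise_for_status()
    except requests.RequestException:
        return None
    return feedparser.parse(response.content)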
Example #30
File: main.py Project: koki/focus
def add_channel_to_db(url):
    used_groups = created_groups = 0 # debug info
    feed = feedparser.parse(url)
    if feed['bozo'] == 1:
        raise "RSS is not well-formed XML"

    items = feed['items']

    db = DataBase()
    group_name = db.get_last_group_id()
    messages = db.get_news()

    for item in items:
        item_hash = get_hashes(plain_from_html(item['summary']))
        equal = message_id = 0
        for message in messages:
            # convert string to hash
            mess_hash = []
            for x in re.split(' ', message['hashes']):
                if x != '': mess_hash.append(int(x))
                
            t_equal = compare_hashes(item_hash, mess_hash)    
            if t_equal > equal:
                equal = t_equal
                message_id = message['message_id']

        # convert hashes to string
        str_hash = u""
        for h in item_hash:
            str_hash += " %i" % h 

        if equal > 0:
            used_groups = used_groups + 1 # debug
            # add new message
            db.add_message(feed['channel']['title'],
                           plain_from_html(item['title']),
                           plain_from_html(item['summary']),
                           str_hash,
                           db.get_message_group(message_id))

        else:
            created_groups = created_groups + 1 # debug
            # create new group
            db.add_new_group("Topic %i" % group_name)
            # add new message
            db.add_message(feed['channel']['title'],
                           plain_from_html(item['title']),
                           plain_from_html(item['summary']),
                           str_hash,
                           group_name)
            # increment group id
            group_name = group_name+1

    if __debug__:
        print "new groups %i, used groups %i" % \
              (created_groups, used_groups)
Example #31
File: base.py Project: lovejoy/KindleEar
 def Items(self):
     itemsprocessed = []
     cnt4debug = 0
     opener = URLOpener(self.host)
     decoder = AutoDecoder()
     for section, url in self.feeds:
         content = None
         cnt4debug += 1
         if IsRunInLocal and cnt4debug > 1:
             break
         
         result = opener.open(url)
         status_code, content = result.status_code, result.content
         if status_code != 200 and content:
             logging.error('err(%d) to fetch %s.' % (status_code,url))
             continue
         
         if self.feed_encoding:
             content = content.decode(self.feed_encoding)
         else:
             content = decoder.decode(content)
         
         content = self.preprocess(content)
         
         feed = feedparser.parse(content)
         for e in feed['entries']:
             # if the full-text RSS contains ads or other unwanted content, it can be removed in postprocess
             desc = self.postprocess(e.description)
             desc = self.FragToXhtml(desc, e.title, self.feed_encoding)
             
             if self.keep_image:
                 soup = BeautifulSoup(content)
                 self.soupbeforeimage(soup)
                 for img in soup.findAll('img'):
                     imgurl = img['src']
                     if not imgurl.startswith('http') and not imgurl.startswith('www'):
                         imgurl = self.urljoin(url, imgurl)
                     imgresult = opener.open(imgurl)
                     imgcontent = imgresult.content if imgresult.status_code == 200 else None
                     if imgcontent:
                         imgtype = imghdr.what(None, imgcontent)
                         if imgtype:
                             imgmime = r"image/" + imgtype
                             if imgtype == 'jpeg':
                                 fnimg = "%d.jpg" % random.randint(10000,99999999)
                             else:
                                 fnimg = "%d.%s" % (random.randint(10000,99999999), imgtype)
                             img['src'] = fnimg
                             yield (imgmime, imgurl, fnimg, imgcontent)
                 self.soupprocessex(soup)
                 desc = soup.renderContents('utf-8').decode('utf-8')
                 soup = None
             
             if e.title not in itemsprocessed and desc:
                 itemsprocessed.append(e.title)
                 yield (section, e.link, e.title, desc)
Example #32
 def ParseFeedUrls(self):
     """ return list like [(section,title,url,desc),..] """
     urls = []
     tnow = datetime.datetime.utcnow()
     urladded = set()
     
     for feed in self.feeds:
         section, url = feed[0], feed[1]
         isfulltext = feed[2] if len(feed) > 2 else False
         timeout = self.timeout+10 if isfulltext else self.timeout
         opener = URLOpener(self.host, timeout=timeout)
         result = opener.open(url)
         if result.status_code == 200 and result.content:
             if self.feed_encoding:
                 try:
                     content = result.content.decode(self.feed_encoding)
                 except UnicodeDecodeError:
                     content = AutoDecoder(True).decode(result.content,url)
             else:
                 content = AutoDecoder(True).decode(result.content,url)
             feed = feedparser.parse(content)
             
             for e in feed['entries'][:self.max_articles_per_feed]:
                 updated = None
                 if hasattr(e, 'updated_parsed') and e.updated_parsed:
                     updated = e.updated_parsed
                 elif hasattr(e, 'published_parsed') and e.published_parsed:
                     updated = e.published_parsed
                 elif hasattr(e, 'created_parsed'):
                     updated = e.created_parsed
                     
                 if self.oldest_article > 0 and updated:
                     delta = tnow - datetime.datetime(*(updated[0:6]))
                     if delta.days*86400+delta.seconds > 86400*self.oldest_article:
                         self.log.info("Skip old article: %s" % e.link)
                         continue
                 
                 # support HTTPS
                 urlfeed = e.link.replace('http://','https://') if url.startswith('https://') else e.link
                 if urlfeed in urladded:
                     continue
                     
                 desc = None
                 if isfulltext:
                     if hasattr(e, 'description'):
                         desc = e.description
                     elif hasattr(e, 'content') and e.content[0]['value']:
                         desc = e.content[0]['value']
                     else:
                         self.log.warn('fulltext feed item has no desc, link to webpage for article. (%s)' % e.title)
                 urls.append((section, e.title, urlfeed, desc))
                 urladded.add(urlfeed)
         else:
             self.log.warn('fetch rss failed(%d):%s'%(result.status_code,url))
     
     return urls
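Example #32 falls back through updated_parsed, published_parsed and created_parsed to decide whether an entry is too old. That age check can be isolated into a small helper, sketched here under the same assumptions (struct_time timestamps, UTC comparison):

import datetime

def entry_is_recent(entry, oldest_days):
    """True when the feedparser entry has a parseable timestamp newer than
    `oldest_days` days; undated entries are kept."""
    ts = (getattr(entry, 'updated_parsed', None)
          or getattr(entry, 'published_parsed', None)
          or getattr(entry, 'created_parsed', None))
    if not ts:
        return True
    age = datetime.datetime.utcnow() - datetime.datetime(*ts[:6])
    return age.total_seconds() <= oldest_days * 86400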
Example #33
    def ParseFeedUrls(self):
        # parse the XML and return the relevant info
        """ return list like [(section,title,url,desc),..] """
        urls = []
        tnow = datetime.datetime.utcnow()
        urladded = set()

        for feed in self.feeds:
            section, url = feed[0], feed[1]
            isfulltext = feed[2] if len(feed) > 2 else False
            timeout = self.timeout + 10 if isfulltext else self.timeout
            opener = URLOpener(self.host, timeout=timeout)
            result = opener.open(url)

            if result.code == 200 and result.content:
                if self.feed_encoding:
                    content = result.content.decode(self.feed_encoding)
                else:
                    content = AutoDecoder(True).decode(result.content, url)
                feed = feedparser.parse(content)  # parse the content

                # walk the retrieved entries
                for e in feed['entries'][:self.max_articles_per_feed]:  # take the configured number of entries
                    if self.oldest_article > 0 and hasattr(e, 'updated_parsed'):  # has an update timestamp?
                        updated = e.updated_parsed
                        if updated:
                            delta = tnow - datetime.datetime(*(updated[0:6]))
                            # decide whether to keep the article based on its age
                            if delta.days * 86400 + delta.seconds > 86400 * self.oldest_article:
                                self.log.info("Skip old article: %s" % e.link)
                                continue
                    # support HTTPS
                    urlfeed = e.link.replace('http://', 'https://') if url.startswith('https://') else e.link
                    if urlfeed in urladded:
                        continue

                    desc = None
                    if isfulltext:
                        if hasattr(e, 'content') and e.content[0]['value']:
                            desc = e.content[0]['value']
                        elif hasattr(e, 'description'):
                            desc = e.description
                        else:
                            self.log.warn(
                                'fulltext feed item has no desc, link to webpage for article. (%s)'
                                % e.title)
                    urls.append((section, e.title, urlfeed, desc))
                    urladded.add(urlfeed)
            else:
                self.log.warn('fetch rss failed(%d):%s' % (result.code, url))

        return urls
Example #34
 def cache(self):
     """cache newest feed"""
     xdm.common.MM.clearRole('news')
     self.news = []
     self.feed = feedparser.parse(XDM_FEED_URL)
     for e in self.feed.entries:
         tags = []
         for tag in e.tags:
             tags.append(tag['term'])
         self.news.append(SimpleNews(e.summary_detail.value, e.link, tags))
Example #35
File: news.py Project: Jeffroiscool/XDM
 def cache(self):
     """cache newest feed"""
     xdm.common.MM.clearRole('news')
     self.news = []
     self.feed = feedparser.parse(XDM_FEED_URL)
     for e in self.feed.entries:
         tags = []
         for tag in e.tags:
             tags.append(tag['term'])
         self.news.append(SimpleNews(e.summary_detail.value, e.link, tags))
Example #36
 def get(self):
     rawdata = """
     <rss channel="2.0">
     <channel>
     <title>Sample feed</title>
     </channel>
     </rss>
     """
     d = feedparser.parse(rawdata)
     self.response.write(d['feed']['title'])
     self.response.write('Hello world!')
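Example #36 works because feedparser.parse accepts a URL, a local file path, or a raw XML string. The same check can be made outside a request handler:

import feedparser

RAW = '<rss version="2.0"><channel><title>Sample feed</title></channel></rss>'
parsed = feedparser.parse(RAW)
assert parsed.feed.title == 'Sample feed'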
Example #37
def autofeed(bot):
    for feed in feeds:
        rss = feedparser.parse(feeds[feed]['url'])
        if bot.memory['feeds'][feed]['last_title'] == rss.feed.title:
            pass
        else:
            bot.msg(
                feeds[feed], rss.feed.title + ' :: ' + rss.entries[0].title +
                ': ' + rss.entries[0].summary_detail.value + ' - ' +
                rss.entries[0].updated + ' - ' +
                xrl.xrl_encoder(rss.entries[0].link))
            feeds[feed]['last_title'] = rss.feed.title
Example #38
    def get(self):

        # Incoming url parameters
        # iGoogle prepends var names with 'up_' for some reason...
        subreddits = self.request.get("up_subreddits")  # Pipe separated list of subreddits for top menu
        width = self.request.get("up_width", "500")  # Truncate headline chars, default 500
        imgur_switch = self.request.get("up_imgur", 1)  # Imgur mirror, 1=imgur, 2=mirur, 3=filmot
        feed = self.request.get("r", "all")  # Current requested subreddit feed

        try:
            # Fetch and parse the feed
            rss = urlfetch.fetch(self.feed_to_url(feed), headers={"Cache-Control": "max-age=0"})
            parsed = feedparser.parse(rss.content)
            stories = []

            for entry in parsed.entries:
                comment_count = self.get_comment_count(entry.summary_detail)

                # Extract the external link url, and transform imgur links to requested mirror
                external_link = self.transform_url(self.get_external_link(entry.summary_detail), int(imgur_switch))

                # Build a hash object for each story...
                parsed_story_hash = {
                    "full_title": entry.title_detail.value,  # Full, non-truncated title just in case
                    "fixed_width_title": self.truncate(entry.title_detail.value, int(width)),
                    "external_link": external_link,
                    "comment_link": entry.link,
                    "comment_count": comment_count,
                }

                # ... and append to the stories list
                stories.append(parsed_story_hash)

            # The main data is 'stories'.  Everything else is
            # there to persist the URL parameters.
            template_data = {
                "subreddits": subreddits.split("|"),
                "link_subreddits": subreddits,
                "link_imgur": imgur_switch,
                "width": width,
                "current_feed": str(feed),
                "stories": stories,
            }

            # Finally, render the template with the data
            path = os.path.join(os.path.dirname(__file__), "index.html")
            self.response.out.write(template.render(path, template_data))

        except:
            self.response.out.write(
                '<a style="font-size:1em;font-weight:bold;text-decoration:none;" href="http://www.downforeveryoneorjustme.com/reddit.com">is reddit down?</a>'
            )
Example #39
File: main.py Project: koki/focus
def print_channel_info(url, num=0):
    feed = feedparser.parse(url)
    if feed['bozo'] == 1:
        raise "RSS is not well-formed XML"

    items = feed["items"]
    print "------------------------------------------------------------"
    print feed["channel"]["title"], feed["channel"]["link"], " (", \
          len(items), ")"
    print "------------------------------------------------------------"
    print "First message:"
    print "+ ", items[num]["title"], " +"
    print plain_from_html(items[num]["summary"])
Example #40
  def get(self):
   
      # Incoming url parameters
      # iGoogle prepends var names with 'up_' for some reason...
      subreddits = self.request.get('up_subreddits')    # Pipe separated list of subreddits for top menu
      width = self.request.get('up_width', '500')       # Truncate headline chars, default 500
      imgur_switch = self.request.get('up_imgur', 1)    # Imgur mirror, 1=imgur, 2=mirur, 3=filmot
      feed = self.request.get('r', 'all')               # Current requested subreddit feed
      
      try:
        # Fetch and parse the feed
        rss = urlfetch.fetch(self.feed_to_url(feed), headers = {'Cache-Control' : 'max-age=0'})
        parsed = feedparser.parse(rss.content)
        stories = []

        for entry in parsed.entries:     
          comment_count = self.get_comment_count(entry.summary_detail)
    
          # Extract the external link url, and transform imgur links to requested mirror
          external_link = self.transform_url(self.get_external_link(entry.summary_detail), int(imgur_switch))

          # Build a hash object for each story...
          parsed_story_hash = {
            'full_title' : entry.title_detail.value, # Full, non-truncated title just in case
            'fixed_width_title' : self.truncate(entry.title_detail.value, int(width)),
            'external_link' :  external_link,
            'comment_link' : entry.link,
            'comment_count' : comment_count
          }
          
          # ... and append to the stories list
          stories.append(parsed_story_hash)

        # The main data is 'stories'.  Everything else is 
        # there to persist the URL parameters.
        template_data = {
          'subreddits' : subreddits.split('|'),
          'link_subreddits' : subreddits,
          'link_imgur' : imgur_switch,
          'width' : width,
          'current_feed' : str(feed),
          'stories' : stories
        }
        
        # Finally, render the template with the data
        path = os.path.join(os.path.dirname(__file__), 'index.html')
        self.response.out.write(template.render(path, template_data))

      except:
        self.response.out.write('<a style="font-size:1em;font-weight:bold;text-decoration:none;" href="http://www.downforeveryoneorjustme.com/reddit.com">is reddit down?</a>')
Example #41
    def get(self):

        breakingstories = getAllBreakingStories("bbc")
        feed = feedparser.parse('http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/breaking_news/rss.xml')
        articles = feed['entries']

        for a in articles:
            if not headlineInList(a['title'], breakingstories):
                breakingstory = BreakingStory()
                breakingstory.source = 'bbc'
                breakingstory.url = a['link']
                breakingstory.headline = a['title']
                breakingstory.description = a['description']
                breakingstory.put()
                self.response.out.write("added story: " + a['title'] + ' (' + a['link'] + ') <br />')
Example #42
File: tasks.py Project: saga/kindledump
def fetch_feeds(request):
    feed_key = request.POST.get('feed_key', None)

    if feed_key is None:
        logging.error('missing parameter')
        raise TypeError('missing parameter')

    feed = Feed.get(feed_key)
    if feed is None:
        logging.error('Feed object not found: %s', feed_key)
        raise TypeError('Feed object not found')

    parser = feedparser.parse(feed.url)

    # check whether the parsed result is a valid feed
    if hasattr(parser, 'bozo_exception'):
        feed.is_valid = False
        logging.warn('Invalid feed: %s;;%s', feed.id, feed.url)
        feed.put()
        return

    # setup feed title if does not exist
    if not feed.title:
        feed.title = parser.feed.title

    rd = ReadyData.gql("WHERE data_type = :1 AND owner = :2 LIMIT 1",
                'feed', feed.owner).get()
    if rd is None:
        rd = ReadyData(owner=feed.owner, data_type='feed')
        rd.content = ''

    for e in parser['entries']:
        # TODO - check the date

        article = '<h1>%(title)s</h1>' % e
        for content in e['content']:
            article += content['value']

        rd.content += article
        rd.merged += 1

    rd.put()
    feed.put()

    params = {'ready_data_key': rd.key()}
    taskqueue.add(url=reverse('fetcher-send'), params=params)

    return True
Example #43
def update(user, auto_save=True):
    logging.debug('Checking in for %s' % user.username)

    poster = audioscrobbler.AudioScrobblerPost(username=user.username,
                                               password=user.password,
                                               password_is_md5=True,
                                               verbose=True)
    poster.auth()

    contents = urlfetch.fetch(user.rss_url).content
    # hack around the namespacing.
    contents = contents.replace('<rhap:', '<').replace('</rhap:', '</')

    tracks_played = feedparser.parse(contents)
    for i in xrange(len(tracks_played['entries']) - 1, -1, -1):
        # skip a track we've already submitted
        # see this date ridiculousness? Bleh.
        played_at = datetime.datetime(
            *tracks_played['entries'][i]['updated_parsed'][0:6])

        if user.last_updated >= played_at:
            continue

        track = dict(artist_name=tracks_played['entries'][i]['artist'],
                     song_title=tracks_played['entries'][i]['track'],
                     length=tracks_played['entries'][i]['duration'],
                     date_played=int(
                         time.mktime(
                             tracks_played['entries'][i]['updated_parsed'])),
                     album=tracks_played['entries'][i]['album'])
        poster.add_track(**track)
        user.submitted_tracks.insert(
            0, '%s - %s' % (track['artist_name'], track['song_title']))

    # bulk submit
    num_submitted = poster.flush_cache()
    total_submitted = num_submitted + user.num_submitted
    logging.debug('\tSubmitted %d tracks (%d total)' %
                  (num_submitted, total_submitted))

    user.submitted_tracks = user.submitted_tracks[0:50]
    user.num_submitted = total_submitted

    if auto_save:
        user.put()
Example #44
def addFeed(feedUrl):
    """
    Function to add a new feed to the database.
    """
    try:
        feedUrl = autorss.getRSSLink(feedUrl)
        feedData = feedparser.parse(feedUrl)
    except:
        #this never occurs since parser does not raise any exceptions when invalid url is sent
        print "Invalid feed Url!"
        raise FeedError

    else:
        try:
            newFeed = Feed(url = unicode(feedUrl), title = feedData.feed.title,
                lastModified = time.mktime(feedData.modified),
                etag = unicode(feedData.etag))

            session.commit()

        except AttributeError:
            session.rollback()
            print "Error! Invalid feed URL"
            raise FeedError
        except:
            session.rollback()
            print "%s \t Feed already subscribed" % (feedData.feed.title)
            raise FeedError

        else:
            try:
                # Get the topics list and assign the feed to all the available topics.
                topicsList = Topic.query.all()
                for topic in topicsList:
                    setFeedTopic(newFeed, topic, False)
                session.commit()
                print "Added %s to all topics" % newFeed.title
            except:
                session.rollback()
                print "Error setting up topics to the Feed"
                raise FeedError

            print "Subscribed to \t %s " % (feedData.feed.title)
            fetchFeeds(newFeed, feedData)
            topicsList = Topic.query.all()
Example #45
def nzbs(provider=None, forcerss=False):

    feedthis = []

    def _parse_feed(site, url, verify):
        logger.fdebug('[RSS] Fetching items from ' + site)
        payload = None
        headers = {'User-Agent': str(mylar.USER_AGENT)}

        try:
            r = requests.get(url,
                             params=payload,
                             verify=verify,
                             headers=headers)
        except Exception, e:
            logger.warn('Error fetching RSS Feed Data from %s: %s' % (site, e))
            return

        feedme = feedparser.parse(r.content)

        feedthis.append({"site": site, "feed": feedme})
Example #46
def get_train_info(list_lines: list) -> list:
    """
        tetsudo.comのRSS情報から指定した路線の情報を取得する
        NOTE: titleは 【{{鉄道名?}}】"."join({{路線名}} みたいな感じで出るっぽいが(2020/07/05時点)パターンは要確認
              ex) 【JR九州】鹿児島本線・肥薩線・指宿枕崎線・日豊本線・日南線・吉都線・特急列車
              上記にinで検索をかけるので鉄道会社でも可能かつ曖昧検索になる
    """
    RSS_URL = "http://api.tetsudo.com/traffic/rss20.xml"
    JST = timezone(timedelta(hours=+9))
    now = datetime.now(JST)

    rss_data = feedparser.parse(RSS_URL)
    #NOTE: a fixed window of the most recent hour is used instead of the time elapsed since the previous run,
    #      so some entries may be missed depending on when `now` falls; this is accepted to avoid having to store state anywhere
    recent_entry = filter(
        lambda entry:
        #TODO: a fixed 1 hour is hard-coded here; widen the acceptance window a bit later
        now - datetime_from_rfc822(entry['published']) < timedelta(hours=1) and
        any(line in entry['title'] for line in list_lines),
        rss_data['entries'])
    for entry in recent_entry:
        #TODO: remove later (kept around for a while for verification)
        print(entry)
        yield entry
Example #47
def Startit(searchName, searchIssue, searchYear, ComicVersion, IssDateFix):
    #searchName = "Uncanny Avengers"
    #searchIssue = "01"
    #searchYear = "2012"
    #clean up searchName due to webparse.
    searchName = searchName.replace("%20", " ")
    if "," in searchName:
        searchName = searchName.replace(",", "")
    logger.fdebug("name:" + str(searchName))
    logger.fdebug("issue:" + str(searchIssue))
    logger.fdebug("year:" + str(searchYear))
    splitSearch = searchName.split(" ")
    joinSearch = "+".join(splitSearch) + "+" + searchIssue
    searchIsOne = "0" + searchIssue
    searchIsTwo = "00" + searchIssue

    if "-" in searchName:
        searchName = searchName.replace("-", '((\\s)?[-:])?(\\s)?')

    regexName = searchName.replace(" ", '((\\s)?[-:])?(\\s)?')

    #logger.fdebug('searchName:' + searchName)
    #logger.fdebug('regexName:' + regexName)

    if mylar.USE_MINSIZE:
        size_constraints = "minsize=" + str(mylar.MINSIZE)
    else:
        size_constraints = "minsize=10"

    if mylar.USE_MAXSIZE:
        size_constraints = size_constraints + "&maxsize=" + str(mylar.MAXSIZE)

    if mylar.USENET_RETENTION != None:
        max_age = "&age=" + str(mylar.USENET_RETENTION)
    else:
        max_age = ""

    feeds = []
    feeds.append(
        feedparser.parse(
            "http://nzbindex.nl/rss/alt.binaries.comics.dcp/?sort=agedesc&" +
            str(size_constraints) + str(max_age) +
            "&dq=%s&max=50&more=1" % joinSearch))
    if mylar.ALTEXPERIMENTAL:
        feeds.append(
            feedparser.parse(
                "http://nzbindex.nl/rss/?dq=%s&g[]=41&g[]=510&sort=agedesc&hidespam=0&max=&more=1"
                % joinSearch))

    entries = []
    mres = {}
    tallycount = 0

    for feed in feeds:
        totNum = len(feed.entries)
        tallycount += len(feed.entries)

        #keyPair = {}
        keyPair = []
        regList = []
        countUp = 0

        logger.fdebug(str(totNum) + " results")

        while countUp < totNum:
            urlParse = feed.entries[countUp].enclosures[0]
            #keyPair[feed.entries[countUp].title] = feed.entries[countUp].link
            #keyPair[feed.entries[countUp].title] = urlParse["href"]
            keyPair.append({
                "title": feed.entries[countUp].title,
                "link": urlParse["href"],
                "length": urlParse["length"],
                "pubdate": feed.entries[countUp].updated
            })

            countUp = countUp + 1

        # thanks to SpammyHagar for spending the time in compiling these regEx's!

        regExTest = ""

        regEx = "(%s\\s*(0)?(0)?%s\\s*\\(%s\\))" % (regexName, searchIssue,
                                                    searchYear)
        regExOne = "(%s\\s*(0)?(0)?%s\\s*\\(.*?\\)\\s*\\(%s\\))" % (
            regexName, searchIssue, searchYear)

        #Sometimes comics aren't actually published the same year comicVine says - trying to adjust for these cases
        regExTwo = "(%s\\s*(0)?(0)?%s\\s*\\(%s\\))" % (regexName, searchIssue,
                                                       int(searchYear) + 1)
        regExThree = "(%s\\s*(0)?(0)?%s\\s*\\(%s\\))" % (
            regexName, searchIssue, int(searchYear) - 1)
        regExFour = "(%s\\s*(0)?(0)?%s\\s*\\(.*?\\)\\s*\\(%s\\))" % (
            regexName, searchIssue, int(searchYear) + 1)
        regExFive = "(%s\\s*(0)?(0)?%s\\s*\\(.*?\\)\\s*\\(%s\\))" % (
            regexName, searchIssue, int(searchYear) - 1)

        regexList = [
            regEx, regExOne, regExTwo, regExThree, regExFour, regExFive
        ]

        except_list = [
            'releases', 'gold line', 'distribution', '0-day', '0 day'
        ]

        for entry in keyPair:
            title = entry['title']
            #logger.fdebug("titlesplit: " + str(title.split("\"")))
            splitTitle = title.split("\"")
            noYear = 'False'

            for subs in splitTitle:
                #logger.fdebug('sub:' + subs)
                regExCount = 0
                if len(subs) > 10 and not any(d in subs.lower()
                                              for d in except_list):
                    #Looping through dictionary to run each regEx - length + regex is determined by regexList up top.
                    #                while regExCount < len(regexList):
                    #                    regExTest = re.findall(regexList[regExCount], subs, flags=re.IGNORECASE)
                    #                    regExCount = regExCount +1
                    #                    if regExTest:
                    #                        logger.fdebug(title)
                    #                        entries.append({
                    #                                  'title':   subs,
                    #                                  'link':    str(link)
                    #                                  })
                    if IssDateFix != "no":
                        if IssDateFix == "01" or IssDateFix == "02":
                            ComicYearFix = str(int(searchYear) - 1)
                        else:
                            ComicYearFix = str(int(searchYear) + 1)
                    else:
                        ComicYearFix = searchYear

                    if searchYear not in subs and ComicYearFix not in subs:
                        noYear = 'True'
                        noYearline = subs

                    if (searchYear in subs
                            or ComicYearFix in subs) and noYear == 'True':
                        #this would occur on the next check in the line, if year exists and
                        #the noYear check in the first check came back valid append it
                        subs = noYearline + ' (' + searchYear + ')'
                        noYear = 'False'

                    if noYear == 'False':

                        entries.append({
                            'title': subs,
                            'link': entry['link'],
                            'pubdate': entry['pubdate'],
                            'length': entry['length']
                        })
                        break  # break out so we don't write more shit.

#    if len(entries) >= 1:
    if tallycount >= 1:
        mres['entries'] = entries
        return mres
#       print("Title: "+regList[0])
#       print("Link: "+keyPair[regList[0]])
    else:
        logger.fdebug("No Results Found")
        return "no results"
示例#48
0
File: rsscheck.py Project: citrusy/mylar
def torrents(pickfeed=None, seriesname=None, issue=None):
    if pickfeed is None:
        pickfeed = "1"  # later comparisons expect a string
    #else:
    #    print "pickfeed is " + str(pickfeed)
    passkey = mylar.CBT_PASSKEY
    srchterm = None

    if seriesname:
        srchterm = re.sub(' ', '%20', seriesname)
    if issue:
        srchterm += '%20' + str(issue)

    if mylar.KAT_PROXY:
        if mylar.KAT_PROXY.endswith('/'):
            kat_url = mylar.KAT_PROXY
        else:
            kat_url = mylar.KAT_PROXY + '/'
    else:
        kat_url = 'http://kat.ph/'

    if pickfeed == "1":  # cbt rss feed based on followlist
        feed = "http://comicbt.com/rss.php?action=browse&passkey=" + str(
            passkey) + "&type=dl"
    elif pickfeed == "2" and srchterm is not None:  # kat.ph search
        feed = kat_url + "usearch/" + str(
            srchterm) + "%20category%3Acomics%20seeds%3A1/?rss=1"
    elif pickfeed == "3":  # kat.ph rss feed
        feed = kat_url + "usearch/category%3Acomics%20seeds%3A1/?rss=1"
    elif pickfeed == "4":  #cbt follow link
        feed = "http://comicbt.com/rss.php?action=follow&passkey=" + str(
            passkey) + "&type=dl"
    elif pickfeed == "5":  # cbt series link
        #       seriespage = "http://comicbt.com/series.php?passkey=" + str(passkey)
        feed = "http://comicbt.com/rss.php?action=series&series=" + str(
            seriesno) + "&passkey=" + str(passkey)
    else:
        logger.error('invalid pickfeed denoted...')
        return

    title = []
    link = []
    description = []
    seriestitle = []

    if pickfeed == "5":  # we need to get the series # first
        seriesSearch(seriespage, seriesname)

    feedme = feedparser.parse(feed)

    i = 0

    feeddata = []
    myDB = db.DBConnection()
    torthekat = []
    katinfo = {}

    for entry in feedme['entries']:
        if pickfeed == "3":
            tmpsz = feedme.entries[i].enclosures[0]
            feeddata.append({
                'Site': 'KAT',
                'Title': feedme.entries[i].title,
                'Link': tmpsz['url'],
                'Pubdate': feedme.entries[i].updated,
                'Size': tmpsz['length']
            })

        elif pickfeed == "2":
            tmpsz = feedme.entries[i].enclosures[0]
            torthekat.append({
                'site': 'KAT',
                'title': feedme.entries[i].title,
                'link': tmpsz['url'],
                'pubdate': feedme.entries[i].updated,
                'length': tmpsz['length']
            })

            #print ("Site: KAT")
            #print ("Title: " + str(feedme.entries[i].title))
            #print ("Link: " + str(tmpsz['url']))
            #print ("pubdate: " + str(feedme.entries[i].updated))
            #print ("size: " + str(tmpsz['length']))

        elif pickfeed == "1" or pickfeed == "4":
            #            tmpsz = feedme.entries[i].enclosures[0]
            feeddata.append({
                'Site': 'CBT',
                'Title': feedme.entries[i].title,
                'Link': feedme.entries[i].link,
                'Pubdate': feedme.entries[i].updated
                #                          'Size':     tmpsz['length']
            })
            #print ("Site: CBT")
            #print ("Title: " + str(feeddata[i]['Title']))
            #print ("Link: " + str(feeddata[i]['Link']))
            #print ("pubdate: " + str(feeddata[i]['Pubdate']))
        i += 1
    logger.fdebug('there were ' + str(i) + ' results..')

    if not seriesname:
        rssdbupdate(feeddata, i, 'torrent')
    else:
        katinfo['entries'] = torthekat
        return katinfo
    return
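
A hypothetical call, just to show the shape of the return value for a KAT search (the series and issue are made up):

katinfo = torrents(pickfeed="2", seriesname="Saga", issue="1")
if katinfo:
    for t in katinfo['entries']:
        print(t['site'], t['title'], t['link'], t['length'])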
示例#49
0
File: rsscheck.py Project: citrusy/mylar
def nzbs(provider=None):
    nzbprovider = []
    nzbp = 0
    if mylar.NZBSU == 1:
        nzbprovider.append('nzb.su')
        nzbp += 1
    if mylar.DOGNZB == 1:
        nzbprovider.append('dognzb')
        nzbp += 1
    # --------
    #  Xperimental
    if mylar.EXPERIMENTAL == 1:
        nzbprovider.append('experimental')
        nzbp += 1

    newznabs = 0

    newznab_hosts = []

    if mylar.NEWZNAB == 1:

        for newznab_host in mylar.EXTRA_NEWZNABS:
            if newznab_host[4] == '1' or newznab_host[4] == 1:
                newznab_hosts.append(newznab_host)
                nzbprovider.append('newznab')
                newznabs += 1
                logger.fdebug('newznab name:' + str(newznab_host[0]) +
                              ' - enabled: ' + str(newznab_host[4]))

    # --------
    providercount = int(nzbp + newznabs)
    logger.fdebug('there are : ' + str(providercount) +
                  ' RSS search providers you have enabled.')
    nzbpr = providercount - 1
    if nzbpr < 0:
        nzbpr = 0

    feeddata = []
    feedthis = []
    ft = 0
    totNum = 0
    nonexp = "no"

    while (nzbpr >= 0):
        if nzbprovider[nzbpr] == 'experimental':
            feed = feedparser.parse(
                "http://nzbindex.nl/rss/alt.binaries.comics.dcp/?sort=agedesc&max=50&more=1"
            )

            totNum = len(feed.entries)
            site = 'experimental'
            keyPair = {}
            regList = []
            entries = []
            mres = {}
            countUp = 0

            i = 0
            for entry in feed['entries']:
                tmpsz = feed.entries[i].enclosures[0]
                feeddata.append({
                    'Site': site,
                    'Title': feed.entries[i].title,
                    'Link': tmpsz['url'],  #feed.entries[i].link,
                    'Pubdate': feed.entries[i].updated,
                    'Size': tmpsz['length']
                })
                #                print ("Site:" + str(site))
                #                print ("Title:" + str(feed.entries[i].title))
                #                print ("Link:" + str(feed.entries[i].link))
                #                print ("Pubdate:" + str(feed.entries[i].updated))
                #                print ("Size:" + str(tmpsz['length']))
                i += 1
            logger.info(str(i) + ' results from Experimental feed indexed.')
            nzbpr -= 1
        else:
            if nzbprovider[nzbpr] == 'newznab':
                for newznab_host in newznab_hosts:
                    if newznab_host[3] is None:
                        newznabuid = '1'
                        newznabcat = '7030'
                    else:
                        if '#' not in newznab_host[3]:
                            newznabuid = newznab_host[3]
                            newznabcat = '7030'
                        else:
                            newzst = newznab_host[3].find('#')
                            newznabuid = newznab_host[3][:newzst]
                            newznabcat = newznab_host[3][newzst + 1:]
                    feed = newznab_host[1].rstrip() + '/rss?t=' + str(
                        newznabcat) + '&dl=1&i=' + str(
                            newznabuid) + '&r=' + newznab_host[2].rstrip()
                    feedme = feedparser.parse(feed)
                    site = newznab_host[0].rstrip()
                    feedthis.append({"feed": feedme, "site": site})
                    totNum += len(feedme.entries)
                    ft += 1
                    nonexp = "yes"
                    nzbpr -= 1
            elif nzbprovider[nzbpr] == 'nzb.su':
                if mylar.NZBSU_UID is None:
                    mylar.NZBSU_UID = '1'
                feed = 'http://api.nzb.su/rss?t=7030&dl=1&i=' + mylar.NZBSU_UID + '&r=' + mylar.NZBSU_APIKEY
                feedme = feedparser.parse(feed)
                site = nzbprovider[nzbpr]
                feedthis.append({"feed": feedme, "site": site})
                totNum += len(feedme.entries)
                ft += 1
                nonexp = "yes"
                nzbpr -= 1
            elif nzbprovider[nzbpr] == 'dognzb':
                if mylar.DOGNZB_UID is None:
                    mylar.DOGNZB_UID = '1'
                feed = 'https://dognzb.cr/rss.cfm?r=' + mylar.DOGNZB_APIKEY + '&t=7030'
                feedme = feedparser.parse(feed)
                site = nzbprovider[nzbpr]
                ft += 1
                nonexp = "yes"
                feedthis.append({"feed": feedme, "site": site})
                totNum += len(feedme.entries)
                nzbpr -= 1

    i = 0
    if nonexp == "yes":
        #print str(ft) + " sites checked. There are " + str(totNum) + " entries to be updated."
        #print feedme

        for ft in feedthis:
            sitei = 0
            site = ft['site']
            logger.fdebug(str(site) + " now being updated...")
            logger.fdebug('feedthis:' + str(ft))
            for entry in ft['feed'].entries:
                if site == 'dognzb':
                    #because the rss of dog doesn't carry the enclosure item, we'll use the newznab size value
                    tmpsz = 0
                    #for attr in entry['newznab:attrib']:
                    #    if attr('@name') == 'size':
                    #        tmpsz = attr['@value']
                    #        logger.fdebug('size retrieved as ' + str(tmpsz))
                    #        break
                    feeddata.append({
                        'Site': site,
                        'Title': entry.title,  #ft['feed'].entries[i].title,
                        'Link': entry.link,  #ft['feed'].entries[i].link,
                        'Pubdate':
                        entry.updated,  #ft['feed'].entries[i].updated,
                        'Size': tmpsz
                    })
                else:
                    #this should work for all newznabs (nzb.su included)
                    #only difference is the size of the file between this and above (which is probably the same)
                    tmpsz = entry.enclosures[
                        0]  #ft['feed'].entries[i].enclosures[0]
                    feeddata.append({
                        'Site': site,
                        'Title': entry.title,  #ft['feed'].entries[i].title,
                        'Link': entry.link,  #ft['feed'].entries[i].link,
                        'Pubdate':
                        entry.updated,  #ft['feed'].entries[i].updated,
                        'Size': tmpsz['length']
                    })

                #logger.fdebug("Site: " + str(feeddata[i]['Site']))
                #logger.fdebug("Title: " + str(feeddata[i]['Title']))
                #logger.fdebug("Link: " + str(feeddata[i]['Link']))
                #logger.fdebug("pubdate: " + str(feeddata[i]['Pubdate']))
                #logger.fdebug("size: " + str(feeddata[i]['Size']))
                sitei += 1
            logger.info(str(site) + ' : ' + str(sitei) + ' entries indexed.')
            i += sitei
    logger.info(
        '[RSS] ' + str(i) +
        ' entries have been indexed and are now going to be stored for caching.'
    )
    rssdbupdate(feeddata, i, 'usenet')
    return
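
The newznab host tuples above are indexed positionally; as far as can be inferred from this function, the layout is (name, url, apikey, 'uid#category', enabled). A sketch of the same URL assembly with made-up values:

# (name, url, apikey, 'uid#category', enabled) -- layout inferred from nzbs() above
newznab_host = ('myindexer', 'https://indexer.example', 'APIKEY', '1#7030', '1')

if '#' in newznab_host[3]:
    newznabuid, newznabcat = newznab_host[3].split('#', 1)
else:
    newznabuid, newznabcat = newznab_host[3], '7030'

feed = (newznab_host[1].rstrip() + '/rss?t=' + newznabcat +
        '&dl=1&i=' + newznabuid + '&r=' + newznab_host[2].rstrip())
print(feed)  # https://indexer.example/rss?t=7030&dl=1&i=1&r=APIKEY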
示例#50
0
def TDL(book=None, test=False):
    errmsg = ''
    provider = "torrentdownloads"
    host = lazylibrarian.CONFIG['TDL_HOST']
    if not host.startswith('http'):
        host = 'http://' + host

    providerurl = url_fix(host)

    params = {"type": "search", "cid": "2", "search": book['searchterm']}
    searchURL = providerurl + "/rss.xml?%s" % urlencode(params)

    sterm = makeUnicode(book['searchterm'])

    data, success = fetchURL(searchURL)
    if not success:
        # may return 404 if no results, not really an error
        if '404' in data:
            logger.debug("No results found from %s for %s" % (provider, sterm))
            success = True
        else:
            logger.debug(searchURL)
            logger.debug('Error fetching data from %s: %s' % (provider, data))
            errmsg = data
        data = False

    if test:
        return success

    results = []

    minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
    if data:
        logger.debug('Parsing results from <a href="%s">%s</a>' %
                     (searchURL, provider))
        d = feedparser.parse(data)
        if len(d.entries):
            for item in d.entries:
                try:
                    title = item['title']
                    seeders = int(item['seeders'])
                    link = item['link']
                    size = int(item['size'])
                    url = None

                    if link and minimumseeders < int(seeders):
                        # no point requesting the magnet link if not enough seeders
                        # TDL gives us a relative link
                        result, success = fetchURL(providerurl + link)
                        if success:
                            new_soup = BeautifulSoup(result, 'html5lib')
                            for link in new_soup.find_all('a'):
                                output = link.get('href')
                                if output and output.startswith('magnet'):
                                    url = output
                                    break

                        if not url or not title:
                            logger.debug('Missing url or title')
                        else:
                            results.append({
                                'bookid':
                                book['bookid'],
                                'tor_prov':
                                provider,
                                'tor_title':
                                title,
                                'tor_url':
                                url,
                                'tor_size':
                                str(size),
                                'tor_type':
                                'magnet',
                                'priority':
                                lazylibrarian.CONFIG['TDL_DLPRIORITY']
                            })
                            logger.debug('Found %s. Size: %s' % (title, size))
                    else:
                        logger.debug('Found %s but %s seeder%s' %
                                     (title, seeders, plural(seeders)))

                except Exception as e:
                    logger.error("An error occurred in the %s parser: %s" %
                                 (provider, str(e)))
                    logger.debug('%s: %s' % (provider, traceback.format_exc()))

    logger.debug("Found %i result%s from %s for %s" %
                 (len(results), plural(len(results)), provider, sterm))

    return results, errmsg
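
An illustrative call (the book dict keys are inferred from how TDL() indexes its argument; the values are made up):

book = {'bookid': '1234', 'searchterm': 'Some Author Some Title'}
results, errmsg = TDL(book=book)
for r in results:
    print(r['tor_title'], r['tor_size'], r['tor_type'])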
示例#51
0
def LIME(book=None, test=False):
    errmsg = ''
    provider = "Limetorrent"
    host = lazylibrarian.CONFIG['LIME_HOST']
    if not host.startswith('http'):
        host = 'http://' + host

    params = {"q": book['searchterm']}
    providerurl = url_fix(host + "/searchrss/other")
    searchURL = providerurl + "?%s" % urlencode(params)

    sterm = makeUnicode(book['searchterm'])

    data, success = fetchURL(searchURL)
    if not success:
        # may return 404 if no results, not really an error
        if '404' in data:
            logger.debug("No results found from %s for %s" % (provider, sterm))
            success = True
        else:
            logger.debug(searchURL)
            logger.debug('Error fetching data from %s: %s' % (provider, data))
            errmsg = data
        data = False

    if test:
        return success

    results = []

    minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
    if data:
        logger.debug('Parsing results from <a href="%s">%s</a>' %
                     (searchURL, provider))
        d = feedparser.parse(data)
        if len(d.entries):
            for item in d.entries:
                try:
                    title = unaccented(item['title'])
                    try:
                        seeders = item['description']
                        seeders = int(
                            seeders.split('Seeds:')[1].split(',')[0].strip())
                    except (IndexError, ValueError):
                        seeders = 0

                    size = item['size']
                    try:
                        size = int(size)
                    except ValueError:
                        size = 0

                    url = None
                    for link in item['links']:
                        if 'x-bittorrent' in link['type']:
                            url = link['url']

                    if not url or not title:
                        logger.debug('No url or title found')
                    elif minimumseeders < int(seeders):
                        results.append({
                            'bookid':
                            book['bookid'],
                            'tor_prov':
                            provider,
                            'tor_title':
                            title,
                            'tor_url':
                            url,
                            'tor_size':
                            str(size),
                            'tor_type':
                            'torrent',
                            'priority':
                            lazylibrarian.CONFIG['LIME_DLPRIORITY']
                        })
                        logger.debug('Found %s. Size: %s' % (title, size))
                    else:
                        logger.debug('Found %s but %s seeder%s' %
                                     (title, seeders, plural(seeders)))

                except Exception as e:
                    if 'forbidden' in str(e).lower():
                        # may have ip based access limits
                        logger.error(
                            'Access forbidden. Please wait a while before trying %s again.'
                            % provider)
                    else:
                        logger.error("An error occurred in the %s parser: %s" %
                                     (provider, str(e)))
                        logger.debug('%s: %s' %
                                     (provider, traceback.format_exc()))

    logger.debug("Found %i result%s from %s for %s" %
                 (len(results), plural(len(results)), provider, sterm))

    return results, errmsg
示例#52
0
def ZOO(book=None, test=False):
    errmsg = ''
    provider = "zooqle"
    host = lazylibrarian.CONFIG['ZOO_HOST']
    if not host.startswith('http'):
        host = 'http://' + host

    providerurl = url_fix(host + "/search")

    params = {"q": book['searchterm'], "category": "books", "fmt": "rss"}
    searchURL = providerurl + "?%s" % urlencode(params)

    sterm = makeUnicode(book['searchterm'])

    data, success = fetchURL(searchURL)
    if not success:
        # may return 404 if no results, not really an error
        if '404' in data:
            logger.debug("No results found from %s for %s" % (provider, sterm))
            success = True
        else:
            logger.debug(searchURL)
            logger.debug('Error fetching data from %s: %s' % (provider, data))
            errmsg = data
        data = False

    if test:
        return success

    results = []

    minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
    if data:
        logger.debug('Parsing results from <a href="%s">%s</a>' %
                     (searchURL, provider))
        d = feedparser.parse(data)
        if len(d.entries):
            for item in d.entries:
                try:
                    title = unaccented(item['title'])
                    seeders = int(item['torrent_seeds'])
                    link = item['links'][1]['href']
                    size = int(item['links'][1]['length'])
                    magnet = item['torrent_magneturi']

                    url = None
                    mode = 'torrent'
                    if link:
                        url = link
                        mode = 'torrent'
                    if magnet:
                        if not url or (url and
                                       lazylibrarian.CONFIG['PREFER_MAGNET']):
                            url = magnet
                            mode = 'magnet'

                    if not url or not title:
                        logger.debug('No url or title found')
                    elif minimumseeders < int(seeders):
                        results.append({
                            'bookid':
                            book['bookid'],
                            'tor_prov':
                            provider,
                            'tor_title':
                            title,
                            'tor_url':
                            url,
                            'tor_size':
                            str(size),
                            'tor_type':
                            mode,
                            'priority':
                            lazylibrarian.CONFIG['ZOO_DLPRIORITY']
                        })
                        logger.debug('Found %s. Size: %s' % (title, size))
                    else:
                        logger.debug('Found %s but %s seeder%s' %
                                     (title, seeders, plural(seeders)))

                except Exception as e:
                    if 'forbidden' in str(e).lower():
                        # looks like zooqle has ip based access limits
                        logger.error(
                            'Access forbidden. Please wait a while before trying %s again.'
                            % provider)
                    else:
                        logger.error("An error occurred in the %s parser: %s" %
                                     (provider, str(e)))
                        logger.debug('%s: %s' %
                                     (provider, traceback.format_exc()))

    logger.debug("Found %i result%s from %s for %s" %
                 (len(results), plural(len(results)), provider, sterm))

    return results, errmsg
示例#53
0
def EXTRA(book=None, test=False):
    errmsg = ''
    provider = "Extratorrent"
    host = lazylibrarian.CONFIG['EXTRA_HOST']
    if not host.startswith('http'):
        host = 'http://' + host

    providerurl = url_fix(host + "/rss")

    params = {"type": "search", "s_cat": "2", "search": book['searchterm']}
    searchURL = providerurl + "/?%s" % urlencode(params)

    sterm = makeUnicode(book['searchterm'])

    data, success = fetchURL(searchURL)
    if not success:
        # may return 404 if no results, not really an error
        if '404' in data:
            logger.debug("No results found from %s for %s" % (provider, sterm))
            success = True
        else:
            logger.debug('Error fetching data from %s: %s' % (provider, data))
            errmsg = data
        data = False

    if test:
        return success

    results = []

    minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
    if data:
        logger.debug('Parsing results from <a href="%s">%s</a>' %
                     (searchURL, provider))
        d = feedparser.parse(data)
        if len(d.entries):
            for item in d.entries:
                try:
                    title = unaccented(item['title'])

                    try:
                        seeders = int(item['seeders'])
                    except ValueError:
                        seeders = 0

                    try:
                        size = int(item['size'])
                    except ValueError:
                        size = 0

                    url = None
                    for link in item['links']:
                        if 'x-bittorrent' in link['type']:
                            url = link['href']

                    if not url or not title:
                        logger.debug('No url or title found')
                    elif minimumseeders < int(seeders):
                        results.append({
                            'bookid':
                            book['bookid'],
                            'tor_prov':
                            provider,
                            'tor_title':
                            title,
                            'tor_url':
                            url,
                            'tor_size':
                            str(size),
                            'tor_type':
                            'torrent',
                            'priority':
                            lazylibrarian.CONFIG['EXTRA_DLPRIORITY']
                        })
                        logger.debug('Found %s. Size: %s' % (title, size))
                    else:
                        logger.debug('Found %s but %s seeder%s' %
                                     (title, seeders, plural(seeders)))

                except Exception as e:
                    logger.error("An error occurred in the %s parser: %s" %
                                 (provider, str(e)))
                    logger.debug('%s: %s' % (provider, traceback.format_exc()))

    logger.debug("Found %i result%s from %s for %s" %
                 (len(results), plural(len(results)), provider, sterm))

    return results, errmsg
示例#54
0
    def fetch(self):
        import BlogPost

        # don't fetch internally hosted blogs
        if not self.from_feed: return

        events = []

        # parse and iterate the feed
        entries = feedparser.parse(self.rss).entries
        for post in entries:
            try:
                date = dateutil.parser.parse(post.date).replace(tzinfo=None)
            except:
                date = datetime.datetime.utcnow()

            # don't re-add old posts
            if self.most_recent_date >= date:
                continue

            try:
                content = post.content[0].value
            except:
                content = post.description

            try:
                author_name = post.author_detail["name"]
            except:
                author_name = None

            # sanitize the post's content
            content = sanitize(content, [
                "h1", "h2", "h3", "h4", "h5", "h6", "a:href", "p", "ul", "ol",
                "li", "br", "div", "img:src:alt:title", "b", "i", "u",
                "strong", "em", "table", "tbody", "td", "th", "thead", "tfoot",
                "pre", "tt", "code"
            ])

            # format a summary for the post
            summary = sanitize(content, [],
                               strip_tags=[
                                   "h1", "h2", "h3", "h4", "h5", "h6", "p",
                                   "ul", "ol", "li", "br", "div", 'a', "b",
                                   "i", "u", "strong", "em", "pre", "tt",
                                   "code"
                               ])

            if len(summary) > 500:
                summary = summary[0:500] + u"..."
            summary = "<p>" + summary + "</p>"

            events.append(
                self.add_event(BlogPost.BlogPost,
                               title=post.title,
                               summary=summary,
                               from_feed=True,
                               author_name=author_name,
                               date=date,
                               extra_args={
                                   "external_link": post.link,
                                   "content": content,
                                   "blog_id": self.id
                               }))

        # find the new most recent date
        dates = [event.date for event in events if event is not None]
        dates.append(self.most_recent_date)
        self.most_recent_date = max(dates)
        self.save()
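
The sanitize() helper is not part of this snippet; judging from the calls above it takes a whitelist of "tag" or "tag:attr1:attr2" tokens. Purely as an assumption, the token format could be parsed like this:

def parse_whitelist(tokens):
    # "img:src:alt:title" -> tag 'img' with allowed attributes ['src', 'alt', 'title']
    allowed = {}
    for token in tokens:
        tag, _, attrs = token.partition(":")
        allowed[tag] = attrs.split(":") if attrs else []
    return allowed

print(parse_whitelist(["a:href", "img:src:alt:title", "p"]))
# {'a': ['href'], 'img': ['src', 'alt', 'title'], 'p': []}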
示例#55
0
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        urls = []
        tnow = datetime.datetime.utcnow()
        urladded = set()

        for feed in self.feeds:
            section, url = feed[0], feed[1].replace('gzh', 'gzhjs')
            isfulltext = feed[2] if len(feed) > 2 else False
            timeout = self.timeout + 10 if isfulltext else self.timeout
            opener = URLOpener(self.host, timeout=timeout)
            result = opener.open(url)
            if result.status_code == 200 and result.content:
                if self.feed_encoding:
                    try:
                        content = result.content.decode(self.feed_encoding)
                    except UnicodeDecodeError:
                        content = AutoDecoder(True).decode(
                            result.content, opener.realurl, result.headers)
                else:
                    content = AutoDecoder(True).decode(result.content,
                                                       opener.realurl,
                                                       result.headers)
                content = content[content.index('{'):content.index('}') + 1]
                content = json.loads(content)

                for e in content['items'][:self.max_articles_per_feed]:
                    e = feedparser.parse(e)['entries'][0]
                    updated = None
                    if hasattr(e, 'lastmodified') and e.lastmodified:
                        updated = float(e.lastmodified)

                    if self.oldest_article > 0 and updated:
                        updated = datetime.datetime.utcfromtimestamp(updated)
                        delta = tnow - updated
                        if self.oldest_article > 365:
                            threshold = self.oldest_article  #interpreted as seconds
                        else:
                            threshold = 86400 * self.oldest_article  #interpreted as days

                        if delta.days * 86400 + delta.seconds > threshold:
                            self.log.info(
                                "Skip old article(%s): %s" %
                                (updated.strftime('%Y-%m-%d %H:%M:%S'),
                                 e.href))
                            continue

                    #support HTTPS
                    if hasattr(e, 'href'):
                        if url.startswith('https://'):
                            urlfeed = e.href.replace('http://', 'https://')
                        else:
                            urlfeed = e.href

                        if urlfeed in urladded:
                            continue
                    else:
                        urlfeed = ''

                    desc = None
                    urls.append((section, e.title, urlfeed, desc))
                    urladded.add(urlfeed)
            else:
                self.log.warn('fetch rss failed(%d):%s' %
                              (result.status_code, url))

        return urls
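
The oldest_article check above treats values over 365 as seconds and smaller values as days; a compact restatement of that rule:

def article_age_threshold(oldest_article):
    # >365 is interpreted as a threshold in seconds, otherwise in days
    return oldest_article if oldest_article > 365 else 86400 * oldest_article

print(article_age_threshold(2))     # 172800 (two days, in seconds)
print(article_age_threshold(3600))  # 3600   (one hour, in seconds)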
示例#56
0
def RSS(host=None, feednr=None, priority=0, test=False):
    """
    Generic RSS query function, just return all the results from the RSS feed in a list
    """
    results = []

    URL = host
    if not str(URL)[:4] == "http":
        URL = 'http://' + URL

    result, success = fetchURL(URL)

    if test:
        return success

    if success:
        data = feedparser.parse(result)
    else:
        logger.error('Error fetching data from %s: %s' % (host, result))
        BlockProvider(host, result)
        data = None

    if data:
        # to debug because of api
        logger.debug('Parsing results from %s' % URL)
        provider = data['feed']['link']
        logger.debug("RSS %s returned %i result%s" %
                     (provider, len(data.entries), plural(len(data.entries))))
        for post in data.entries:
            title = None
            magnet = None
            size = None
            torrent = None
            nzb = None
            url = None
            tortype = 'torrent'

            if 'title' in post:
                title = post.title
            if 'links' in post:
                for f in post.links:
                    if 'x-bittorrent' in f['type']:
                        size = f['length']
                        torrent = f['href']
                        break
                    if 'x-nzb' in f['type']:
                        size = f['length']
                        nzb = f['href']
                        break

            if 'torrent_magneturi' in post:
                magnet = post.torrent_magneturi

            if torrent:
                url = torrent
                tortype = 'torrent'

            if magnet:
                if not url or (url and lazylibrarian.CONFIG['PREFER_MAGNET']):
                    url = magnet
                    tortype = 'magnet'

            if nzb:  # prefer nzb over torrent/magnet
                url = nzb
                tortype = 'nzb'

            if not url:
                if 'link' in post:
                    url = post.link

            tor_date = 'Fri, 01 Jan 1970 00:00:00 +0100'
            if 'newznab_attr' in post:
                if post.newznab_attr['name'] == 'usenetdate':
                    tor_date = post.newznab_attr['value']

            if not size:
                size = 1000
            if title and url:
                results.append({
                    'tor_prov': provider,
                    'tor_title': title,
                    'tor_url': url,
                    'tor_size': str(size),
                    'tor_date': tor_date,
                    'tor_feed': feednr,
                    'tor_type': tortype,
                    'priority': priority
                })
    else:
        logger.debug('No data returned from %s' % host)
    return results
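
An illustrative call (host and feednr are made up); each result carries the provider, the chosen link type (nzb is preferred, then magnet depending on PREFER_MAGNET, then torrent) and the reported size:

results = RSS(host='indexer.example/rss', feednr=1, priority=0)
for r in results:
    print(r['tor_type'], r['tor_title'], r['tor_size'])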
示例#57
0
def torrents(pickfeed=None, seriesname=None, issue=None, feedinfo=None):
    if pickfeed is None:
        return

    srchterm = None

    if seriesname:
        srchterm = re.sub(' ', '%20', seriesname)
    if issue:
        srchterm += '%20' + str(issue)

    if mylar.KAT_PROXY:
        if mylar.KAT_PROXY.endswith('/'):
            kat_url = mylar.KAT_PROXY
        else:
            kat_url = mylar.KAT_PROXY + '/'
    else:
        #switched to https.
        kat_url = 'https://kat.cr/'

    if pickfeed == 'KAT':
        #we need to cycle through both categories (comics & other) - so we loop.
        loopit = 2
    else:
        loopit = 1

    lp = 0
    totalcount = 0

    title = []
    link = []
    description = []
    seriestitle = []

    feeddata = []
    myDB = db.DBConnection()
    torthekat = []
    torthe32p = []
    torinfo = {}

    while (lp < loopit):
        if lp == 0 and loopit == 2:
            pickfeed = '2'
        elif lp == 1 and loopit == 2:
            pickfeed = '5'

        feedtype = None

        if pickfeed == "1" and mylar.ENABLE_32P:  # 32pages new releases feed.
            feed = ('https://32pag.es/feeds.php?feed=torrents_all&user=' + feedinfo['user'] +
                    '&auth=' + feedinfo['auth'] + '&passkey=' + feedinfo['passkey'] +
                    '&authkey=' + feedinfo['authkey'])
            feedtype = ' from the New Releases RSS Feed for comics'
            verify = bool(mylar.VERIFY_32P)
        elif pickfeed == "2" and srchterm is not None:  # kat.ph search
            feed = kat_url + "usearch/" + str(
                srchterm) + "%20category%3Acomics%20seeds%3A" + str(
                    mylar.MINSEEDS) + "/?rss=1"
            verify = bool(mylar.KAT_VERIFY)
        elif pickfeed == "3":  # kat.ph rss feed
            feed = kat_url + "usearch/category%3Acomics%20seeds%3A" + str(
                mylar.MINSEEDS) + "/?rss=1"
            feedtype = ' from the New Releases RSS Feed for comics'
            verify = bool(mylar.KAT_VERIFY)
        elif pickfeed == "4":  #32p search
            if any([
                    mylar.USERNAME_32P is None, mylar.USERNAME_32P == '',
                    mylar.PASSWORD_32P is None, mylar.PASSWORD_32P == ''
            ]):
                logger.error(
                    '[RSS] Warning - you NEED to enter in your 32P Username and Password to use this option.'
                )
                lp += 1
                continue
            if mylar.MODE_32P == 0:
                logger.warn(
                    '[32P] Searching is not available in 32p Legacy mode. Switch to Auth mode to use the search functionality.'
                )
                lp += 1
                continue
            return
        elif pickfeed == "5" and srchterm is not None:  # kat.ph search (category:other since some 0-day comics initially get thrown there until categorized)
            feed = kat_url + "usearch/" + str(
                srchterm) + "%20category%3Aother%20seeds%3A1/?rss=1"
            verify = bool(mylar.KAT_VERIFY)
        elif pickfeed == "6":  # kat.ph rss feed (category:other so that we can get them quicker if need-be)
            feed = kat_url + "usearch/.cbr%20category%3Aother%20seeds%3A" + str(
                mylar.MINSEEDS) + "/?rss=1"
            feedtype = ' from the New Releases for category Other RSS Feed that contain comics'
            verify = bool(mylar.KAT_VERIFY)
        elif int(pickfeed) >= 7 and feedinfo is not None:
            #personal 32P notification feeds.
            #get the info here
            feed = ('https://32pag.es/feeds.php?feed=' + feedinfo['feed'] +
                    '&user=' + feedinfo['user'] + '&auth=' + feedinfo['auth'] +
                    '&passkey=' + feedinfo['passkey'] + '&authkey=' + feedinfo['authkey'] +
                    '&name=' + feedinfo['feedname'])
            feedtype = ' from your Personal Notification Feed : ' + feedinfo[
                'feedname']
            verify = bool(mylar.VERIFY_32P)
        else:
            logger.error('invalid pickfeed denoted...')
            return

        if pickfeed == "3" or pickfeed == "6" or pickfeed == "2" or pickfeed == "5":
            picksite = 'KAT'
        elif pickfeed == "1" or pickfeed == "4" or int(pickfeed) > 7:
            picksite = '32P'

        if pickfeed != '4':
            payload = None

            try:
                r = requests.get(feed, params=payload, verify=verify)
            except Exception, e:
                logger.warn('Error fetching RSS Feed Data from %s: %s' %
                            (picksite, e))
                return

            feedme = feedparser.parse(r.content)
            #feedme = feedparser.parse(feed)

        i = 0

        if pickfeed == '4':
            # searchresults is expected to come from the 32P search; note that pickfeed "4"
            # currently returns early above, so this branch is not reached in this snippet
            for entry in searchresults['entries']:
                justdigits = entry[
                    'file_size']  #size not available in follow-list rss feed
                seeddigits = entry[
                    'seeders']  #number of seeders not available in follow-list rss feed

                if int(seeddigits) >= int(mylar.MINSEEDS):
                    torthe32p.append({
                        'site':
                        picksite,
                        'title':
                        entry['torrent_seriesname'].lstrip() + ' ' +
                        entry['torrent_seriesvol'] + ' #' +
                        entry['torrent_seriesiss'],
                        'volume':
                        entry['torrent_seriesvol'],  # not stored by mylar yet.
                        'issue':
                        entry['torrent_seriesiss'],  # not stored by mylar yet.
                        'link':
                        entry['torrent_id'],  #just the id for the torrent
                        'pubdate':
                        entry['pubdate'],
                        'size':
                        entry['file_size'],
                        'seeders':
                        entry['seeders'],
                        'files':
                        entry['num_files']
                    })
                i += 1
        else:
            for entry in feedme['entries']:
                if any([pickfeed == "3", pickfeed == "6"]):
                    tmpsz = feedme.entries[i].enclosures[0]
                    feeddata.append({
                        'site': picksite,
                        'title': feedme.entries[i].title,
                        'link': tmpsz['url'],
                        'pubdate': feedme.entries[i].updated,
                        'size': tmpsz['length']
                    })
                elif any([pickfeed == "2", pickfeed == "5"]):
                    tmpsz = feedme.entries[i].enclosures[0]
                    torthekat.append({
                        'site': picksite,
                        'title': feedme.entries[i].title,
                        'link': tmpsz['url'],
                        'pubdate': feedme.entries[i].updated,
                        'size': tmpsz['length']
                    })
                elif pickfeed == "1" or int(pickfeed) > 7:
                    tmpdesc = feedme.entries[i].description
                    st_pub = feedme.entries[i].title.find('(')
                    st_end = feedme.entries[i].title.find(')')
                    pub = feedme.entries[i].title[
                        st_pub + 1:st_end]  # +1 to not include (
                    #logger.fdebug('publisher: ' + re.sub("'",'', pub).strip())  #publisher sometimes is given within quotes for some reason, strip 'em.
                    vol_find = feedme.entries[i].title.find('vol.')
                    series = feedme.entries[i].title[st_end +
                                                     1:vol_find].strip()
                    series = re.sub('&amp;', '&', series).strip()
                    #logger.fdebug('series title: ' + series)
                    iss_st = feedme.entries[i].title.find(' - ', vol_find)
                    vol = re.sub(
                        '\.', '',
                        feedme.entries[i].title[vol_find:iss_st]).strip()
                    #logger.fdebug('volume #: ' + str(vol))
                    issue = feedme.entries[i].title[iss_st + 3:].strip()
                    #logger.fdebug('issue # : ' + str(issue))

                    #break it down to get the Size since it's available on THIS 32P feed only so far.
                    #when it becomes available in the new feeds, this will be working, for now it just nulls out.
                    sizestart = tmpdesc.find('Size:')
                    justdigits = 0
                    if sizestart >= 0:
                        sizeend = tmpdesc.find('Leechers:')
                        sizestart += 5  # to get to the end of the word 'Size:'
                        tmpsize = tmpdesc[sizestart:sizeend].strip()
                        fdigits = re.sub("[^0123456789\.]", "",
                                         tmpsize).strip()
                        if '.' in fdigits:
                            decfind = fdigits.find('.')
                            wholenum = fdigits[:decfind]
                            decnum = fdigits[decfind + 1:]
                        else:
                            wholenum = fdigits
                            decnum = 0
                        if 'MB' in tmpsize:
                            wholebytes = int(wholenum) * 1048576
                            wholedecimal = (int(decnum) * 1048576) / 100
                            justdigits = wholebytes + wholedecimal
                        else:
                            #it's 'GB' then
                            wholebytes = (int(wholenum) * 1024) * 1048576
                            wholedecimal = (
                                (int(decnum) * 1024) * 1048576) / 100
                            justdigits = wholebytes + wholedecimal
                    #this is not currently working for 32p
                    #Get the # of seeders.
                    #seedstart = tmpdesc.find('Seeders:')
                    #seedend = tmpdesc.find('Added:')
                    #seedstart +=8  # to get to the end of the word 'Seeders:'
                    #tmpseed = tmpdesc[seedstart:seedend].strip()
                    #seeddigits = re.sub("[^0123456789\.]", "", tmpseed).strip()
                    seeddigits = 0

                    if int(mylar.MINSEEDS) >= int(seeddigits):
                        link = feedme.entries[i].link
                        linkst = link.find('&id')
                        linken = link.find('&', linkst + 1)
                        if linken == -1:
                            linken = len(link)
                        newlink = re.sub('&id=', '',
                                         link[linkst:linken]).strip()
                        feeddata.append({
                            'site':
                            picksite,
                            'title':
                            series.lstrip() + ' ' + vol + ' #' + issue,
                            'volume':
                            vol,  # not stored by mylar yet.
                            'issue':
                            issue,  # not stored by mylar yet.
                            'link':
                            newlink,  #just the id for the torrent
                            'pubdate':
                            feedme.entries[i].updated,
                            'size':
                            justdigits
                        })

                i += 1

        if feedtype is None:
            logger.info('[' + picksite + '] there were ' + str(i) +
                        ' results..')
        else:
            logger.info('[' + picksite + '] there were ' + str(i) +
                        ' results' + feedtype)

        totalcount += i
        lp += 1