Example #1
    def clean_fields(self, exclude=None):
        super(Feed, self).clean_fields(exclude)
        errors = {}
        parsed_feed = feedparser.parse(self.url)

        # parsed feed is an invalid feed
        # TODO add more robust error handling
        if (parsed_feed.bozo and not parsed_feed.entries):
            # try finding a feed at the site
            feeds = find_feeds(self.url)

            if (feeds):
                self.url = feeds[0]
                parsed_feed = feedparser.parse(self.url)
            else:
                urlname = self._meta.get_field('url').name
                message = "Unable to find a feed at '{0}'".format(self.url)
                errors[urlname] = [message]
                raise ValidationError(errors)

        # if no title then use the feed's title
        if not self.title:
            self.title = parsed_feed.feed.title

        # set the source from the feed's own link when available
        if parsed_feed.feed.get('link'):
            self.source = urlparse.urlparse(parsed_feed.feed.link).hostname
        else:
            self.source = urlparse.urlparse(self.url).hostname
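For context, here is a minimal sketch of the Django model this clean_fields() override appears to belong to. The field definitions are assumptions inferred from the attributes the method touches (url, title, source); the original project's model is not shown.

from django.core.exceptions import ValidationError
from django.db import models
import feedparser
import urlparse  # Python 2; on Python 3 use urllib.parse

from feedfinder2 import find_feeds


class Feed(models.Model):
    # Hypothetical fields, implied by self.url / self.title / self.source above
    url = models.URLField()
    title = models.CharField(max_length=200, blank=True)
    source = models.CharField(max_length=200, blank=True)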
Example #2
def main():
    form = info()
    if request.method == 'POST':
        url = form.url.data
        feed = form.feed.data
        if url:
            # find_feeds returns a list of candidate feed URLs
            rssfeed = find_feeds(url)
        elif feed:
            # wrap the user-supplied feed URL so rssfeed is always a list
            rssfeed = [feed]
        else:
            return render_template('main.html', form=form)
        print(rssfeed)
        t = "CREATE TABLE feedinfo(sno serial, title character varying, link character varying)"
        c.execute(t)
        rss = feedparser.parse(rssfeed[0])
        for entry in rss.entries:
            c.execute("INSERT INTO feedinfo (title, link) VALUES (%s, %s)",
                      (entry['title'], entry['link']))
        c.execute("SELECT * FROM feedinfo")
        data = c.fetchall()
        c.execute("DROP TABLE feedinfo")
        conn.commit()
        return render_template('sucess.html', data=data)
    else:
        return render_template('main.html', form=form)
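As an aside, the row-at-a-time INSERT loop above can be batched. A sketch using the same psycopg2-style cursor c and connection conn, which the snippet assumes but never shows being created:

# Batch all inserts in one call instead of one execute() per entry.
rows = [(entry['title'], entry['link']) for entry in rss.entries]
c.executemany("INSERT INTO feedinfo (title, link) VALUES (%s, %s)", rows)
conn.commit()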
Example #3
def get_feed(url):
    """
    Given a URL, retrieves the default feed and gets a pubsubhubbub hub
    url. Returns the hub url, self url, and parsed feed.

    :param url: Any URL for a site
    :type url: str
    :return: tuple
    """

    try:
        feed_urls = find_feeds(url)
        logging.info('Feed Urls: {0}'.format(feed_urls))
        feed_url = feed_urls[0]
        logging.info('Feed Url: {0}'.format(feed_url))
    except Exception as e:
        logging.exception(e)
        raise GetFeedError(e)

    if feed_url is None:
        raise GetFeedError('Could not find feed.')

    parsed_feed = get_parsed_feed(feed_url)

    if parsed_feed is None:
        raise GetFeedError('Could not parse feed.')

    links = get_links(parsed_feed)
    logging.info('Links: {0}'.format(links))

    if links is None:
        raise GetFeedError('Could not get links from feed.')

    return (links[0], links[1], parsed_feed)
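A hypothetical call, following the docstring's contract of a (hub url, self url, parsed feed) tuple:

# Hypothetical usage; get_feed raises GetFeedError on any failure.
try:
    hub_url, self_url, parsed_feed = get_feed('https://xkcd.com')
    print(hub_url, self_url)
except GetFeedError as e:
    print('feed lookup failed:', e)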
Example #4
    def __init__(self, site: str):
        self.__site = site

        # If the URL already looks like a feed, use it directly. Otherwise,
        # crawl the site and collect candidate feed URLs as a list.
        if url.URL.is_feedlike_url(self.__site):
            self.__getLink = self.__site
        else:
            self.__getLink = find_feeds(
                self.__site, check_all=False, user_agent=None)
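Depending on which branch runs, __getLink ends up as either a single string or a list of URLs. A hypothetical helper (not in the original) that callers could use to get a list either way:

    def links(self):
        # Hypothetical: normalize __getLink to a list, whichever branch set it.
        link = self.__getLink
        return [link] if isinstance(link, str) else list(link)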
Example #5
    def collect_feeds(self, link):

        html = self.make_request(link)
        self.extract_links(html)

        if not self.links:
            print("process sleeping for 6 seconds")
            time.sleep(6)
            return find_feeds(link)

        else:
            return self.links
Example #6
File: services.py Project: jo-soft/jadfr
    def parse(self):
        parse_result = feedparser.parse(self.feed_url)
        if self.accept_fuzzy and FeedInformationService.parse_exception_key in parse_result:
            feeds = feedfinder2.find_feeds(self.feed_url)
            parse_result = map(feedparser.parse, feeds)
        else:
            parse_result = [parse_result]

        result = [FeedInfo(
            feed_type=parsed_result['version'],
            feed_url=parsed_result['href'],
            html_url=parsed_result['feed']['link'],
            title=parsed_result['feed']['title']) for parsed_result in parse_result]
        return result
Example #7
File: api.py Project: prodigeni/ugly
def subscribe():
    # Get the requested subscription URL.
    add_url = flask.request.values.get("url")
    if add_url is None:
        return flask.jsonify(message="You must provide a URL."), 400

    # Check to make sure that the user doesn't have too many subscriptions.
    user = _get_user()
    mx = flask.current_app.config.get("MAX_FEEDS", -1)
    if mx > 0 and len(user.feeds) >= mx:
        return flask.jsonify(message="You're already subscribed to the "
                             "maximum number of feeds."), 400

    # Try to find a feed below the requested resource.
    urls = feedfinder2.find_feeds(add_url)
    if not urls:
        return flask.jsonify(message="The robot can't find a feed at that "
                             "URL. Could you help it with a more specific "
                             "link?"), 400
    url = urls[0]

    # See if the user is already subscribed to a feed at that URL.
    feed = db.session.query(Feed).join(User.feeds) \
        .filter(User.id == user.id) \
        .filter(Feed.url == url).first()
    if feed is not None:
        return flask.jsonify(
            message="You've already subscribed to {0}.".format(feed.title),
            feed=feed.to_dict(),
        )

    # See if a feed object already exists for that URL.
    feed = Feed.query.filter(Feed.url == url).first()

    # If it doesn't, create a new one.
    if feed is None:
        feed = Feed(url)

        # Update the feed immediately to get the title, etc.
        feed.update_info()

    # Subscribe the current user.
    user.feeds.append(feed)
    db.session.commit()

    return flask.jsonify(
        message="Successfully subscribed to {0}.".format(feed.title),
        feed=feed.to_dict(),
    )
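A hypothetical client call against this endpoint, assuming it is mounted at /api/subscribe (the route decorator is not shown in the snippet):

import requests

# Hypothetical URL; adjust to wherever the view is actually routed.
resp = requests.post("http://localhost:5000/api/subscribe",
                     data={"url": "https://xkcd.com"})
print(resp.status_code, resp.json().get("message"))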
Example #8
def loadAndFindRssUrlCandidates(inProspectiveRssHostPageUrl):
    # Load inProspectiveRssHostPageUrl, search it for potential RSS URLs,
    # and score each candidate based on link existence, feed keywords, etc.
    assert (type(inProspectiveRssHostPageUrl) is unicode), "loadAndFindRssUrlCandidates: inProspectiveRssHostPageUrl is not unicode"
    assert (inProspectiveRssHostPageUrl), "loadAndFindRssUrlCandidates: inProspectiveRssHostPageUrl is empty"
    scoredCandidates = []

    try:
        print('finding feeds for: ' + inProspectiveRssHostPageUrl)
        # returns a list of RSS feeds found from the input URL
        feeds = find_feeds(inProspectiveRssHostPageUrl)
        print('found feeds for: ' + inProspectiveRssHostPageUrl)
    except Exception:
        # Catch lookup failures, not KeyboardInterrupt, so Ctrl-C still works.
        f = open('no_rss_found.txt', 'a')
        f.write('Feed not found for: ' + inProspectiveRssHostPageUrl.encode('ascii', 'ignore') + '\n')
        f.close()
        print('inProspectiveRssHostPageUrl skipped: ' + inProspectiveRssHostPageUrl)
        return ""

    if not feeds:  # empty list evaluates as false
        return ""

    for rssCandidate in feeds:
        # Score each candidate with feedparser: favour feed-like URLs and
        # parseable feeds, skip iTunes links, and penalize SoundCloud.
        if "itunes" in rssCandidate:
            continue

        parseRss = feedparser.parse(rssCandidate)
        score = 0
        rssKeyWords = ["feed", "rss", "atom", "xml"]

        if any(x in rssCandidate for x in rssKeyWords):
            score += 1
        if parseRss.version != "":
            score += 1
        if "soundcloud" not in rssCandidate:
            score += 1

        scoredCandidates.append((rssCandidate, score))

    assert (scoredCandidates), "loadAndFindRssUrlCandidates: scoredCandidates is empty"
    assert (type(scoredCandidates[0]) is tuple), "loadAndFindRssUrlCandidates: scoredCandidates is not a list of tuples"
    return scoredCandidates
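The returned (url, score) tuples can then be ranked. A hypothetical follow-up that picks the highest-scoring candidate:

# Hypothetical: choose the best-scoring candidate, if any were found.
candidates = loadAndFindRssUrlCandidates(u'https://example.com')
if candidates:
    bestUrl, bestScore = max(candidates, key=lambda c: c[1])
    print(bestUrl, bestScore)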
Example #9
def store(request):
    url = request.json.get('url')
    category = request.json.get('category')
    app_id = request.json.get('app_id')
    # strip whitespace before the duplicate check so it matches consistently
    url = url.strip()
    existing_feed = Feed.query.filter(Feed.url.ilike(
        r"%{}%".format(url))).first()
    if existing_feed:
        return jsonify({'error': 'Feed already added.'})
    feed = feedparser.parse(url)
    if not feed.entries:
        feed_urls = find_feeds(url)
        if feed_urls:
            feed = feedparser.parse(feed_urls[0])
            feed_url = feed_urls[0]
        else:
            return jsonify({'error': 'Invalid feed url'})
    else:
        feed_url = url

    title = feed.feed.title
    if 'icon' in feed.feed:
        icon = feed.feed.icon
    elif 'image' in feed.feed:
        icon = feed.feed.image.href
    else:
        icon = None
    ext = tldextract.extract(url)
    domain_name = ext.domain + '.' + ext.suffix
    newFeed = Feed(id=helpers.generate_unique_code(),
                   title=title,
                   url=url,
                   alexa_rank=get_alexa_rank(domain_name),
                   domain=domain_name,
                   app_id=app_id,
                   rss_url=feed_url,
                   icon=icon)
    db.session.add(newFeed)
    if category:
        category_data = add_or_fetch_category(category)
        newFeed.categories.append(category_data)
    db.session.commit()

    return respondWithItem(newFeed)
Example #11
import time
from pprint import pprint

from feedfinder2 import find_feeds

from feedsearch import search

# url = 'http://nymag.com/newyork/rss/'
url = 'http://arstechnica.com'

start1 = time.perf_counter()

result1 = search(url, info=True, check_all=False)

time1 = int((time.perf_counter() - start1) * 1000)

start2 = time.perf_counter()

result2 = find_feeds(url, check_all=False)

time2 = int((time.perf_counter() - start2) * 1000)

print()
print(f'Feedsearch searched url in {time1}ms: Found: {result1}')
print(f'Feedfinder2 searched url in {time2}ms: Found: {result2}')

print()
for r in result1:
    pprint(vars(r))
print()
pprint(result2)
Example #12
import asyncio
from feedfinder2 import find_feeds

url = "xkcd.com"

# find_feeds is a synchronous function, so run it in the default
# thread-pool executor rather than scheduling it as a coroutine.
loop = asyncio.get_event_loop()
feeds = loop.run_until_complete(loop.run_in_executor(None, find_feeds, url))

print(feeds)
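On Python 3.9+, asyncio.to_thread expresses the same run-blocking-code-off-the-loop pattern more directly; a sketch:

import asyncio
from feedfinder2 import find_feeds

# to_thread runs the blocking find_feeds call in a worker thread.
feeds = asyncio.run(asyncio.to_thread(find_feeds, "xkcd.com"))
print(feeds)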
Example #13
    if 'base_urls' not in sitelist:
        raise ValueError(
            "HJSON file doesn't have a base_urls key. Check the news-please input format."
        )

    sitelist = sitelist['base_urls']

    for site in sitelist:
        if 'url' not in site:
            continue

        if site['url'] in blocked_sites:
            continue

        rss_links = find_feeds(site['url'])

        if not rss_links:
            continue

        feed = feedparser.parse(rss_links[0])

        if 'entries' not in feed:
            continue

        print("Processing feed for", site['url'])

        for entry in tqdm(feed['entries']):
            if 'link' not in entry:
                continue
Example #14
# from feedfinder2_upgraded import find_feeds
from feedfinder2 import find_feeds
from pprint import pprint

# url = "http://boingboing.net"
url = "http://localhost:5000/author/1"

feeds = find_feeds(url)
pprint(feeds)
Example #15
from bs4 import BeautifulSoup as bs4
import urllib3
import requests
import feedparser
import urllib.parse
from feedfinder2 import find_feeds

pag = "https://www.cooperativa.cl/noticias/pais/region-de-nuble/este-viernes-se-inicio-el-juicio-oral-por-fraude-al-fisco-en-chillan/2019-06-07/075336.html"
feeds = find_feeds(pag)

print(feeds)
'''
def findfeed(site):
    raw = requests.get(site).text
    result = []
    possible_feeds = []
    html = bs4(raw, 'lxml')
    feed_urls = html.findAll("link", rel="alternate")
    if len(feed_urls) > 0:
        for f in feed_urls:
            t = f.get("type",None)
            if t:
                if "rss" in t or "xml" in t:
                    href = f.get("href",None)
                    if href:
                        possible_feeds.append(href)
    parsed_url = urllib.parse.urlparse(site)
    base = parsed_url.scheme+"://"+parsed_url.hostname
    atags = html.findAll("a")
    for a in atags:
        href = a.get("href",None)