示例#1
0
def scrape_release(source, feed, entry, link):
    """Create or update the Release stored under ``link`` from a feed entry.

    The title comes from ``entry['title']``; the date from the first of the
    entry's 'published', 'updated' or 'a10:updated' values (falling back to
    ``now()``); the body from ``get_link_content(link)``.

    Nothing is saved when ``get_link_content`` returns None.

    ``feed`` is accepted for interface compatibility and is not used here.
    """
    title_text = entry.get('title')
    if title_text is None:
        # An entry without a title would otherwise fail on the conversion below.
        title_text = u''
    elif not isinstance(title_text, unicode):
        # Byte strings have to be *decoded* to text. Calling .encode() on a
        # Python 2 str triggers an implicit ASCII decode first, which raises
        # UnicodeDecodeError as soon as the title holds a non-ASCII byte.
        title_text = title_text.decode('utf-8', 'ignore')
    title = kill_control_characters(title_text)

    date_text = (entry.get('published') or
                 entry.get('updated') or
                 entry.get('a10:updated'))
    date = dateutil.parser.parse(date_text) if date_text else now()

    body = get_link_content(link)
    if body is None:
        return

    # Does not use get_or_create because the unique constraint is just the url
    # and we don't want the source foreign key field to ever be null.
    try:
        release = Release.objects.get(url=link)
    except Release.DoesNotExist:
        Release.objects.create(url=link,
                               source=source,
                               title=title,
                               date=date,
                               body=body)
    else:
        # Existing row: refresh every scraped field in place.
        release.title = title
        release.date = date
        release.body = body
        release.source = source
        release.save()
示例#2
0
 def body(self):
     """Return the extracted body for ``self.url``, fetching it at most once.

     The first call downloads the page, raises for an HTTP error status,
     runs the readability extraction and caches the cleaned result on
     ``self._body``; later calls return the cached value.
     """
     if self._body is not None:
         return self._body

     page = requests.get(self.url)
     page.raise_for_status()
     # Only the body half of the extraction result is kept.
     _ignored_title, extracted = readability_extract(page.content)
     self._body = kill_control_characters(extracted)
     return self._body
示例#3
0
def get_link_content(link):
    """Fetch ``link`` and return its readability-extracted, cleaned body.

    Returns None, after logging a warning, when the link cannot be used:
    the URL is invalid, the server answers 400 or 404, the response carries
    no Content-Type header, the content type is not text/html or
    text/xhtml, or the response body is empty.

    Raises Exception for any other non-200 status code.
    """
    # Only the request itself can raise InvalidURL, so keep the try minimal.
    try:
        response = requests.get(link)
    except requests.exceptions.InvalidURL as e:
        logging.warning(u"Invalid link {0}: {1}".format(link, unicode(e)))
        return None

    status = response.status_code
    if status in (400, 404):
        # A missing page (404) is skipped rather than raised as a fetch
        # failure; the check used to test 400 while logging "404", so real
        # 404s fell through to the exception below. 400 keeps its previous
        # skip behaviour, and the log now shows the actual status code.
        logging.warning(u"{0} {1}".format(status, link))
        return None
    if status != 200:
        raise Exception(u"Unable to fetch release content: {0}".format(link))

    content_type = response.headers.get('content-type')
    if not content_type:
        logging.warning(u"Response did not contain a Content-Type header: {0}".format(link))
        return None

    (mime_type, mime_subtype, mt_params) = parse_mime_type(content_type)
    if mime_type != 'text' or mime_subtype not in ('html', 'xhtml'):
        logging.warning(u"Skipping non-HTML link: {0}".format(link))
        return None

    if not response.content:
        logging.warning(u"Server returned an empty body: {0}".format(link))
        return None

    # The extracted title is discarded; callers only want the body text.
    (title, body) = readability_extract(response.content)
    return kill_control_characters(body)
示例#4
0
    def handle(self, *args, **options):
        """Re-fetch and re-save the Release stored for each URL in ``args``.

        For every http:// or https:// URL the matching Release is looked up,
        its body is replaced with freshly fetched content, its title is
        stripped of control characters and its ``updated`` field is set to
        ``now()``. Other URLs are skipped with a warning. A failure on one
        URL is logged and does not stop the remaining URLs.

        Raises CommandError when settings has no SUPERFASTMATCH attribute.
        """
        if not hasattr(settings, "SUPERFASTMATCH"):
            raise CommandError("You must configure SUPERFASTMATCH in your project settings.")

        self.sfm = from_django_conf()

        for url in args:
            try:
                if not url.startswith(("http://", "https://")):
                    # Report the argument itself: no release has been looked
                    # up on this path, so `release` would be unbound or left
                    # over from a previous iteration.
                    logging.warning("Skipping non-HTTP link {0}".format(url))
                    continue

                release = Release.objects.get(url=url)
                body = get_link_content(release.url)
                if body is None:
                    # get_link_content already logged why; keep the stored
                    # body instead of overwriting it with None.
                    logging.warning("No content fetched for {0}; release left unchanged".format(url))
                    continue

                release.title = kill_control_characters(release.title)
                release.body = body
                release.updated = now()
                release.save()
                logging.info("Updated release {0}: {1}".format(release.id, release.url))
            except Exception as e:
                # Deliberate catch-all boundary: one bad URL must not abort
                # the rest of the batch.
                logging.error("Failed to rescrape {0}: {1}".format(url, str(e)))