def svnauth(self):
    # DO NOT default the user to hg's getuser(). If you provide
    # *any* default username to Subversion, it won't use any remembered
    # username for the desired realm, breaking OS X Keychain support,
    # GNOME keyring support, and all similar tools.
    user = self.ui.config('hgsubversion', 'username')
    passwd = self.ui.config('hgsubversion', 'password')
    url = util.normalize_url(self.path)
    user, passwd, url = svnwrap.parse_url(url, user, passwd)
    return url, user, passwd

def check_steppingstone(page):
    debug(3, "checking: intermediate page leading to article?")
    # steppingstone pages from known repositories:
    redir_patterns = [
        # arxiv.org, springer.com, researchgate, etc.:
        (re.compile('<meta name="citation_pdf_url" content="(.+?)"'),
         (lambda m: page.make_absolute(requests.utils.unquote(m.group(1))))),
        # philpapers.org:
        (re.compile('class=\'outLink\' href="http://philpapers.org/go.pl[^"]+u=(http.+?)"'),
         (lambda m: page.make_absolute(requests.utils.unquote(m.group(1))))),
        # philsci-archive.pitt.edu:
        (re.compile('<meta name="eprints.document_url" content="(.+?)"'),
         (lambda m: page.make_absolute(requests.utils.unquote(m.group(1))))),
        # sciencedirect.com:
        (re.compile('pdfurl="(.+?)"'),
         (lambda m: page.make_absolute(requests.utils.unquote(m.group(1))))),
        # PLOSOne:
        (re.compile('(http://www.plosone.org/article/.+?representation=PDF)" id="downloadPdf"'),
         (lambda m: page.make_absolute(requests.utils.unquote(m.group(1))))),
        # Google Drive:
        (re.compile('content="https://drive.google.com/file/d/(.+?)/'),
         (lambda m: 'https://googledrive.com/host/{}'.format(requests.utils.unquote(m.group(1)))))
    ]
    for (pattern, retr_target) in redir_patterns:
        m = pattern.search(page.html)
        if m:
            target = util.normalize_url(retr_target(m))
            if target == page.url:
                return None
            debug(3, "yes: repository page for %s", target)
            return target
    # other steppingstone pages must have link(s) to a single pdf file:
    targets = set(u for u in page.xpath('//a/@href') if re.search('.pdf$', u, re.I))
    if len(targets) != 1:
        debug(3, "no: %s links to pdf files", len(targets))
        return None
    target = targets.pop()
    debug(3, "yes: single link to pdf file %s", target)
    return util.normalize_url(page.make_absolute(target))

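# A minimal, self-contained sketch of the repository-pattern branch above,
# using urllib.parse.unquote in place of requests.utils.unquote and a toy
# stand-in for the page object (the class, URL, and HTML below are made up
# for illustration):
import re
from urllib.parse import unquote

class FakePage:
    """Toy stand-in for the page object assumed by check_steppingstone."""
    def __init__(self, url, html):
        self.url = url
        self.html = html
    def make_absolute(self, href):
        # a real implementation would resolve relative hrefs against self.url
        return href

page = FakePage('https://example.org/abs/1234',
                '<meta name="citation_pdf_url" content="https://example.org/pdf/1234.pdf">')
pattern = re.compile('<meta name="citation_pdf_url" content="(.+?)"')
m = pattern.search(page.html)
if m:
    print(page.make_absolute(unquote(m.group(1))))  # https://example.org/pdf/1234.pdf
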
def source():
    url = normalize_url(request.args.get('url'))
    db = get_db()
    cur = db.cursor(MySQLdb.cursors.DictCursor)
    query = "SELECT * FROM sources WHERE url = %s"
    cur.execute(query, (url,))
    app.logger.debug(cur._last_executed)
    sources = cur.fetchall()
    if not sources:
        return jsonify({'msg': 'there seems to be no source with that url'})
    source = sources[0]
    return jsonify({'msg': 'OK', 'source': source})

def updateexternals(ui, args, repo, **opts):
    """update repository externals
    """
    if len(args) > 2:
        raise hgutil.Abort(_('updateexternals expects at most one changeset'))
    node = None
    if len(args) == 2:
        svnurl = util.normalize_url(repo.ui.expandpath(args[0]))
        args = args[1:]
    else:
        svnurl = util.normalize_url(repo.ui.expandpath('default'))
    if args:
        node = args[0]
    svnroot = getsvninfo(svnurl)[1]
    # Retrieve current externals status
    try:
        oldext = file(repo.join('svn/externals'), 'rb').read()
    except IOError:
        oldext = ''
    newext = ''
    ctx = repo[node]
    if '.hgsvnexternals' in ctx:
        newext = ctx['.hgsvnexternals'].data()
    updater = externalsupdater(ui, repo)
    actions = computeactions(ui, repo, svnroot, oldext, newext)
    for action, ext in actions:
        if action == 'u':
            updater.update(ext[0], ext[1], ext[2], ext[3])
        elif action == 'd':
            updater.delete(ext[0])
        else:
            raise hgutil.Abort(_('unknown update actions: %r') % action)
    file(repo.join('svn/externals'), 'wb').write(newext)

def get(self, state, *args, **kwargs):
    # Resolve source first
    line = state[0].split(':', 1)[1]
    source, pegrev = parsedefinition(line)[2:4]
    try:
        # Getting the root SVN repository URL is expensive.
        # Assume the externals is absolute.
        source = resolvesource(self._ui, None, source)
    except RelativeSourceError:
        svnurl = self._ctx._repo.ui.expandpath('default')
        svnroot = getsvninfo(util.normalize_url(svnurl))[1]
        source = resolvesource(self._ui, svnroot, source)
    if pegrev is not None:
        source = source + '@' + pegrev
    state = (source, state[1])
    # hg-1.7.4-c19b9282d3a7 introduced the overwrite argument
    return super(svnsubrepo, self).get(state, *args, **kwargs)

def get(self, state, *args, **kwargs):
    # Resolve source first
    line = state[0].split(':', 1)[1]
    source, pegrev = parsedefinition(line)[2:4]
    try:
        # Getting the root SVN repository URL is expensive.
        # Assume the externals is absolute.
        source = resolvesource(self.ui, None, source)
    except RelativeSourceError:
        svnurl = self._ctx._repo.ui.expandpath('default')
        svnroot = getsvninfo(util.normalize_url(svnurl))[1]
        source = resolvesource(self.ui, svnroot, source)
    # hg 1.9 and higher append the rev as a peg revision to
    # the source URL, so we cannot add our own. We assume
    # that "-r10 url@2" will be similar to "url@10" most of
    # the time.
    state = (source, state[1])
    return super(svnsubrepo, self).get(state, *args, **kwargs)

def find_new_pages(self, name):
    # searches for papers pages matching author name, returns urls of new pages
    pages = set()
    search_terms = [
        # careful with google.com: don't block sites.google.com...
        '-site:academia.edu',
        '-site:wikipedia.org',
        '-site:philpapers.org',
        '-filetype:pdf',
        '~philosophy',
        '(publications OR articles OR papers OR "in progress" OR forthcoming)',
    ]
    # search full name first, then last name only:
    search_phrase = u'"{}" '.format(name) + ' '.join(search_terms)
    searchresults = set(self.websearch(search_phrase))
    search_phrase = u'"{}" '.format(name.split()[-1]) + ' '.join(search_terms)
    searchresults |= set(self.websearch(search_phrase))
    for url in searchresults:
        logger.debug("\n")
        url = normalize_url(url)
        # check if url already known:
        cur = self.get_db().cursor()
        cur.execute("SELECT 1 FROM sources WHERE url = %s", (url,))
        rows = cur.fetchall()
        if rows:
            logger.info(u"%s already known", url)
            continue
        try:
            r = self.fetch(url)
        except:
            logger.info(u"cannot retrieve %s", url)
        else:
            score = self.evaluate(r, name)
            if score < 0.7:
                logger.info(u"%s doesn't look like a papers page", url)
                continue
            dupe = self.is_duplicate(url)
            if dupe:
                logger.info(u"%s is a duplicate of already known %s", url, dupe)
                continue
            pages.add(url)
    return pages

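# A quick sketch of the two query strings generated above (the name and the
# trimmed-down term list are placeholders for illustration):
name = 'Jane Doe'
search_terms = ['-site:academia.edu', '-filetype:pdf',
                '(publications OR articles OR papers)']
full_name_query = u'"{}" '.format(name) + ' '.join(search_terms)
last_name_query = u'"{}" '.format(name.split()[-1]) + ' '.join(search_terms)
print(full_name_query)  # "Jane Doe" -site:academia.edu -filetype:pdf (publications OR articles OR papers)
print(last_name_query)  # "Doe" -site:academia.edu -filetype:pdf (publications OR articles OR papers)
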
def get(self, state, *args, **kwargs):
    # Resolve source first
    line = state[0].split(':', 1)[1]
    source, pegrev = parsedefinition(line)[2:4]
    try:
        # Getting the root SVN repository URL is expensive.
        # Assume the externals is absolute.
        source = resolvesource(self._ui, None, source)
    except RelativeSourceError:
        svnurl = self._ctx._repo.ui.expandpath('default')
        svnroot = getsvninfo(util.normalize_url(svnurl))[1]
        source = resolvesource(self._ui, svnroot, source)
    # hg < 1.9 svnsubrepo calls "svn checkout" with --rev
    # only, so peg revisions are correctly used. 1.9 and
    # higher append the rev as a peg revision to the source
    # URL, so we cannot add our own. We assume that "-r10
    # url@2" will be similar to "url@10" most of the time.
    if pegrev is not None and passpegrev:
        source = source + '@' + pegrev
    state = (source, state[1])
    # hg-1.7.4-c19b9282d3a7 introduced the overwrite argument
    return super(svnsubrepo, self).get(state, *args, **kwargs)

def status_file_for(endpoint):
    import os.path, tempfile
    tmpdir = tempfile.gettempdir()
    status = util.normalize_url(endpoint)
    return os.path.join(tmpdir, status)

def editsource():
    source_id = int(request.form['source_id'])
    status = request.form['status']
    source_type = request.form['type']
    url = normalize_url(request.form['url'])
    default_author = request.form['default_author']
    source_name = request.form['name']
    db = get_db()
    cur = db.cursor()
    if request.form['submit'] == 'Remove Source':
        if source_type == '3':
            # remove blog subscription on superfeedr:
            from superscription import Superscription
            ss = Superscription(config('SUPERFEEDR_USER'),
                                password=config('SUPERFEEDR_PASSWORD'))
            success = False
            try:
                app.logger.debug('removing {} from superfeedr'.format(url))
                success = ss.unsubscribe(hub_topic=url)
            except:
                pass
            if not success:
                msg = 'could not unsubscribe blog from superfeedr!'
                if ss.response.status_code:
                    msg += ' status {}'.format(ss.response.status_code)
                else:
                    msg += ' no response from superfeedr server'
                return jsonify({'msg': msg})
        query = "DELETE FROM sources WHERE source_id = %s"
        cur.execute(query, (source_id,))
        app.logger.debug(cur._last_executed)
        db.commit()
        return jsonify({'msg': 'OK'})
    else:
        if source_id != 0:
            # Is it sensible to change the url of an existing source
            # page? Not really if an author has moved their site with
            # all their documents, because then all the links and
            # document URLs are also new. But sometimes the url
            # changes and all the links remain the same, e.g. when
            # Kent Bach's homepage moved from /~kbach to /kbach.
            query = ("UPDATE sources set url=%s, status=%s, type=%s, "
                     "default_author=%s, name=%s WHERE source_id=%s")
            cur.execute(query, (url, status, source_type, default_author,
                                source_name, source_id))
            app.logger.debug(cur._last_executed)
            db.commit()
        else:
            query = '''INSERT INTO sources (url, status, type, default_author, name)
                       VALUES (%s, %s, %s, %s, %s)
                       ON DUPLICATE KEY UPDATE status=%s, type=%s, default_author=%s,
                       name=%s, source_id=LAST_INSERT_ID(source_id)'''
            cur.execute(query, (url, status, source_type, default_author, source_name,
                                status, source_type, default_author, source_name))
            app.logger.debug(cur._last_executed)
            db.commit()
            insert_id = cur.lastrowid
            if source_type == '3':
                # register new blog subscription on superfeedr:
                from superscription import Superscription
                ss = Superscription(config('SUPERFEEDR_USER'),
                                    password=config('SUPERFEEDR_PASSWORD'))
                msg = 'could not register blog on superfeedr!'
                try:
                    callback = request.url_root + 'new_blog_post/{}'.format(insert_id)
                    app.logger.debug('subscribing to {} on {} via superfeedr'.format(url, callback))
                    success = ss.subscribe(hub_topic=url, hub_callback=callback)
                    if success:
                        return jsonify({'msg': 'OK'})
                except:
                    if ss.response.status_code:
                        msg += ' status {}'.format(ss.response.status_code)
                    else:
                        msg += ' no response from superfeedr server'
                return jsonify({'msg': msg})
        return jsonify({'msg': 'OK'})

def scrape(source, keep_tempfiles=False):
    """
    Look for new papers linked to on the source page (and check for
    revisions to known papers).

    Issues to keep in mind:

    (1) Links on personal pages often lead to old papers that have been
    published long ago. (That's true even for newly added links, when
    people upload older papers.) We don't want to list these papers in
    the news feed, nor do we need to check them for revisions. So if we
    find a link to an old and published paper, we treat it like a link
    to a non-paper. (If a manuscript changes into a published paper, we
    keep the paper in the database because it still ought to show up as
    "new papers found on x/y/z" and because it might have been used to
    train personal filters, but we remove the doc_id from the link,
    thereby marking the link as known but irrelevant.)

    (2) Sometimes links to papers are temporarily broken, or there's a
    temporary problem with the metadata extraction. So if we encounter
    an error while processing a (promising) new link, we try again once
    or twice in the course of the next week (using link.found_date).

    (3) To check for revisions of existing manuscripts (and, more
    unusually, new papers appearing at an old url), we have to
    occasionally re-process known links. But we don't want to re-parse
    all documents all the time. Instead, we select a few old papers
    (i.e., links with an associated doc_id that are still on the page,
    ordered by last_checked).

    (4) We could remove all links from the db that are no longer on the
    page, but it's probably not worth the effort. Moreover, pages are
    sometimes temporarily replaced by "under maintenance" pages (for
    example), and then we may not want to re-process all links once the
    page comes back. So we simply ignore disappeared links: they remain
    in the db, but they are never revisited until they reappear on the
    page.

    (5) If a page is processed for the first time (status==0 in the db),
    we don't want to display all linked papers in the news feed.
    Nonetheless, we process all links so that we can check for revisions
    (think of the Stanford Encyclopedia). To avoid displaying the papers
    as new, we mark them with a found_date of 1970.
    """
    debug(1, "checking links on %s", source.url)

    # go to page:
    browser = Browser(use_virtual_display=True)
    try:
        browser.goto(source.url)
    except Exception as e:
        logger.warning('connection to source %s failed: %s', source.url, str(e))
        source.update_db(status=error.code['connection failed'])
        return 0

    if browser.current_url != source.url:
        # Redirects of journal pages are OK (e.g. from /current to
        # /nov-2015), but redirects of personal papers pages are often
        # caused by pages having disappeared; the redirect can then
        # take us e.g. to CMU's general document archive; we don't
        # want that. So here we wait for manual approval of the new
        # url.
        if source.sourcetype == 'personal':
            logger.warning('%s redirects to %s', source.url, browser.current_url)
            source.update_db(status=301)
            return 0
        else:
            debug(2, '%s redirected to %s', source.url, browser.current_url)

    # look for new links:
    source.set_html(browser.page_source)
    new_links = {}  # url => Link
    old_links = {}  # url => Link
    for li in browser.find_elements_by_tag_name("a"):
        if not li.is_displayed() or not li.get_attribute('href'):
            continue
        href = li.get_attribute('href')
        if is_bad_url(href):
            debug(3, 'ignoring link to %s (bad url)', href)
            continue
        href = util.normalize_url(source.make_absolute(href))
        old_link = source.old_link(href)
        if old_link:
            debug(3, 'link to %s is old: %s', href, old_link.url)
            old_links[href] = old_link
            old_links[href].element = li
        else:
            debug(1, 'new link: "%s" %s', li.text, href)
            new_links[href] = Link(url=href, source=source, element=li)

    # Selenium doesn't tell us when a site yields a 404, 401, 500
    # etc. error. But we can usually tell from the fact that there are
    # few known links on the error page:
    debug(1, 'status {}, old links: {}'.format(source.status, len(old_links.keys())))
    if source.status > 0 and len(old_links.keys()) <= 1:
        debug(1, 'suspiciously few old links, checking status code')
        status, r = util.request_url(source.url)
        if status != 200:
            logger.warning('error %s at source %s', status, source.url)
            source.update_db(status=status)
            return 0
    source.update_db(status=1)

    # process new links:
    if new_links:
        for li in new_links.values():
            debug(1, '\nprocessing new link to %s', li.url)
            process_link(li)
            # for testing: one link only
            # return 1
    else:
        debug(1, "no new links")

    # re-process recently found old links that generated errors:
    for li in old_links.values():
        if li.status > 9:
            tdelta = datetime.now() - li.found_date
            if tdelta.days < 5:
                debug(1, 're-checking recent link %s with status %s', li.url, li.status)
                process_link(li, force_reprocess=True)

    # re-check old links to papers for revisions:
    MAX_REVCHECK = 3
    goodlinks = [li for li in old_links.values() if li.doc_id]
    for li in sorted(goodlinks, key=lambda x: x.last_checked)[:MAX_REVCHECK]:
        debug(1, 're-checking old link to paper %s for revisions', li.url)
        process_link(li)

    if not keep_tempfiles:
        remove_tempdir()

def process_link(li, force_reprocess=False, redir_url=None, keep_tempfiles=False,
                 recurse=0):
    """
    Fetch url, check for http errors and steppingstones, filter spam,
    parse candidate papers, check for duplicates, check if published
    before last year.

    Links often lead to intermediate pages (e.g. on repositories) with
    another link to the actual paper. In this case, we only store the
    original link in the 'links' table, so the 'doc' entry has a url
    that doesn't match any link. To process the new link, process_link
    is called again, with redir_url set to the new url and recurse += 1.

    If force_reprocess is False and the link has already been checked
    at some point, if_modified_since and etag headers are sent.
    """
    # ignore links to old and published papers:
    li.context = li.html_context()
    debug(2, "link context: %s", li.context)
    if context_suggests_published(li.context):
        li.update_db(status=1, doc_id=None)
        return 0

    # fetch url and handle errors, redirects, etc.:
    url = redir_url or li.url
    r = li.fetch(url=url, only_if_modified=not force_reprocess)
    if not r:
        return 0
    if r.url != url:  # redirected
        url = util.normalize_url(r.url)
        # now we treat li as if it directly led to the redirected document

    if r.filetype not in ('html', 'pdf', 'doc', 'rtf'):
        li.update_db(status=error.code['unsupported filetype'])
        return debug(1, "unsupported filetype: %s", r.filetype)

    doc = Doc(url=url, r=r, link=li, source=li.source)

    if r.filetype == 'html':
        r.encoding = 'utf-8'
        doc.page = Webpage(url, html=r.text)
        debug(6, "\n====== %s ======\n%s\n======\n", url, r.text)

        # check for steppingstone pages with link to a paper:
        target_url = check_steppingstone(doc.page)
        if target_url and recurse < 3:
            debug(1, "steppingstone to %s", target_url)
            return process_link(li, redir_url=target_url,
                                force_reprocess=force_reprocess,
                                recurse=recurse + 1)

        # Genuine papers are almost never in HTML format, and almost
        # every HTML page is not a paper. The few exceptions (such as
        # entries on SEP) tend to require special parsing. Hence the
        # following special treatment. If people start posting
        # articles on medium or in plain HTML, we might return to the
        # old procedure of converting the page to pdf and treating it
        # like any candidate paper.
        doc.content = doc.page.text()
        doc.numwords = len(doc.content.split())
        doc.numpages = 1
        import docparser.webpageparser as htmlparser
        if not htmlparser.parse(doc):
            debug(1, "page ignored")
            li.update_db(status=1)
            return 0

    else:
        # save as pdf:
        try:
            doc.tempfile = save_local(r)
        except:
            return li.update_db(status=error.code['cannot save local file'])
        if r.filetype != 'pdf':
            try:
                doc.tempfile = convert_to_pdf(doc.tempfile)
            except:
                debug(1, 'pdf conversion failed!')
                return li.update_db(status=error.code['pdf conversion failed'])
        try:
            pdfmeta = pdfinfo(doc.tempfile)
            doc.numpages = int(pdfmeta['Pages'])
        except:
            debug(1, 'pdfinfo failed!')
            return li.update_db(status=error.code['pdfinfo failed'])
        debug(2, 'pdf has %s pages', doc.numpages)

        # convert to xml:
        doc.xmlfile = doc.tempfile.rsplit('.', 1)[0] + '.xml'
        if doc.numpages > 10:
            # ocr only first 7 + last 3 pages if necessary:
            ocr_ranges = [(1, 7), (doc.numpages - 2, doc.numpages)]
        else:
            ocr_ranges = None
        try:
            engine = pdf2xml(doc.tempfile, doc.xmlfile,
                             keep_tempfiles=keep_tempfiles,
                             ocr_ranges=ocr_ranges)
        except Exception as e:
            debug(1, "converting pdf to xml failed: %s", e)
            return li.update_db(status=error.code['pdf conversion failed'])
        doc.content = util.strip_xml(readfile(doc.xmlfile))
        debug(5, "text content:\n%s", doc.content)
        if engine == 'pdftohtml':
            doc.numwords = len(doc.content.split())
        else:
            doc.ocr = True
            if doc.numpages > 10:
                # extrapolate numwords from numpages and the number of
                # words on the ocr'ed pages:
                doc.numwords = len(doc.content.split()) * doc.numpages / 10
            else:
                doc.numwords = len(doc.content.split())

    # guess doc type (paper, book, review, etc.):
    import doctyper.doctyper as doctyper
    doc.doctype = doctyper.evaluate(doc)

    # extract metadata:
    import docparser.paperparser as paperparser
    if not paperparser.parse(doc, keep_tempfiles=keep_tempfiles):
        logger.warning("metadata extraction failed for %s", url)
        li.update_db(status=error.code['parser error'])
        return 0

    # estimate whether doc is not a handout, cv etc.:
    import doctyper.paperfilter as paperfilter
    paperprob = paperfilter.evaluate(doc)
    doc.is_paper = int(paperprob * 100)
    if doc.is_paper < 50:
        li.update_db(status=1)
        debug(1, "spam: paper score %s < 50", doc.is_paper)
        return 0

    # estimate whether doc is on philosophy:
    import doctyper.philosophyfilter as philosophyfilter
    try:
        philprob = philosophyfilter.evaluate(doc)
    except UntrainedClassifierException as e:
        philprob = 0.9
    doc.is_philosophy = int(philprob * 100)
    if doc.is_philosophy < 50:
        li.update_db(status=1)
        debug(1, "spam: philosophy score %s < 50", doc.is_philosophy)
        return 0

    # TODO: classify for main topics?

    if li.doc_id:
        # check for revisions:
        olddoc = Doc(doc_id=li.doc_id)
        olddoc.load_from_db()
        if doc.content != olddoc.content:
            sm = SequenceMatcher(None, doc.content, olddoc.content)
            match_ratio = sm.ratio()
            if match_ratio < 0.8:
                debug(1, "substantive revisions, ratio %s", match_ratio)
                doc.earlier_id = olddoc.doc_id
        if not doc.earlier_id:
            li.update_db(status=1)
            debug(1, "no substantive revisions")
            return 0
    else:
        # check for duplicates:
        dupe = get_duplicate(doc)
        if dupe:
            debug(1, "duplicate of document %s", dupe.doc_id)
            li.update_db(status=1, doc_id=dupe.doc_id)
            return 0
        # ignore old and published paper:
        if paper_is_old(doc):
            li.update_db(status=1, doc_id=None)
            debug(1, "ignoring already published paper")
            return 0

    # don't show papers (incl HTML pages) from newly added source
    # pages in news feed:
    if doc.source.status == 0:
        debug(2, "new source page: setting found_date to 1970")
        doc.found_date = '1970-01-01 12:00:00'

    doc.update_db()
    li.update_db(status=1, doc_id=doc.doc_id)

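# The revision check in process_link relies on difflib's SequenceMatcher
# ratio with a 0.8 cutoff; a minimal, self-contained illustration of that
# threshold (the sample strings are made up):
from difflib import SequenceMatcher

old = "We argue that knowledge is justified true belief."
minor_edit = "We argue that knowledge is justified, true belief."
rewrite = "This paper defends a safety condition on knowledge."

print(SequenceMatcher(None, minor_edit, old).ratio())  # close to 1.0: no substantive revision
print(SequenceMatcher(None, rewrite, old).ratio())     # well below 0.8: counts as a revision
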