def svnauth(self):
    # DO NOT default the user to hg's getuser(). If you provide
    # *any* default username to Subversion, it won't use any remembered
    # username for the desired realm, breaking OS X Keychain support,
    # GNOME keyring support, and all similar tools.
    user = self.ui.config('hgsubversion', 'username')
    passwd = self.ui.config('hgsubversion', 'password')
    url = util.normalize_url(self.path)
    user, passwd, url = svnwrap.parse_url(url, user, passwd)
    return url, user, passwd

def check_steppingstone(page):
    debug(3, "checking: intermediate page leading to article?")
    # steppingstone pages from known repositories:
    redir_patterns = [
        # arxiv.org, springer.com, researchgate, etc.:
        (re.compile('<meta name="citation_pdf_url" content="(.+?)"'),
         (lambda m: page.make_absolute(requests.utils.unquote(m.group(1))))),
        # philpapers.org:
        (re.compile('class=\'outLink\' href="http://philpapers.org/go.pl[^"]+u=(http.+?)"'),
         (lambda m: page.make_absolute(requests.utils.unquote(m.group(1))))),
        # philsci-archive.pitt.edu:
        (re.compile('<meta name="eprints.document_url" content="(.+?)"'),
         (lambda m: page.make_absolute(requests.utils.unquote(m.group(1))))),
        # sciencedirect.com:
        (re.compile('pdfurl="(.+?)"'),
         (lambda m: page.make_absolute(requests.utils.unquote(m.group(1))))),
        # PLOSOne:
        (re.compile('(http://www.plosone.org/article/.+?representation=PDF)" id="downloadPdf"'),
         (lambda m: page.make_absolute(requests.utils.unquote(m.group(1))))),
        # Google Drive:
        (re.compile('content="https://drive.google.com/file/d/(.+?)/'),
         (lambda m: 'https://googledrive.com/host/{}'.format(requests.utils.unquote(m.group(1)))))
    ]
    for (pattern, retr_target) in redir_patterns:
        m = pattern.search(page.html)
        if m:
            target = util.normalize_url(retr_target(m))
            if target == page.url:
                return None
            debug(3, "yes: repository page for %s", target)
            return target
    # other steppingstone pages must have link(s) to a single pdf file:
    targets = set(u for u in page.xpath('//a/@href') if re.search('.pdf$', u, re.I))
    if len(targets) != 1:
        debug(3, "no: %s links to pdf files", len(targets))
        return None
    target = targets.pop()
    debug(3, "yes: single link to pdf file %s", target)
    return util.normalize_url(page.make_absolute(target))

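# A minimal, self-contained sketch of the repository-pattern branch above,
# using urllib.parse.unquote in place of requests.utils.unquote and a toy
# stand-in for the page object (the class, URL, and HTML below are made up
# for illustration):
import re
from urllib.parse import unquote

class FakePage:
    """Toy stand-in for the page object assumed by check_steppingstone."""
    def __init__(self, url, html):
        self.url = url
        self.html = html
    def make_absolute(self, href):
        # a real implementation would resolve relative hrefs against self.url
        return href

page = FakePage('https://example.org/abs/1234',
                '<meta name="citation_pdf_url" content="https://example.org/pdf/1234.pdf">')
pattern = re.compile('<meta name="citation_pdf_url" content="(.+?)"')
m = pattern.search(page.html)
if m:
    print(page.make_absolute(unquote(m.group(1))))  # https://example.org/pdf/1234.pdf
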
def source():
    url = normalize_url(request.args.get('url'))
    db = get_db()
    cur = db.cursor(MySQLdb.cursors.DictCursor)
    query = "SELECT * FROM sources WHERE url = %s"
    cur.execute(query, (url,))
    app.logger.debug(cur._last_executed)
    sources = cur.fetchall()
    if not sources:
        return jsonify({'msg': 'there seems to be no source with that url'})
    source = sources[0]
    return jsonify({'msg': 'OK', 'source': source})

def updateexternals(ui, args, repo, **opts):
    """update repository externals
    """
    if len(args) > 2:
        raise hgutil.Abort(_('updateexternals expects at most one changeset'))
    node = None
    if len(args) == 2:
        svnurl = util.normalize_url(repo.ui.expandpath(args[0]))
        args = args[1:]
    else:
        svnurl = util.normalize_url(repo.ui.expandpath('default'))
    if args:
        node = args[0]
    svnroot = getsvninfo(svnurl)[1]
    # Retrieve current externals status
    try:
        oldext = file(repo.join('svn/externals'), 'rb').read()
    except IOError:
        oldext = ''
    newext = ''
    ctx = repo[node]
    if '.hgsvnexternals' in ctx:
        newext = ctx['.hgsvnexternals'].data()
    updater = externalsupdater(ui, repo)
    actions = computeactions(ui, repo, svnroot, oldext, newext)
    for action, ext in actions:
        if action == 'u':
            updater.update(ext[0], ext[1], ext[2], ext[3])
        elif action == 'd':
            updater.delete(ext[0])
        else:
            raise hgutil.Abort(_('unknown update actions: %r') % action)
    file(repo.join('svn/externals'), 'wb').write(newext)

def get(self, state, *args, **kwargs):
    # Resolve source first
    line = state[0].split(':', 1)[1]
    source, pegrev = parsedefinition(line)[2:4]
    try:
        # Getting the root SVN repository URL is expensive.
        # Assume the externals is absolute.
        source = resolvesource(self._ui, None, source)
    except RelativeSourceError:
        svnurl = self._ctx._repo.ui.expandpath('default')
        svnroot = getsvninfo(util.normalize_url(svnurl))[1]
        source = resolvesource(self._ui, svnroot, source)
    if pegrev is not None:
        source = source + '@' + pegrev
    state = (source, state[1])
    # hg-1.7.4-c19b9282d3a7 introduced the overwrite argument
    return super(svnsubrepo, self).get(state, *args, **kwargs)

def get(self, state, *args, **kwargs):
    # Resolve source first
    line = state[0].split(':', 1)[1]
    source, pegrev = parsedefinition(line)[2:4]
    try:
        # Getting the root SVN repository URL is expensive.
        # Assume the externals is absolute.
        source = resolvesource(self.ui, None, source)
    except RelativeSourceError:
        svnurl = self._ctx._repo.ui.expandpath('default')
        svnroot = getsvninfo(util.normalize_url(svnurl))[1]
        source = resolvesource(self.ui, svnroot, source)
    # hg 1.9 and higher append the rev as a peg revision to
    # the source URL, so we cannot add our own. We assume
    # that "-r10 url@2" will be similar to "url@10" most of
    # the time.
    state = (source, state[1])
    return super(svnsubrepo, self).get(state, *args, **kwargs)

def find_new_pages(self, name):
    # searches for papers pages matching author name, returns urls of new pages
    pages = set()
    search_terms = [
        # careful with google.com: don't block sites.google.com...
        '-site:academia.edu',
        '-site:wikipedia.org',
        '-site:philpapers.org',
        '-filetype:pdf',
        '~philosophy',
        '(publications OR articles OR papers OR "in progress" OR forthcoming)',
    ]
    # search full name first, then last name only:
    search_phrase = u'"{}" '.format(name) + ' '.join(search_terms)
    searchresults = set(self.websearch(search_phrase))
    search_phrase = u'"{}" '.format(name.split()[-1]) + ' '.join(search_terms)
    searchresults |= set(self.websearch(search_phrase))
    for url in searchresults:
        logger.debug("\n")
        url = normalize_url(url)
        # check if url already known:
        cur = self.get_db().cursor()
        cur.execute("SELECT 1 FROM sources WHERE url = %s", (url,))
        rows = cur.fetchall()
        if rows:
            logger.info(u"%s already known", url)
            continue
        try:
            r = self.fetch(url)
        except:
            logger.info(u"cannot retrieve %s", url)
        else:
            score = self.evaluate(r, name)
            if score < 0.7:
                logger.info(u"%s doesn't look like a papers page", url)
                continue
            dupe = self.is_duplicate(url)
            if dupe:
                logger.info(u"%s is a duplicate of already known %s", url, dupe)
                continue
            pages.add(url)
    return pages

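# A quick sketch of the two query strings generated above (the name and the
# trimmed-down term list are placeholders for illustration):
name = 'Jane Doe'
search_terms = ['-site:academia.edu', '-filetype:pdf',
                '(publications OR articles OR papers)']
full_name_query = u'"{}" '.format(name) + ' '.join(search_terms)
last_name_query = u'"{}" '.format(name.split()[-1]) + ' '.join(search_terms)
print(full_name_query)  # "Jane Doe" -site:academia.edu -filetype:pdf (publications OR articles OR papers)
print(last_name_query)  # "Doe" -site:academia.edu -filetype:pdf (publications OR articles OR papers)
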
def get(self, state, *args, **kwargs):
    # Resolve source first
    line = state[0].split(':', 1)[1]
    source, pegrev = parsedefinition(line)[2:4]
    try:
        # Getting the root SVN repository URL is expensive.
        # Assume the externals is absolute.
        source = resolvesource(self._ui, None, source)
    except RelativeSourceError:
        svnurl = self._ctx._repo.ui.expandpath('default')
        svnroot = getsvninfo(util.normalize_url(svnurl))[1]
        source = resolvesource(self._ui, svnroot, source)
    # hg < 1.9 svnsubrepo calls "svn checkout" with --rev
    # only, so peg revisions are correctly used. 1.9 and
    # higher append the rev as a peg revision to the source
    # URL, so we cannot add our own. We assume that "-r10
    # url@2" will be similar to "url@10" most of the time.
    if pegrev is not None and passpegrev:
        source = source + '@' + pegrev
    state = (source, state[1])
    # hg-1.7.4-c19b9282d3a7 introduced the overwrite argument
    return super(svnsubrepo, self).get(state, *args, **kwargs)

def status_file_for(endpoint):
    import os.path, tempfile
    tmpdir = tempfile.gettempdir()
    status = util.normalize_url(endpoint)
    return os.path.join(tmpdir, status)

def editsource():
    source_id = int(request.form['source_id'])
    status = request.form['status']
    source_type = request.form['type']
    url = normalize_url(request.form['url'])
    default_author = request.form['default_author']
    source_name = request.form['name']
    db = get_db()
    cur = db.cursor()
    if request.form['submit'] == 'Remove Source':
        if source_type == '3':
            # remove blog subscription on superfeedr:
            from superscription import Superscription
            ss = Superscription(config('SUPERFEEDR_USER'),
                                password=config('SUPERFEEDR_PASSWORD'))
            success = False
            try:
                app.logger.debug('removing {} from superfeedr'.format(url))
                success = ss.unsubscribe(hub_topic=url)
            except:
                pass
            if not success:
                msg = 'could not unsubscribe blog from superfeedr!'
                if ss.response.status_code:
                    msg += ' status {}'.format(ss.response.status_code)
                else:
                    msg += ' no response from superfeedr server'
                return jsonify({'msg': msg})
        query = "DELETE FROM sources WHERE source_id = %s"
        cur.execute(query, (source_id,))
        app.logger.debug(cur._last_executed)
        db.commit()
        return jsonify({'msg': 'OK'})
    else:
        if source_id != 0:
            # Is it sensible to change the url of an existing source
            # page? Not really if an author has moved their site with
            # all their documents, because then all the links and
            # document URLs are also new. But sometimes the url
            # changes and all the links remain the same, e.g. when
            # Kent Bach's homepage moved from /~kbach to /kbach.
            query = ("UPDATE sources set url=%s, status=%s, type=%s, "
                     "default_author=%s, name=%s WHERE source_id=%s")
            cur.execute(query, (url, status, source_type, default_author,
                                source_name, source_id))
            app.logger.debug(cur._last_executed)
            db.commit()
        else:
            query = '''INSERT INTO sources (url, status, type, default_author, name)
                       VALUES (%s, %s, %s, %s, %s)
                       ON DUPLICATE KEY UPDATE status=%s, type=%s, default_author=%s,
                       name=%s, source_id=LAST_INSERT_ID(source_id)'''
            cur.execute(query, (url, status, source_type, default_author, source_name,
                                status, source_type, default_author, source_name))
            app.logger.debug(cur._last_executed)
            db.commit()
            insert_id = cur.lastrowid
            if source_type == '3':
                # register new blog subscription on superfeedr:
                from superscription import Superscription
                ss = Superscription(config('SUPERFEEDR_USER'),
                                    password=config('SUPERFEEDR_PASSWORD'))
                msg = 'could not register blog on superfeedr!'
                try:
                    callback = request.url_root + 'new_blog_post/{}'.format(insert_id)
                    app.logger.debug('subscribing to {} on {} via superfeedr'.format(url, callback))
                    success = ss.subscribe(hub_topic=url, hub_callback=callback)
                    if success:
                        return jsonify({'msg': 'OK'})
                except:
                    if ss.response.status_code:
                        msg += ' status {}'.format(ss.response.status_code)
                    else:
                        msg += ' no response from superfeedr server'
                return jsonify({'msg': msg})
        return jsonify({'msg': 'OK'})

def scrape(source, keep_tempfiles=False):
    """
    Look for new papers linked to on the source page (and check for
    revisions to known papers).

    Issues to keep in mind:

    (1) Links on personal pages often lead to old papers that have been
    published long ago. (That's true even for newly added links, when
    people upload older papers.) We don't want to list these papers in
    the news feed, nor do we need to check them for revisions. So if we
    find a link to an old and published paper, we treat it like a link
    to a non-paper. (If a manuscript changes into a published paper, we
    keep the paper in the database because it still ought to show up as
    "new papers found on x/y/z" and because it might have been used to
    train personal filters, but we remove the doc_id from the link,
    thereby marking the link as known but irrelevant.)

    (2) Sometimes links to papers are temporarily broken, or there's a
    temporary problem with the metadata extraction. So if we encounter
    an error while processing a (promising) new link, we try again once
    or twice in the course of the next week (using link.found_date).

    (3) To check for revisions of existing manuscripts (and, more
    unusually, new papers appearing at an old url), we have to
    occasionally re-process known links. But we don't want to re-parse
    all documents all the time. Instead, we select a few old papers
    (i.e., links with an associated doc_id that are still on the page,
    ordered by last_checked).

    (4) We could remove all links from the db that are no longer on the
    page, but it's probably not worth the effort. Moreover, pages are
    sometimes temporarily replaced by "under maintenance" pages (for
    example), and then we may not want to re-process all links once the
    page comes back. So we simply ignore disappeared links: they remain
    in the db, but they are never revisited until they reappear on the
    page.

    (5) If a page is processed for the first time (status==0 in the db),
    we don't want to display all linked papers in the news feed.
    Nonetheless, we process all links so that we can check for revisions
    (think of the Stanford Encyclopedia). To avoid displaying the papers
    as new, we mark them with a found_date of 1970.
    """
    debug(1, "checking links on %s", source.url)

    # go to page:
    browser = Browser(use_virtual_display=True)
    try:
        browser.goto(source.url)
    except Exception as e:
        logger.warning('connection to source %s failed: %s', source.url, str(e))
        source.update_db(status=error.code['connection failed'])
        return 0

    if browser.current_url != source.url:
        # Redirects of journal pages are OK (e.g. from /current to
        # /nov-2015), but redirects of personal papers pages are often
        # caused by pages having disappeared; the redirect can then
        # take us e.g. to CMU's general document archive; we don't
        # want that. So here we wait for manual approval of the new
        # url.
        if source.sourcetype == 'personal':
            logger.warning('%s redirects to %s', source.url, browser.current_url)
            source.update_db(status=301)
            return 0
        else:
            debug(2, '%s redirected to %s', source.url, browser.current_url)

    # look for new links:
    source.set_html(browser.page_source)
    new_links = {}  # url => Link
    old_links = {}  # url => Link
    for li in browser.find_elements_by_tag_name("a"):
        if not li.is_displayed() or not li.get_attribute('href'):
            continue
        href = li.get_attribute('href')
        if is_bad_url(href):
            debug(3, 'ignoring link to %s (bad url)', href)
            continue
        href = util.normalize_url(source.make_absolute(href))
        old_link = source.old_link(href)
        if old_link:
            debug(3, 'link to %s is old: %s', href, old_link.url)
            old_links[href] = old_link
            old_links[href].element = li
        else:
            debug(1, 'new link: "%s" %s', li.text, href)
            new_links[href] = Link(url=href, source=source, element=li)

    # Selenium doesn't tell us when a site yields a 404, 401, 500
    # etc. error. But we can usually tell from the fact that there are
    # few known links on the error page:
    debug(1, 'status {}, old links: {}'.format(source.status, len(old_links.keys())))
    if source.status > 0 and len(old_links.keys()) <= 1:
        debug(1, 'suspiciously few old links, checking status code')
        status, r = util.request_url(source.url)
        if status != 200:
            logger.warning('error %s at source %s', status, source.url)
            source.update_db(status=status)
            return 0
    source.update_db(status=1)

    # process new links:
    if new_links:
        for li in new_links.values():
            debug(1, '\nprocessing new link to %s', li.url)
            process_link(li)
            # for testing: one link only
            # return 1
    else:
        debug(1, "no new links")

    # re-process recently found old links that generated errors:
    for li in old_links.values():
        if li.status > 9:
            tdelta = datetime.now() - li.found_date
            if tdelta.days < 5:
                debug(1, 're-checking recent link %s with status %s', li.url, li.status)
                process_link(li, force_reprocess=True)

    # re-check old links to papers for revisions:
    MAX_REVCHECK = 3
    goodlinks = [li for li in old_links.values() if li.doc_id]
    for li in sorted(goodlinks, key=lambda x: x.last_checked)[:MAX_REVCHECK]:
        debug(1, 're-checking old link to paper %s for revisions', li.url)
        process_link(li)

    if not keep_tempfiles:
        remove_tempdir()

def process_link(li, force_reprocess=False, redir_url=None, keep_tempfiles=False,
                 recurse=0):
    """
    Fetch url, check for http errors and steppingstones, filter spam,
    parse candidate papers, check for duplicates, check if published
    before last year.

    Links often lead to intermediate pages (e.g. on repositories) with
    another link to the actual paper. In this case, we only store the
    original link in the 'links' table, so the 'doc' entry has a url
    that doesn't match any link. To process the new link, process_link
    is called again, with redir_url set to the new url and recurse += 1.

    If force_reprocess is False and the link has already been checked
    at some point, if_modified_since and etag headers are sent.
    """
    # ignore links to old and published papers:
    li.context = li.html_context()
    debug(2, "link context: %s", li.context)
    if context_suggests_published(li.context):
        li.update_db(status=1, doc_id=None)
        return 0

    # fetch url and handle errors, redirects, etc.:
    url = redir_url or li.url
    r = li.fetch(url=url, only_if_modified=not force_reprocess)
    if not r:
        return 0
    if r.url != url:  # redirected
        url = util.normalize_url(r.url)
        # now we treat li as if it directly led to the redirected document

    if r.filetype not in ('html', 'pdf', 'doc', 'rtf'):
        li.update_db(status=error.code['unsupported filetype'])
        return debug(1, "unsupported filetype: %s", r.filetype)

    doc = Doc(url=url, r=r, link=li, source=li.source)

    if r.filetype == 'html':
        r.encoding = 'utf-8'
        doc.page = Webpage(url, html=r.text)
        debug(6, "\n====== %s ======\n%s\n======\n", url, r.text)

        # check for steppingstone pages with link to a paper:
        target_url = check_steppingstone(doc.page)
        if target_url and recurse < 3:
            debug(1, "steppingstone to %s", target_url)
            return process_link(li, redir_url=target_url,
                                force_reprocess=force_reprocess,
                                recurse=recurse + 1)

        # Genuine papers are almost never in HTML format, and almost
        # every HTML page is not a paper. The few exceptions (such as
        # entries on SEP) tend to require special parsing. Hence the
        # following special treatment. If people start posting
        # articles on medium or in plain HTML, we might return to the
        # old procedure of converting the page to pdf and treating it
        # like any candidate paper.
        doc.content = doc.page.text()
        doc.numwords = len(doc.content.split())
        doc.numpages = 1
        import docparser.webpageparser as htmlparser
        if not htmlparser.parse(doc):
            debug(1, "page ignored")
            li.update_db(status=1)
            return 0

    else:
        # save as pdf:
        try:
            doc.tempfile = save_local(r)
        except:
            return li.update_db(status=error.code['cannot save local file'])
        if r.filetype != 'pdf':
            try:
                doc.tempfile = convert_to_pdf(doc.tempfile)
            except:
                debug(1, 'pdf conversion failed!')
                return li.update_db(status=error.code['pdf conversion failed'])
        try:
            pdfmeta = pdfinfo(doc.tempfile)
            doc.numpages = int(pdfmeta['Pages'])
        except:
            debug(1, 'pdfinfo failed!')
            return li.update_db(status=error.code['pdfinfo failed'])
        debug(2, 'pdf has %s pages', doc.numpages)

        # convert to xml:
        doc.xmlfile = doc.tempfile.rsplit('.', 1)[0] + '.xml'
        if doc.numpages > 10:
            # ocr only first 7 + last 3 pages if necessary:
            ocr_ranges = [(1, 7), (doc.numpages - 2, doc.numpages)]
        else:
            ocr_ranges = None
        try:
            engine = pdf2xml(doc.tempfile, doc.xmlfile,
                             keep_tempfiles=keep_tempfiles,
                             ocr_ranges=ocr_ranges)
        except Exception as e:
            debug(1, "converting pdf to xml failed: %s", e)
            return li.update_db(status=error.code['pdf conversion failed'])
        doc.content = util.strip_xml(readfile(doc.xmlfile))
        debug(5, "text content:\n%s", doc.content)
        if engine == 'pdftohtml':
            doc.numwords = len(doc.content.split())
        else:
            doc.ocr = True
            if doc.numpages > 10:
                # extrapolate numwords from numpages and the number of
                # words on the ocr'ed pages:
                doc.numwords = len(doc.content.split()) * doc.numpages / 10
            else:
                doc.numwords = len(doc.content.split())

    # guess doc type (paper, book, review, etc.):
    import doctyper.doctyper as doctyper
    doc.doctype = doctyper.evaluate(doc)

    # extract metadata:
    import docparser.paperparser as paperparser
    if not paperparser.parse(doc, keep_tempfiles=keep_tempfiles):
        logger.warning("metadata extraction failed for %s", url)
        li.update_db(status=error.code['parser error'])
        return 0

    # estimate whether doc is not a handout, cv etc.:
    import doctyper.paperfilter as paperfilter
    paperprob = paperfilter.evaluate(doc)
    doc.is_paper = int(paperprob * 100)
    if doc.is_paper < 50:
        li.update_db(status=1)
        debug(1, "spam: paper score %s < 50", doc.is_paper)
        return 0

    # estimate whether doc is on philosophy:
    import doctyper.philosophyfilter as philosophyfilter
    try:
        philprob = philosophyfilter.evaluate(doc)
    except UntrainedClassifierException as e:
        philprob = 0.9
    doc.is_philosophy = int(philprob * 100)
    if doc.is_philosophy < 50:
        li.update_db(status=1)
        debug(1, "spam: philosophy score %s < 50", doc.is_philosophy)
        return 0

    # TODO: classify for main topics?

    if li.doc_id:
        # check for revisions:
        olddoc = Doc(doc_id=li.doc_id)
        olddoc.load_from_db()
        if doc.content != olddoc.content:
            sm = SequenceMatcher(None, doc.content, olddoc.content)
            match_ratio = sm.ratio()
            if match_ratio < 0.8:
                debug(1, "substantive revisions, ratio %s", match_ratio)
                doc.earlier_id = olddoc.doc_id
        if not doc.earlier_id:
            li.update_db(status=1)
            debug(1, "no substantive revisions")
            return 0
    else:
        # check for duplicates:
        dupe = get_duplicate(doc)
        if dupe:
            debug(1, "duplicate of document %s", dupe.doc_id)
            li.update_db(status=1, doc_id=dupe.doc_id)
            return 0
        # ignore old and published paper:
        if paper_is_old(doc):
            li.update_db(status=1, doc_id=None)
            debug(1, "ignoring already published paper")
            return 0

    # don't show papers (incl HTML pages) from newly added source
    # pages in news feed:
    if doc.source.status == 0:
        debug(2, "new source page: setting found_date to 1970")
        doc.found_date = '1970-01-01 12:00:00'

    doc.update_db()
    li.update_db(status=1, doc_id=doc.doc_id)

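# The revision check in process_link relies on difflib's SequenceMatcher
# ratio with a 0.8 cutoff; a minimal, self-contained illustration of that
# threshold (the sample strings are made up):
from difflib import SequenceMatcher

old = "We argue that knowledge is justified true belief."
minor_edit = "We argue that knowledge is justified, true belief."
rewrite = "This paper defends a safety condition on knowledge."

print(SequenceMatcher(None, minor_edit, old).ratio())  # close to 1.0: no substantive revision
print(SequenceMatcher(None, rewrite, old).ratio())     # well below 0.8: counts as a revision
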