def test_transform_ipynb_uri():
    """Check that ``transform_ipynb_uri`` maps each input to its expected path."""
    # Each entry is (input uri, expected transformed path).
    cases = (
        # GIST_RGX
        ('1234', u'/1234'),
        ('1234/', u'/1234'),
        # GIST_URL_RGX
        ('https://gist.github.com/username/1234', u'/1234'),
        ('https://gist.github.com/username/1234/', u'/1234'),
        # GITHUB_URL_RGX
        ('https://github.com/user/repo/blob/master/path/file.ipynb',
         u'/github/user/repo/blob/master/path/file.ipynb'),
        ('http://github.com/user/repo/blob/master/path/file.ipynb',
         u'/github/user/repo/blob/master/path/file.ipynb'),
        # URL
        ('https://example.org/ipynb', u'/urls/example.org/ipynb'),
        ('http://example.org/ipynb', u'/url/example.org/ipynb'),
        ('example.org/ipynb', u'/url/example.org/ipynb'),
        (u'example.org/ipynb', u'/url/example.org/ipynb'),
        ('https://gist.github.com/user/1234/raw/a1b2c3/file.ipynb',
         u'/urls/gist.github.com/user/1234/raw/a1b2c3/file.ipynb'),
    )
    for given, wanted in cases:
        actual = transform_ipynb_uri(given)
        assert actual == wanted
def test_transform_ipynb_uri():
    """Check ``utils.transform_ipynb_uri`` against a table of known inputs.

    The rewrite list passed to the function is built from
    ``provider_uri_rewrites(default_rewrites)``.
    """
    # Each entry is (input uri, expected transformed path).
    cases = (
        # GIST_RGX
        ("1234", u"/1234"),
        ("1234/", u"/1234"),
        # GIST_URL_RGX
        ("https://gist.github.com/user-name/1234", u"/1234"),
        ("https://gist.github.com/user-name/1234/", u"/1234"),
        # GITHUB_URL_RGX
        ("https://github.com/user-name_/repo-name_/blob/master/path/file.ipynb",
         u"/github/user-name_/repo-name_/blob/master/path/file.ipynb"),
        ("http://github.com/user-name_/repo-name_/blob/master/path/file.ipynb",
         u"/github/user-name_/repo-name_/blob/master/path/file.ipynb"),
        ("https://github.com/user-name_/repo-name_/tree/master/path/",
         u"/github/user-name_/repo-name_/tree/master/path/"),
        # GITHUB_USER_RGX
        ("ipy-thon", u"/github/ipy-thon/"),
        # GITHUB_USER_REPO_RGX
        ("ipy-thon/ipy-thon", u"/github/ipy-thon/ipy-thon/tree/master/"),
        # DropBox Urls
        (u"http://www.dropbox.com/s/bar/baz.qux",
         u"/url/dl.dropbox.com/s/bar/baz.qux"),
        (u"https://www.dropbox.com/s/zip/baz.qux",
         u"/urls/dl.dropbox.com/s/zip/baz.qux"),
        (u"https://www.dropbox.com/sh/mhviow274da2wly/CZKwRRcA0k/nested/furthernested/User%2520Interface.ipynb?dl=1",
         u"/urls/dl.dropbox.com/sh/mhviow274da2wly/CZKwRRcA0k/nested/furthernested/User%2520Interface.ipynb"),
        # URL
        ("https://example.org/ipynb", u"/urls/example.org/ipynb"),
        ("http://example.org/ipynb", u"/url/example.org/ipynb"),
        ("example.org/ipynb", u"/url/example.org/ipynb"),
        (u"example.org/ipynb", u"/url/example.org/ipynb"),
        ("https://gist.github.com/user/1234/raw/a1b2c3/file.ipynb",
         u"/urls/gist.github.com/user/1234/raw/a1b2c3/file.ipynb"),
        ("https://gist.github.com/user/1234/raw/a1b2c3/file.ipynb?query=string&is=1",
         u"/urls/gist.github.com/user/1234/raw/a1b2c3/file.ipynb/%3Fquery%3Dstring%26is%3D1"),
    )
    rewrites = provider_uri_rewrites(default_rewrites)
    for given, wanted in cases:
        actual = utils.transform_ipynb_uri(given, rewrites)
        # Spell out the offending case so a failure is self-explanatory.
        failure_msg = "%s => %s != %s" % (given, actual, wanted)
        nt.assert_equal(actual, wanted, failure_msg)
def test_transform_ipynb_uri():
    """Check ``utils.transform_ipynb_uri`` against a table of known inputs.

    The rewrite list passed to the function is built from
    ``provider_uri_rewrites(default_rewrites)``.
    """
    # Each entry is (input uri, expected transformed path).
    cases = (
        # GIST_RGX
        ('1234', u'/1234'),
        ('1234/', u'/1234'),
        # GIST_URL_RGX
        ('https://gist.github.com/user-name/1234', u'/1234'),
        ('https://gist.github.com/user-name/1234/', u'/1234'),
        # GITHUB_URL_RGX
        ('https://github.com/user-name_/repo-name_/blob/master/path/file.ipynb',
         u'/github/user-name_/repo-name_/blob/master/path/file.ipynb'),
        ('http://github.com/user-name_/repo-name_/blob/master/path/file.ipynb',
         u'/github/user-name_/repo-name_/blob/master/path/file.ipynb'),
        ('https://github.com/user-name_/repo-name_/tree/master/path/',
         u'/github/user-name_/repo-name_/tree/master/path/'),
        # GITHUB_USER_RGX
        ('ipy-thon', u'/github/ipy-thon/'),
        # GITHUB_USER_REPO_RGX
        ('ipy-thon/ipy-thon', u'/github/ipy-thon/ipy-thon/tree/master/'),
        # DropBox Urls
        (u'http://www.dropbox.com/s/bar/baz.qux',
         u'/url/dl.dropbox.com/s/bar/baz.qux'),
        (u'https://www.dropbox.com/s/zip/baz.qux',
         u'/urls/dl.dropbox.com/s/zip/baz.qux'),
        (u'https://www.dropbox.com/sh/mhviow274da2wly/CZKwRRcA0k/nested/furthernested/User%2520Interface.ipynb?dl=1',
         u'/urls/dl.dropbox.com/sh/mhviow274da2wly/CZKwRRcA0k/nested/furthernested/User%2520Interface.ipynb'),
        # URL
        ('https://example.org/ipynb', u'/urls/example.org/ipynb'),
        ('http://example.org/ipynb', u'/url/example.org/ipynb'),
        ('example.org/ipynb', u'/url/example.org/ipynb'),
        (u'example.org/ipynb', u'/url/example.org/ipynb'),
        ('https://gist.github.com/user/1234/raw/a1b2c3/file.ipynb',
         u'/urls/gist.github.com/user/1234/raw/a1b2c3/file.ipynb'),
        ('https://gist.github.com/user/1234/raw/a1b2c3/file.ipynb?query=string&is=1',
         u'/urls/gist.github.com/user/1234/raw/a1b2c3/file.ipynb/%3Fquery%3Dstring%26is%3D1'),
    )
    rewrites = provider_uri_rewrites(default_rewrites)
    for given, wanted in cases:
        actual = utils.transform_ipynb_uri(given, rewrites)
        # Spell out the offending case so a failure is self-explanatory.
        failure_msg = "%s => %s != %s" % (given, actual, wanted)
        nt.assert_equal(actual, wanted, failure_msg)
def test_transform_ipynb_uri():
    """Check ``utils.transform_ipynb_uri`` against a table of known inputs."""
    # Each entry is (input uri, expected transformed path).
    cases = (
        # GIST_RGX
        ('1234', u'/1234'),
        ('1234/', u'/1234'),
        # GIST_URL_RGX
        ('https://gist.github.com/user-name/1234', u'/1234'),
        ('https://gist.github.com/user-name/1234/', u'/1234'),
        # GITHUB_URL_RGX
        ('https://github.com/user-name_/repo-name_/blob/master/path/file.ipynb',
         u'/github/user-name_/repo-name_/blob/master/path/file.ipynb'),
        ('http://github.com/user-name_/repo-name_/blob/master/path/file.ipynb',
         u'/github/user-name_/repo-name_/blob/master/path/file.ipynb'),
        ('https://github.com/user-name_/repo-name_/tree/master/path/',
         u'/github/user-name_/repo-name_/tree/master/path/'),
        # GITHUB_USER_RGX
        ('ipy-thon', u'/github/ipy-thon/'),
        # GITHUB_USER_REPO_RGX
        ('ipy-thon/ipy-thon', u'/github/ipy-thon/ipy-thon/tree/master/'),
        # DropBox Urls
        (u'http://www.dropbox.com/s/bar/baz.qux',
         u'/url/dl.dropbox.com/s/bar/baz.qux'),
        (u'https://www.dropbox.com/s/zip/baz.qux',
         u'/urls/dl.dropbox.com/s/zip/baz.qux'),
        (u'https://www.dropbox.com/sh/mhviow274da2wly/CZKwRRcA0k/nested/furthernested/User%2520Interface.ipynb',
         u'/urls/dl.dropbox.com/sh/mhviow274da2wly/CZKwRRcA0k/nested/furthernested/User%2520Interface.ipynb'),
        # URL
        ('https://example.org/ipynb', u'/urls/example.org/ipynb'),
        ('http://example.org/ipynb', u'/url/example.org/ipynb'),
        ('example.org/ipynb', u'/url/example.org/ipynb'),
        (u'example.org/ipynb', u'/url/example.org/ipynb'),
        ('https://gist.github.com/user/1234/raw/a1b2c3/file.ipynb',
         u'/urls/gist.github.com/user/1234/raw/a1b2c3/file.ipynb'),
    )
    for given, wanted in cases:
        actual = utils.transform_ipynb_uri(given)
        # Spell out the offending case so a failure is self-explanatory.
        failure_msg = "%s => %s != %s" % (given, actual, wanted)
        nt.assert_equal(actual, wanted, failure_msg)
def test_transform_ipynb_uri():
    """Check ``utils.transform_ipynb_uri`` against a table of known inputs.

    The rewrite list passed to the function is built from
    ``provider_uri_rewrites(default_rewrites)``.
    """
    # Each entry is (input uri, expected transformed path).
    cases = (
        # GIST_RGX
        ("1234", u"/1234"),
        ("1234/", u"/1234"),
        # GIST_URL_RGX
        ("https://gist.github.com/user-name/1234", u"/1234"),
        ("https://gist.github.com/user-name/1234/", u"/1234"),
        # GITHUB_URL_RGX
        ("https://github.com/user-name_/repo-name_/blob/master/path/file.ipynb",
         u"/github/user-name_/repo-name_/blob/master/path/file.ipynb"),
        ("http://github.com/user-name_/repo-name_/blob/master/path/file.ipynb",
         u"/github/user-name_/repo-name_/blob/master/path/file.ipynb"),
        ("https://github.com/user-name_/repo-name_/tree/master/path/",
         u"/github/user-name_/repo-name_/tree/master/path/"),
        # GITHUB_USER_RGX
        ("ipy-thon", u"/github/ipy-thon/"),
        # GITHUB_USER_REPO_RGX
        ("ipy-thon/ipy-thon", u"/github/ipy-thon/ipy-thon/tree/master/"),
        # DropBox Urls
        (u"http://www.dropbox.com/s/bar/baz.qux",
         u"/url/dl.dropbox.com/s/bar/baz.qux"),
        (u"https://www.dropbox.com/s/zip/baz.qux",
         u"/urls/dl.dropbox.com/s/zip/baz.qux"),
        (u"https://www.dropbox.com/sh/mhviow274da2wly/CZKwRRcA0k/nested/furthernested/User%2520Interface.ipynb?dl=1",
         u"/urls/dl.dropbox.com/sh/mhviow274da2wly/CZKwRRcA0k/nested/furthernested/User%2520Interface.ipynb"),
        # URL
        ("https://example.org/ipynb", u"/urls/example.org/ipynb"),
        ("http://example.org/ipynb", u"/url/example.org/ipynb"),
        ("example.org/ipynb", u"/url/example.org/ipynb"),
        (u"example.org/ipynb", u"/url/example.org/ipynb"),
        ("https://gist.github.com/user/1234/raw/a1b2c3/file.ipynb",
         u"/urls/gist.github.com/user/1234/raw/a1b2c3/file.ipynb"),
    )
    rewrites = provider_uri_rewrites(default_rewrites)
    for given, wanted in cases:
        actual = utils.transform_ipynb_uri(given, rewrites)
        # Spell out the offending case so a failure is self-explanatory.
        failure_msg = "%s => %s != %s" % (given, actual, wanted)
        nt.assert_equal(actual, wanted, failure_msg)
def test_transform_ipynb_uri():
    """Check ``utils.transform_ipynb_uri`` against a table of known inputs."""
    # Each entry is (input uri, expected transformed path).
    cases = (
        # GIST_RGX
        ('1234', u'/1234'),
        ('1234/', u'/1234'),
        # GIST_URL_RGX
        ('https://gist.github.com/username/1234', u'/1234'),
        ('https://gist.github.com/username/1234/', u'/1234'),
        # GITHUB_URL_RGX
        ('https://github.com/user/repo/blob/master/path/file.ipynb',
         u'/github/user/repo/blob/master/path/file.ipynb'),
        ('http://github.com/user/repo/blob/master/path/file.ipynb',
         u'/github/user/repo/blob/master/path/file.ipynb'),
        # GITHUB_USER_RGX
        ('ipython', u'/github/ipython/'),
        # GITHUB_USERREPO_RGX
        ('ipython/ipython', u'/github/ipython/ipython/tree/master/'),
        # DropBox Urls
        (u'http://www.dropbox.com/s/bar/baz.qux',
         u'/url/dl.dropbox.com/s/bar/baz.qux'),
        (u'https://www.dropbox.com/s/zip/baz.qux',
         u'/urls/dl.dropbox.com/s/zip/baz.qux'),
        (u'https://www.dropbox.com/sh/mhviow274da2wly/CZKwRRcA0k/nested/furthernested/User%2520Interface.ipynb',
         u'/urls/dl.dropbox.com/sh/mhviow274da2wly/CZKwRRcA0k/nested/furthernested/User%2520Interface.ipynb'),
        # URL
        ('https://example.org/ipynb', u'/urls/example.org/ipynb'),
        ('http://example.org/ipynb', u'/url/example.org/ipynb'),
        ('example.org/ipynb', u'/url/example.org/ipynb'),
        (u'example.org/ipynb', u'/url/example.org/ipynb'),
        ('https://gist.github.com/user/1234/raw/a1b2c3/file.ipynb',
         u'/urls/gist.github.com/user/1234/raw/a1b2c3/file.ipynb'),
    )
    for given, wanted in cases:
        actual = utils.transform_ipynb_uri(given)
        nt.assert_equal(actual, wanted)
def test_transform_ipynb_uri():
    """Check ``utils.transform_ipynb_uri`` against a table of known inputs."""
    # Each entry is (input uri, expected transformed path).
    cases = (
        # GIST_RGX
        ('1234', u'/1234'),
        ('1234/', u'/1234'),
        # GIST_URL_RGX
        ('https://gist.github.com/username/1234', u'/1234'),
        ('https://gist.github.com/username/1234/', u'/1234'),
        # GITHUB_URL_RGX
        ('https://github.com/user/repo/blob/master/path/file.ipynb',
         u'/github/user/repo/blob/master/path/file.ipynb'),
        ('http://github.com/user/repo/blob/master/path/file.ipynb',
         u'/github/user/repo/blob/master/path/file.ipynb'),
        # DropBox Urls
        (u'http://www.dropbox.com/u/bar/baz.qux',
         u'/url/dl.dropbox.com/u/bar/baz.qux'),
        (u'https://www.dropbox.com/u/zip/baz.qux',
         u'/urls/dl.dropbox.com/u/zip/baz.qux'),
        # URL
        ('https://example.org/ipynb', u'/urls/example.org/ipynb'),
        ('http://example.org/ipynb', u'/url/example.org/ipynb'),
        ('example.org/ipynb', u'/url/example.org/ipynb'),
        (u'example.org/ipynb', u'/url/example.org/ipynb'),
        ('https://gist.github.com/user/1234/raw/a1b2c3/file.ipynb',
         u'/urls/gist.github.com/user/1234/raw/a1b2c3/file.ipynb'),
    )
    for given, wanted in cases:
        actual = utils.transform_ipynb_uri(given)
        nt.assert_equal(actual, wanted)
def test_transform_ipynb_uri():
    """Check ``utils.transform_ipynb_uri`` against a table of known inputs."""
    # Each entry is (input uri, expected transformed path).
    cases = (
        # GIST_RGX
        ('1234', u'/1234'),
        ('1234/', u'/1234'),
        # GIST_URL_RGX
        ('https://gist.github.com/username/1234', u'/1234'),
        ('https://gist.github.com/username/1234/', u'/1234'),
        # GITHUB_URL_RGX
        ('https://github.com/user/repo/blob/master/path/file.ipynb',
         u'/github/user/repo/blob/master/path/file.ipynb'),
        ('http://github.com/user/repo/blob/master/path/file.ipynb',
         u'/github/user/repo/blob/master/path/file.ipynb'),
        # URL
        ('https://example.org/ipynb', u'/urls/example.org/ipynb'),
        ('http://example.org/ipynb', u'/url/example.org/ipynb'),
        ('example.org/ipynb', u'/url/example.org/ipynb'),
        (u'example.org/ipynb', u'/url/example.org/ipynb'),
        ('https://gist.github.com/user/1234/raw/a1b2c3/file.ipynb',
         u'/urls/gist.github.com/user/1234/raw/a1b2c3/file.ipynb'),
    )
    for given, wanted in cases:
        actual = utils.transform_ipynb_uri(given)
        assert actual == wanted
def _truncate_words(text, max_words):
    """Return ``text`` cut to ``max_words`` space-separated words, adding ' ...' when cut."""
    words = text.split(' ')
    if len(words) > max_words:
        return ' '.join(words[:max_words]) + ' ...'
    return text


def insert_notebook(url, screenshot=True, nb=None):
    """Download a notebook page, extract its metadata and save it as a Notebook.

    Parameters
    ----------
    url : str
        Address of the notebook, or of an nbviewer page that renders it.
    screenshot : bool
        When true, ``make_screenshots`` is called and the thumbnail it
        reports is stored on the record.
    nb : Notebook or None
        Existing record to refresh.  When None, a record is fetched or
        created from ``url``.

    Returns
    -------
    dict
        ``{'status': 'success', 'pk': ..., 'created': ...}`` on success.
        ``{'status': 'failure', 'reason': ...}`` when the page cannot be
        downloaded.  When the screenshot step reports a failure, the dict
        returned by ``make_screenshots`` is passed through unchanged.

    Raises
    ------
    AssertionError
        If the title, description, html url or url exceeds the length
        limits checked before saving.
    """
    # TODO: do ajax-based async
    from web.models import Notebook

    # sanitize url
    # NOTE(review): this rewrites every 'https' substring, not only the
    # scheme -- confirm that is intended.
    url = url.replace('https', 'http')
    is_nbviewer = False
    try:
        url = unshorten_url(url)
        r = requests.get(url)
        # A response without a Content-Type header is treated as non-HTML
        # instead of raising an uncaught KeyError.
        if 'text/html' in r.headers.get('content-type', ''):
            # check that it's a notebook
            tmp_html = urlopen(url)
            is_nbviewer = ("Notebook on nbviewer" in tmp_html)
        if is_nbviewer:
            html_url = url
        else:
            html_url = urlparse.urljoin('http://nbviewer.ipython.org',
                                        transform_ipynb_uri(url))
        print('Downloading %s' % html_url)
        html = urlopen(html_url)
    except (urllib2.HTTPError, urllib2.URLError, socket.timeout,
            ssl.SSLError, requests.exceptions.SSLError,
            requests.sessions.InvalidSchema) as e:
        if nb is not None:
            nb.failures_access += 1
        print('Failed in downloading', e)
        return {'status': 'failure',
                'reason': 'Failed accessing the notebook'}

    extracted = extraction.Extractor().extract(html, source_url=html_url)

    # The second title is preferred, then the second description.  When
    # neither exists fall back to a placeholder rather than indexing past
    # the end of the list.
    if len(extracted.titles) > 1:
        title = extracted.titles[1]
    elif len(extracted.descriptions) > 1:
        title = extracted.descriptions[1]
    else:
        title = 'No title'
    title = _truncate_words(title, 20)

    if len(extracted.descriptions) > 1:
        description = extracted.descriptions[1]
    else:
        description = ''
    description = _truncate_words(description, 40)

    # some more sanitation
    if title.startswith('This web site does not host'):
        # this is the nbviewer default title
        title = 'No title'
    title = title.strip(u'¶')

    #similar = Notebook.objects.filter(title=title, description=description)
    #if len(Notebook.objects.filter(title=title, description=description)) > 0:
        #return {'status': 'failure', 'reason': 'duplicate document', 'pk': similar[0].pk}

    if nb is None:
        obj, created = Notebook.objects.get_or_create(url=url)
    else:
        obj = nb
        created = False

    # screenshot
    if screenshot:
        out = make_screenshots(html_url, obj.pk)
        if out['status'] == 'failure':
            if created:
                # Do not keep a record that was only created by this call.
                obj.delete()
            else:
                # NOTE(review): the counter is not saved here; presumably
                # the caller persists it -- verify.
                obj.failures_access += 1
            return out
        else:
            obj.thumb_img = out['thumb']

    # XXX remove assert with error messages
    assert len(title) < 500
    obj.title = title
    assert len(description) < 2000
    obj.description = description
    assert len(html_url) < 1000
    obj.html_url = html_url
    assert len(url) < 1000
    obj.url = url

    obj.full_html = html
    obj.last_accessed_date = datetime.now().date()
    obj.save()
    return {'status': 'success', 'pk': obj.pk, 'created': created}