# Module imports reconstructed for these tests.  The exact import locations
# follow datalad / datalad-crawler conventions and are partly assumptions:
import os.path as op
from collections import OrderedDict
from glob import glob
from os.path import exists
from unittest.mock import patch

from datalad.api import Dataset, create, crawl, crawl_init
from datalad.dochelpers import exc_str
from datalad.downloaders.tests.utils import get_test_providers
from datalad.support.annexrepo import AnnexRepo
from datalad.support.exceptions import MissingExternalDependency
from datalad.support.github_ import _get_github_cred
from datalad.tests.utils import (
    SkipTest, assert_false, assert_greater, assert_in, assert_raises,
    eq_, externals_use_cassette, ok_clean_git, ok_file_has_content,
    ok_file_under_git,
)
from datalad.utils import _path_, chpwd
from datalad_crawler.consts import CRAWLER_META_CONFIG_PATH, CRAWLER_META_DIR

# NB: fixture arguments such as `ind`, `topurl`, `outd`, `path`, `tmpdir`
# and the TEST_TREE2 tree are supplied by decorators (with_tree,
# serve_path_via_http, with_tempfile) in the full suite and are not shown
# here.


def check_crawl_autoaddtext(gz, ind, topurl, outd):
    ds = create(outd)
    ds.run_procedure("cfg_text2git")
    with chpwd(outd):  # TODO -- dataset argument
        template_kwargs = {
            'url': topurl,
            'a_href_match_': '.*',
        }
        if gz:
            # raw string so the backslash is not treated as an escape
            template_kwargs['archives_re'] = r"\.gz$"
        crawl_init(template_kwargs, save=True, template='simple_with_archives')
        try:
            crawl()
        except MissingExternalDependency as exc:
            raise SkipTest(exc_str(exc))
    ok_clean_git(outd)
    ok_file_under_git(outd, "anothertext", annexed=False)
    ok_file_under_git(outd, "d/textfile", annexed=False)
    ok_file_under_git(outd, "d/tooshort", annexed=True)

    if 'compressed.dat.gz' in TEST_TREE2:
        if gz:
            ok_file_under_git(outd, "compressed.dat", annexed=False)
            ok_file_has_content(op.join(outd, "compressed.dat"),
                                u"мама мыла раму")
        else:
            ok_file_under_git(outd, "compressed.dat.gz", annexed=True)
    else:
        raise SkipTest(
            "Need datalad >= 0.11.2 to test .gz files decompression")
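# A minimal driver sketch for the check above, in nose generator style:
# exercise both the plain and the .gz (archive extraction) code paths.
# The decorator stack supplying `ind`, `topurl` and `outd` is assumed,
# not shown:
def test_crawl_autoaddtext_variants():
    for gz in (False, True):
        yield check_crawl_autoaddtext, gz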
def test_obscure_names(path):
    bucket = "datalad-test2-obscurenames-versioned"
    get_test_providers('s3://' + bucket)  # to verify having s3 credentials
    create(path)
    with externals_use_cassette(
            'test_simple_s3_test2_obscurenames_versioned_crawl_ext'), \
            chpwd(path):
        crawl_init(template="simple_s3", args=dict(bucket=bucket), save=True)
        crawl()
    # fun with unicode was postponed
    ok_clean_git(path, annex=True)
    for f in ['f &$=@:+,?;', "f!-_.*'( )", 'f 1', 'f [1][2]']:
        ok_file_under_git(path, f, annexed=True)
def test_crawl_autoaddtext(ind, topurl, outd):
    ds = create(outd, text_no_annex=True)
    with chpwd(outd):  # TODO -- dataset argument
        crawl_init(
            {'url': topurl, 'a_href_match_': '.*'},
            save=True,
            template='simple_with_archives')
        crawl()
    ok_clean_git(outd)
    ok_file_under_git(outd, "anothertext", annexed=False)
    ok_file_under_git(outd, "d/textfile", annexed=False)
    ok_file_under_git(outd, "d/tooshort", annexed=True)
def _test_crawl_init_error_patch(return_value, exc, exc_msg, d):
    ar = AnnexRepo(d, create=True)  # create the repo crawl_init will act on
    with patch('datalad_crawler.crawl_init.load_pipeline_from_template',
               return_value=lambda dataset: return_value) as cm:
        with chpwd(d):
            with assert_raises(exc) as cm2:
                crawl_init(args=['dataset=Baltimore'], template='openfmri')
            assert_in(exc_msg, str(cm2.exception))
            cm.assert_called_with(
                'openfmri', None, return_only=True,
                kwargs=OrderedDict([('dataset', 'Baltimore')]))
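# Hedged usage sketch for the helper above.  The patched pipeline values
# and the expected error-message fragments are illustrative assumptions,
# not crawl_init's actual strings:
def test_crawl_init_error_patch():
    import tempfile
    for return_value, exc, exc_msg in [
            ({1: 2}, ValueError, "pipeline"),  # a dict is not a valid pipeline
            (None, ValueError, "pipeline"),    # nor is None
    ]:
        _test_crawl_init_error_patch(return_value, exc, exc_msg,
                                     tempfile.mkdtemp())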
def _test_crawl_init(args, template, template_func, save, target_value, tmpdir):
    ar = AnnexRepo(tmpdir, create=True)  # create the repo crawl_init will act on
    with chpwd(tmpdir):
        crawl_init(args=args, template=template,
                   template_func=template_func, save=save)
        eq_(exists(CRAWLER_META_DIR), True)
        eq_(exists(CRAWLER_META_CONFIG_PATH), True)
        with open(CRAWLER_META_CONFIG_PATH) as f:
            contents = f.read()
        eq_(contents, target_value)
        if save:
            ds = Dataset(tmpdir)
            ok_clean_git(tmpdir, annex=isinstance(ds.repo, AnnexRepo))
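# Hedged usage sketch.  The expected crawl.cfg contents (target_value) are
# an illustrative assumption about the serialization format, not a value
# taken from the real suite:
def test_crawl_init_openfmri():
    import tempfile
    target_value = (
        '[crawl:pipeline]\n'
        'template = openfmri\n'
        '_kwargs = dataset=Baltimore\n'
    )
    _test_crawl_init(['dataset=Baltimore'], 'openfmri', None, True,
                     target_value, tempfile.mkdtemp())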
def test_crawl(tempd):
    if not _get_github_cred().is_known:
        raise SkipTest("no github credential")
    ds = create(tempd)
    with chpwd(tempd):
        crawl_init(
            template='gh', save=True,
            args={'org': 'datalad-collection-1', 'include': 'kaggle'})
        crawl()
    subdss = ds.subdatasets(fulfilled=True, result_xfm='datasets')
    assert all('kaggle' in d.path for d in subdss)
    assert_greater(len(subdss), 1)
    assert_false(ds.repo.dirty)
def _test_drop(path, drop_immediately):
    s3url = 's3://datalad-test0-nonversioned'
    providers = get_test_providers(s3url)  # to verify having s3 credentials
    # The vcr tape gets bound to the session object, so we need to force
    # re-establishing the session for the bucket.
    # TODO (in datalad): make a dedicated API for that, now too obscure
    _ = providers.get_status(s3url, allow_old_session=False)
    create(path)
    # Unfortunately this doesn't work without force dropping, since vcr
    # seemingly stops and then gets queried again for the same tape while
    # testing for drop :-/
    with chpwd(path):
        crawl_init(
            template="simple_s3",
            args=dict(
                bucket="datalad-test0-nonversioned",
                drop=True,
                drop_force=True,  # so test goes faster
                drop_immediately=drop_immediately,
            ),
            save=True)
    if drop_immediately:
        # Cannot figure out why, but taping that interaction results in a
        # git-annex addurl error.  No time to figure it out, so we just
        # crawl without vcr for now.  TODO: figure out WTF
        with chpwd(path):
            crawl()
    else:
        with externals_use_cassette(
                'test_simple_s3_test0_nonversioned_crawl_ext'), \
                chpwd(path):
            crawl()

    # test that all was dropped
    repo = AnnexRepo(path, create=False)
    files = glob(_path_(path, '*'))
    eq_(len(files), 8)
    for f in files:
        assert_false(repo.file_has_content(f))
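# Hedged driver sketch: exercise both drop modes.  A fresh temporary
# directory stands in for whatever fixture supplies `path` in the real
# suite:
def test_drop_modes():
    import tempfile
    for drop_immediately in (False, True):
        _test_drop(tempfile.mkdtemp(), drop_immediately)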
def test_drop(path):
    get_test_providers('s3://datalad-test0-nonversioned')  # to verify having s3 credentials
    create(path)
    # Unfortunately this doesn't work without force dropping, since vcr
    # seemingly stops and then gets queried again for the same tape while
    # testing for drop :-/
    with externals_use_cassette('test_simple_s3_test0_nonversioned_crawl_ext'), \
            chpwd(path):
        crawl_init(
            template="simple_s3",
            args=dict(
                bucket="datalad-test0-nonversioned",
                drop=True,
                drop_force=True  # so test goes faster
            ),
            save=True)
        crawl()
    # test that all was dropped
    repo = AnnexRepo(path, create=False)
    files = glob(_path_(path, '*'))
    eq_(len(files), 8)
    for f in files:
        assert_false(repo.file_has_content(f))