def test_addurls_dropped_urls(self=None, path=None):
    ds = Dataset(path).create(force=True)
    with swallow_logs(new_level=logging.WARNING) as cml:
        ds.addurls(self.json_file, "", "{subdir}//{name}",
                   result_renderer='disabled')
        assert_re_in(r".*Dropped [0-9]+ row\(s\) that had an empty URL",
                     str(cml.out))
def test_dashh():
    stdout, stderr = run_main(['-h'])
    # Note: for -h we do not do an ad-hoc tune-up of the Usage: line to
    # guarantee it says datalad instead of python -m nose etc., so we can
    # only verify that options are listed
    assert_re_in(r'^Usage: .*\[', stdout.splitlines()[0])
    assert_all_commands_present(stdout)
    assert_re_in('Use .--help. to get more comprehensive information',
                 stdout.splitlines())
def test_incorrect_msg_interpolation():
    with assert_raises(TypeError) as cme:
        TestUtils2().__call__()
    # this must be our custom exception
    assert_re_in("Failed to render.*kaboom.*not enough arguments",
                 str(cme.value))

    # there should be no exception if the path reported in the record
    # contains a %
    TestUtils2().__call__("%eatthis")
def test_eval_results_plus_build_doc():
    # test docs

    # docstring was built already:
    with swallow_logs(new_level=logging.DEBUG) as cml:
        TestUtils().__call__(1)
        assert_not_in("Building doc for", cml.out)

    # docstring accessible both ways:
    doc1 = Dataset.fake_command.__doc__
    doc2 = TestUtils().__call__.__doc__

    # docstring was built from Test_Util's definition:
    assert_equal(doc1, doc2)
    assert_in("TestUtil's fake command", doc1)
    assert_in("Parameters", doc1)
    assert_in("It's a number", doc1)

    # docstring shows the correct override values of defaults in eval_params
    assert_re_in("Default:\\s+'tailored'", doc1, match=False)
    assert_re_in("Default:\\s+'item-or-list'", doc1, match=False)

    # docstring also contains eval_result's parameters:
    assert_in("result_filter", doc1)
    assert_in("return_type", doc1)
    assert_in("list", doc1)
    assert_in("None", doc1)
    assert_in("return value behavior", doc1)
    assert_in("dictionary is passed", doc1)

    # test that eval_results is able to determine the class, a method of
    # which it is decorating:
    with swallow_logs(new_level=logging.DEBUG) as cml:
        Dataset('/does/not/matter').fake_command(3)
        assert_in("Determined class of decorated function: {}"
                  "".format(TestUtils().__class__), cml.out)

    # test results:
    result = TestUtils().__call__(2)
    assert_equal(len(list(result)), 2)
    result = Dataset('/does/not/matter').fake_command(3)
    assert_equal(len(list(result)), 3)

    # test absent side-effect of popping eval_defaults
    kwargs = dict(return_type='list')
    TestUtils().__call__(2, **kwargs)
    assert_equal(list(kwargs), ['return_type'])

    # test signature:
    from datalad.utils import getargspec
    assert_equal(getargspec(Dataset.fake_command)[0],
                 ['number', 'dataset', 'result_fn'])
    assert_equal(getargspec(TestUtils.__call__)[0],
                 ['number', 'dataset', 'result_fn'])
def test_assert_re_in():
    assert_re_in(".*", "")
    assert_re_in(".*", ["any"])
    # should do match not search
    assert_re_in("ab", "abc")
    assert_raises(AssertionError, assert_re_in, "ab", "cab")
    assert_raises(AssertionError, assert_re_in, "ab$", "abc")

    # Sufficient to have one entry matching
    assert_re_in("ab", ["", "abc", "laskdjf"])
    assert_raises(AssertionError, assert_re_in, "ab$", ["ddd", ""])

    # Tuples should be ok too
    assert_re_in("ab", ("", "abc", "laskdjf"))
    assert_raises(AssertionError, assert_re_in, "ab$", ("ddd", ""))

    # shouldn't "match" the empty list
    assert_raises(AssertionError, assert_re_in, "", [])
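# For reference alongside test_assert_re_in above: a minimal sketch of the
# semantics that test exercises. This is an illustrative re-implementation,
# not datalad's actual helper (which is provided by datalad's test utilities
# and has a richer signature); the name _assert_re_in_sketch is hypothetical.
import re

def _assert_re_in_sketch(regex, c, match=True):
    """Assert that `regex` matches (match=True) or searches some entry of `c`.

    `c` may be a single string or an iterable of strings; one matching entry
    is sufficient, and an empty iterable never matches.
    """
    entries = [c] if isinstance(c, str) else list(c)
    matcher = re.match if match else re.search
    assert any(matcher(regex, e) for e in entries), \
        "None of %r matched %r" % (entries, regex)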
def test_incorrect_option(opts, err_str):
    # The first line used to be:
    #   stdout, stderr = run_main((sys.argv[0],) + opts,
    #                             expect_stderr=True, exit_code=2)
    # But: what do we expect to be in sys.argv[0] here?
    # It depends on how we invoke the test.
    # - nosetests -s -v datalad/cmdline/tests/test_main.py would result in
    #   sys.argv[0]=='nosetests'
    # - python -m nose -s -v datalad/cmdline/tests/test_main.py would result in
    #   sys.argv[0]=='python -m nose'
    # - python -c "import nose; nose.main()" -s -v datalad/cmdline/tests/test_main.py
    #   would result in sys.argv[0]=='-c'
    # This led to failures whenever sys.argv[0] contained an option that was
    # also defined as a datalad option, and therefore was a 'known_arg' that
    # was checked against its constraints.
    # But sys.argv[0] actually isn't used by main at all. It simply doesn't
    # matter what's in there. The only thing important to pass here is `opts`.
    stdout, stderr = run_main(opts, expect_stderr=True, exit_code=2)
    out = stdout + stderr
    assert_in("usage: ", out)
    assert_re_in(err_str, out, match=False)
def test_logging_to_a_file(dst=None):
    ok_(not exists(dst))

    lgr = LoggerHelper("dataladtest-1").get_initialized_logger(logtarget=dst)
    # the log file is created upon logger initialization, before anything
    # is logged
    ok_(exists(dst))

    msg = "Oh my god, they killed Kenny"
    lgr.error(msg)
    with open(dst) as f:
        lines = f.readlines()
    assert_equal(len(lines), 1, "Read more than a single log line: %s" % lines)
    line = lines[0]
    ok_(msg in line)
    ok_('\033[' not in line,
        msg="There should be no color formatting in log files. Got: %s" % line)
    # verify that the timestamp and level are present in the log line.
    # We do not want to rely on the absence of race conditions around
    # date/time changes, so match just with a regexp.
    # (...)? is added to swallow possible traceback logs
    regex = r"\[ERROR\]"
    if EnsureBool()(dl_cfg.get('datalad.log.timestamp', False)):
        regex = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3} " + regex
    if EnsureBool()(dl_cfg.get('datalad.log.vmem', False)):
        regex += r' RSS/VMS: \S+/\S+( \S+)?\s*'
    regex += r"(\s+\S+\s*)? " + msg
    assert_re_in(regex, line, match=True)

    # Python's logger is ok (although not documented as supported) with
    # accepting non-string messages, which can be str()'ed. We should not puke.
    msg2 = "Kenny is alive"
    lgr.error(RuntimeError(msg2))
    with open(dst) as f:
        assert_in(msg2, f.read())

    # Close all handlers so Windows is happy -- apparently they are not
    # closed fast enough otherwise
    for handler in lgr.handlers:
        handler.close()
    assert_no_open_files(dst)
def test_interface():
    di = Demo()

    import argparse
    parser = argparse.ArgumentParser()

    from datalad.cli.parser import setup_parser_for_interface
    setup_parser_for_interface(parser, di)
    with swallow_outputs() as cmo:
        assert_equal(parser.print_help(), None)
        assert cmo.out
        assert_equal(cmo.err, '')
    args = parser.parse_args(['42', '11', '1', '2', '--demoarg', '23'])
    assert_is(args.demoarg, 23)
    assert_equal(args.demoposarg, [42, 11])
    assert_equal(args.demooptposarg1, 1)
    assert_equal(args.demooptposarg2, 2)

    # wrong type
    with swallow_outputs() as cmo:
        assert_raises(SystemExit, parser.parse_args, ['--demoarg', 'abc'])
        # that is what we dump upon folks atm.
        # TODO: improve reporting of ill-specified options
        assert_re_in(".*invalid constraint:int value:.*", cmo.err, re.DOTALL)

    # missing argument to option
    with swallow_outputs() as cmo:
        assert_raises(SystemExit, parser.parse_args, ['--demoarg'])
        assert_re_in(".*--demoarg: expected one argument", cmo.err, re.DOTALL)

    # missing positional argument
    with swallow_outputs() as cmo:
        assert_raises(SystemExit, parser.parse_args, [''])
        # PY2|PY3
        assert_re_in(
            ".*error: (too few arguments|the following arguments are required: demoposarg)",
            cmo.err, re.DOTALL)
def test_aggregation(path=None):
    with chpwd(path):
        assert_raises(InsufficientArgumentsError, aggregate_metadata, None)
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    # before anything is aggregated we would get nothing and only a log warning
    with swallow_logs(new_level=logging.WARNING) as cml:
        assert_equal(list(query_aggregated_metadata('all', ds, [])), [])
        assert_re_in('.*Found no aggregated metadata.*update', cml.out)
    ds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                  scope='branch')
    subds = ds.create('sub', force=True)
    subds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                     scope='branch')
    subsubds = subds.create('subsub', force=True)
    subsubds.config.add('datalad.metadata.nativetype',
                        'frictionless_datapackage', scope='branch')
    ds.save(recursive=True)
    assert_repo_status(ds.path)
    # aggregate metadata from all subdatasets into any superdataset, including
    # intermediate ones
    res = ds.aggregate_metadata(recursive=True, update_mode='all')
    # we get success reports for both subdatasets and the superdataset,
    # and they get saved
    assert_result_count(res, 3, status='ok', action='aggregate_metadata')
    assert_in_results(res, action='save', status="ok")
    # nice and tidy
    assert_repo_status(ds.path)

    # quick test of the aggregate report
    aggs = ds.metadata(get_aggregates=True)
    # one for each dataset
    assert_result_count(aggs, 3)
    # mother also reports the layout version
    assert_result_count(aggs, 1, path=ds.path, layout_version=1)

    # store clean direct result
    origres = ds.metadata(recursive=True)
    # basic sanity check
    assert_result_count(origres, 6)
    assert_result_count(origres, 3, type='dataset')
    assert_result_count(origres, 3, type='file')  # now that we have annex.key
    # three different IDs
    assert_equal(
        3,
        len(set([s['dsid'] for s in origres if s['type'] == 'dataset'])))
    # and we know about all three datasets
    for name in ('MOTHER_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(
            sum([s['metadata']['frictionless_datapackage']['name'] ==
                 ensure_unicode(name)
                 for s in origres
                 if s['type'] == 'dataset']))

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(
        opj(path, 'clone'), source=ds.path,
        result_xfm='datasets', return_type='item-or-list')
    # ID mechanism works
    assert_equal(ds.id, clone.id)

    # get fresh metadata
    cloneres = clone.metadata()
    # basic sanity check
    assert_result_count(cloneres, 2)
    assert_result_count(cloneres, 1, type='dataset')
    assert_result_count(cloneres, 1, type='file')

    # now loop over the previous results from the direct metadata query of
    # origin and make sure we get the exact same stuff from the clone
    _compare_metadata_helper(origres, clone)

    # now obtain a subdataset in the clone; it should make no difference
    assert_status('ok',
                  clone.install('sub', result_xfm=None, return_type='list'))
    _compare_metadata_helper(origres, clone)

    # test search in search tests, not all over the place
    # query smoke test
    assert_result_count(clone.search('mother', mode='egrep'), 1)
    assert_result_count(clone.search('(?i)MoTHER', mode='egrep'), 1)

    child_res = clone.search('child', mode='egrep')
    assert_result_count(child_res, 2)
    for r in child_res:
        if r['type'] == 'dataset':
            assert_in(
                r['query_matched']['frictionless_datapackage.name'],
                r['metadata']['frictionless_datapackage']['name'])
def test_CapturedException():
    try:
        raise Exception("BOOM")
    except Exception as e:
        captured_exc = CapturedException(e)

    assert_re_in(
        r"BOOM \[test_captured_exception.py:test_CapturedException:[0-9]+\]",
        captured_exc.format_oneline_tb())
    assert_re_in(
        r"^\[.*\]",
        captured_exc.format_oneline_tb(include_str=False))  # only traceback

    try:
        raise NotImplementedError
    except Exception as e:
        captured_exc = CapturedException(e)

    assert_re_in(
        r"NotImplementedError "
        r"\[test_captured_exception.py:test_CapturedException:[0-9]+\]",
        captured_exc.format_oneline_tb())

    def f():
        def f2():
            raise Exception("my bad again")
        try:
            f2()
        except Exception as e:
            # exception chain
            raise RuntimeError("new message") from e

    try:
        f()
    except Exception as e:
        captured_exc = CapturedException(e)

    # default limit: one level:
    estr1 = captured_exc.format_oneline_tb(limit=1)
    estr2 = captured_exc.format_oneline_tb(limit=2)
    # and we can control it via environ/config by default
    try:
        with patch.dict('os.environ', {'DATALAD_EXC_STR_TBLIMIT': '3'}):
            cfg.reload()
            estr3 = captured_exc.format_oneline_tb()
        with patch.dict('os.environ', {}, clear=True):
            cfg.reload()
            estr_ = captured_exc.format_oneline_tb()
    finally:
        cfg.reload()  # make sure we don't have a side effect on other tests

    estr_full = captured_exc.format_oneline_tb(10)

    assert_re_in(
        r"new message \[test_captured_exception.py:test_CapturedException:[0-9]+,"
        r"test_captured_exception.py:f:[0-9]+,"
        r"test_captured_exception.py:f:[0-9]+,"
        r"test_captured_exception.py:f2:[0-9]+\]",
        estr_full)
    assert_re_in(
        r"new message \[test_captured_exception.py:f:[0-9]+,"
        r"test_captured_exception.py:f:[0-9]+,"
        r"test_captured_exception.py:f2:[0-9]+\]",
        estr3)
    assert_re_in(
        r"new message \[test_captured_exception.py:f:[0-9]+,"
        r"test_captured_exception.py:f2:[0-9]+\]",
        estr2)
    assert_re_in(r"new message \[test_captured_exception.py:f2:[0-9]+\]", estr1)
    # default: no limit:
    assert_equal(estr_, estr_full)

    # standard output
    full_display = captured_exc.format_standard().splitlines()

    assert_equal(full_display[0], "Traceback (most recent call last):")
    # traceback points in f and f2 for the first exception, with two lines
    # each (where the line is, and what the line reads):
    assert_true(full_display[1].lstrip().startswith("File"))
    assert_equal(full_display[2].strip(), "f2()")
    assert_true(full_display[3].lstrip().startswith("File"))
    assert_equal(full_display[4].strip(), "raise Exception(\"my bad again\")")
    assert_equal(full_display[5].strip(), "Exception: my bad again")
    assert_equal(
        full_display[7].strip(),
        "The above exception was the direct cause of the following exception:")
    assert_equal(full_display[9], "Traceback (most recent call last):")
    # ...
    assert_equal(full_display[-1].strip(), "RuntimeError: new message")

    # CapturedException.__repr__:
    assert_re_in(r".*test_captured_exception.py:f2:[0-9]+\]$",
                 captured_exc.__repr__())
def test_within_ds_file_search(path=None):
    try:
        import mutagen
    except ImportError:
        raise SkipTest
    ds = Dataset(path).create(force=True)
    # override the default and search for datasets and files for this test
    for m in ('egrep', 'textblob', 'autofield'):
        ds.config.add(
            'datalad.search.index-{}-documenttype'.format(m), 'all',
            scope='branch')
    ds.config.add('datalad.metadata.nativetype', 'audio', scope='branch')
    makedirs(opj(path, 'stim'))
    for src, dst in (('audio.mp3', opj('stim', 'stim1.mp3')),):
        copy(opj(dirname(dirname(__file__)), 'tests', 'data', src),
             opj(path, dst))
    ds.save()
    ok_file_under_git(path, opj('stim', 'stim1.mp3'), annexed=True)
    # If it is not under annex, the addition of metadata below silently does
    # not do anything
    ds.repo.set_metadata(opj('stim', 'stim1.mp3'), init={'importance': 'very'})
    ds.aggregate_metadata()
    assert_repo_status(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('audio',):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys='name', mode='textblob')
        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
annex.importance
annex.key
audio.bitrate
audio.duration(s)
audio.format
audio.music-Genre
audio.music-album
audio.music-artist
audio.music-channels
audio.music-sample_rate
audio.name
audio.tracknumber
datalad_core.id
datalad_core.refcommit
id
parentds
path
type
"""

    # test default behavior while limiting the set of keys reported
    with swallow_outputs() as cmo:
        ds.search([r'\.id', 'artist$'], show_keys='short')
        out_lines = [l for l in cmo.out.split(os.linesep) if l]
        # test that only the matching ones were returned
        assert_equal(
            [l for l in out_lines if not l.startswith(' ')],
            ['audio.music-artist', 'datalad_core.id'])
        # more specific test which would also test formatting
        assert_equal(
            out_lines,
            ['audio.music-artist',
             ' in 1 datasets',
             " has 1 unique values: 'dlartist'",
             'datalad_core.id',
             ' in 1 datasets',
             # we have them sorted
             " has 1 unique values: '%s'" % ds.id
             ])

    with assert_raises(ValueError) as cme:
        ds.search('*wrong')
    assert_re_in(
        r"regular expression '\(\?i\)\*wrong' \(original: '\*wrong'\) is incorrect: ",
        str(cme.value))

    # check generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys='name')
        # it is impossible to assess what is different from that dump
        assert_in(target_out, cmo.out)

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched in (
            ('egrep',
             ':mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # same as above; the leading : is stripped, which indicates
            # "ALL FIELDS"
            ('egrep',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # same as above, but with an AND condition
            # get both matches
            ('egrep',
             ['mp3', 'type:file'],
             opj('stim', 'stim1.mp3'),
             {'type': 'file', 'audio.format': 'mp3'}),
            # case-insensitive search
            ('egrep',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # field selection by expression
            ('egrep',
             r'audio\.+:mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # random keyword query
            ('textblob',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'meta': 'mp3'}),
            # report which field matched with auto-field
            ('autofield',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # XXX the next one is not supported by the current text field
            # analyser, which decomposes the mime type into [mime, audio, mp3]
            # ('autofield',
            #  "'mime:audio/mp3'",
            #  opj('stim', 'stim1.mp3'),
            #  'audio.format', 'mime:audio/mp3'),
            # but this one works
            ('autofield',
             "'mime audio mp3'",
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # TODO extend with more complex queries to test whoosh
            # query language configuration
    ):
        res = ds.search(query, mode=mode, full_record=True)
        assert_result_count(
            res, 1, type='file', path=opj(ds.path, hitpath),
            # each file must report the ID of the dataset it is from; this is
            # critical for discovering related content
            dsid=ds.id)
        # in egrep mode we currently do not search unique values,
        # and the queries above aim at files
        assert_result_count(res, 1 if mode == 'egrep' else 2)
        if mode != 'egrep':
            assert_result_count(
                res, 1, type='dataset', path=ds.path, dsid=ds.id)
        # test the key and specific value of the match
        for matched_key, matched_val in matched.items():
            assert_in(matched_key, res[-1]['query_matched'])
            assert_equal(res[-1]['query_matched'][matched_key], matched_val)

    # test that a suggestion message is logged if there are no hits and the
    # key is a bit off
    with swallow_logs(new_level=logging.INFO) as cml:
        res = ds.search('audio.formats:mp3 audio.bitsrate:1', mode='egrep')
        assert not res
        assert_in('Did you mean any of', cml.out)
        assert_in('audio.format', cml.out)
        assert_in('audio.bitrate', cml.out)
def test_dashh_clone():
    # test -h on a sample command
    stdout, stderr = run_main(['clone', '-h'])
    assert_re_in(r'^Usage: .* clone \[', stdout.splitlines()[0])
    assert_re_in('Use .--help. to get more comprehensive information',
                 stdout.splitlines())
def assert_all_commands_present(out):
    """Reusable helper to assert that all known commands appear in the output
    """
    for cmd in get_all_commands():
        assert_re_in(fr"\b{cmd}\b", out, match=False)
def test_utils_suppress_similar():
    tu = TestUtils()

    # Check the suppression boundary for a straight chain of similar messages.

    # yield test results immediately to make the test run fast
    sleep_dur = 0.0

    def n_foo(number):
        for i in range(number):
            yield dict(action="foo",
                       status="ok",
                       path="path{}".format(i))
            sleep(sleep_dur)

    with _swallow_outputs() as cmo:
        cmo.isatty = lambda: True
        list(tu(9, result_fn=n_foo, result_renderer="default"))
        assert_in("path8", cmo.out)
        assert_not_in("suppressed", cmo.out)

    with _swallow_outputs() as cmo:
        list(tu(10, result_fn=n_foo, result_renderer="default"))
        assert_in("path9", cmo.out)
        assert_not_in("suppressed", cmo.out)

    with _swallow_outputs() as cmo:
        list(tu(11, result_fn=n_foo, result_renderer="default"))
        assert_not_in("path10", cmo.out)
        assert_re_in(r"[^-0-9]1 .* suppressed", cmo.out, match=False)

    with _swallow_outputs() as cmo:
        # for this one test, yield results slightly slower than 2Hz
        # so that we can see each individual suppression message
        # and do not get caught by the rate limiter
        sleep_dur = 0.51
        list(tu(13, result_fn=n_foo, result_renderer="default"))
        assert_not_in("path10", cmo.out)
        # We see an update for each result.
        assert_re_in(r"1 .* suppressed", cmo.out, match=False)
        assert_re_in(r"2 .* suppressed", cmo.out, match=False)
        assert_re_in(r"3 .* suppressed", cmo.out, match=False)

    # make the tests run fast again
    sleep_dur = 0.0

    with _swallow_outputs(isatty=False) as cmo:
        list(tu(11, result_fn=n_foo, result_renderer="default"))
        assert_in("path10", cmo.out)

    # Check a chain of similar messages, split in half by a distinct one.

    def n_foo_split_by_a_bar(number):
        half = number // 2 - 1
        for i in range(number):
            yield dict(action="foo",
                       status="ok",
                       path="path{}".format(i))
            if i == half:
                yield dict(action="bar",
                           status="ok",
                           path="path")

    with _swallow_outputs() as cmo:
        list(tu(20, result_fn=n_foo_split_by_a_bar,
                result_renderer="default"))
        assert_in("path10", cmo.out)
        assert_in("path19", cmo.out)
        assert_not_in("suppressed", cmo.out)

    with _swallow_outputs() as cmo:
        list(tu(21, result_fn=n_foo_split_by_a_bar,
                result_renderer="default"))
        assert_in("path10", cmo.out)
        assert_not_in("path20", cmo.out)
        assert_re_in("[^-0-9]1 .* suppressed", cmo.out, match=False)