def test_input_stream(self):
    """Check that a ``repo://`` url resolves to a readable input stream.

    A ``FakeTask`` is pushed onto the task context before the lookup and is
    popped again in a ``finally`` clause, so the pushed task is removed even
    when the lookup raises or the assertion fails.
    """
    task.push(FakeTask())
    try:
        input_stream = datasources.input_stream_for(
            None, 0, 'repo://dir1/doc1.txt', None)
        # The lookup result is indexed: its first element is advanced once
        # and the value it yields is compared with the expected text.
        eq_('Hi mom!', input_stream[0].next())
    finally:
        task.pop()
def test_input_stream(self):
    """Check that a ``repo://`` url resolves to a readable input stream.

    A ``FakeTask`` is pushed onto the task context before the lookup and is
    popped again in a ``finally`` clause, so the pushed task is removed even
    when the lookup raises or the assertion fails.
    """
    task.push(FakeTask())
    try:
        input_stream = datasources.input_stream_for(
            None, 0, 'repo://dir1/doc1.txt', None)
        # The stream is advanced once; the object it yields is read and the
        # result compared with the expected text.
        eq_('Hi mom!', input_stream.next().read())
    finally:
        task.pop()
def test_http(self):
    """Exercise source lookup and stream creation for an ``http://`` url.

    The url must resolve to an ``HTTPSource``, the source must report exactly
    one segment for the requested window, and asking for an input stream over
    that segment must complete without raising.
    """
    target = 'http://google.com/'
    http_source = datasources.source_for(target)
    assert isinstance(http_source, HTTPSource)

    window_start = datetime(2011,5,31)
    window_end = datetime(2011,6,1)
    segments = http_source.segment_between(window_start, window_end)
    eq_(len(segments),1)

    # The returned stream is not inspected; only the call itself is exercised.
    stream_params = Params()
    input_stream = datasources.input_stream_for(None, None, segments[0], stream_params)
def test_http(self):
    """Verify the HTTP source yields one segment that can be opened.

    Steps: resolve the url to a source (which must be an ``HTTPSource``),
    request the segments between two dates (exactly one is expected), then
    request an input stream for that segment.
    """
    site = 'http://google.com/'
    resolved = datasources.source_for(site)
    assert isinstance(resolved, HTTPSource)

    segment_urls = resolved.segment_between(
        datetime(2011, 5, 31),
        datetime(2011, 6, 1),
    )
    eq_(len(segment_urls), 1)

    # No assertion is made on the stream; the call only has to succeed.
    default_params = Params()
    input_stream = datasources.input_stream_for(
        None, None, segment_urls[0], default_params)
def test_look_order(self):
    """
    An unregistered url scheme / mimetype pair must make
    input_stream_for return None.
    """
    unregistered = Params()
    unregistered.content_type = "application/not-registered"
    bogus_url = "bogus-scheme://example.com/foo"

    result = datasources.input_stream_for(None, None, bogus_url, unregistered)
    eq_(result, None)
def map_input_stream(stream, size, url, params): """ Looks up an input stream if one is registered, if not falls back to disco's defaults. """ # achtung! warning! when this function is called by the disco # node the globals in this module will no longer be visible # hence why we access everything through datasources from disco.util import schemesplit import disco.func from triv.io import datasources from triv.io.task import task # Note: Task is a global set by disco, but not necsarrily seen by other object, # we push it onto the context stack which will allow it to be imported by our # modules that need it try: task.push(Task) except NameError: # it's a test pass input_stream = datasources.input_stream_for(stream, size, url, params) if not input_stream: # we don't handle the given url, see if vanilla disco moduels can... try: # this is normally cleared when we're done iterating task.pop() except IndexError: pass input_stream = disco.func.map_input_stream(stream,size,url,params) # same code in classic/worker... if isinstance(input_stream, tuple): if len(input_stream) == 3: input_stream, size, url = input_stream else: input_stream, url = input_stream if hasattr(params, 'content_type'): input_stream = datasources.reader_for_mimetype(params.content_type)(input_stream,size,url,params) print "using input stream {}".format(input_stream) return input_stream
def map_input_stream(stream, size, url, params):
    """Resolve an input stream through triv.io, deferring to disco if needed.

    ``datasources`` is loaded and asked for a stream for ``url``. A truthy
    result is returned as is; otherwise the pushed task is popped again and
    disco's own ``map_input_stream`` result is returned instead.
    """
    from disco.util import schemesplit
    import disco.func
    from triv.io import datasources, task

    datasources.load()

    # Note: Task is a global set by disco; pushing it onto the context stack
    # allows it to be imported by the modules that need it.
    task.push(Task)

    handled = datasources.input_stream_for(stream, size, url, params)
    if not handled:
        # We don't handle the given url, so see if vanilla disco modules can.
        # The pushed task is normally cleared when we're done iterating;
        # here it is cleared right away.
        task.pop()
        return disco.func.map_input_stream(stream,size,url,params)

    return handled
def map_input_stream(stream, size, url, params):
    """Look up an input stream for ``url``, falling back to disco's default.

    After ``datasources.load()`` the task is pushed and ``datasources`` is
    queried. If it yields a truthy stream that stream is returned; if not,
    the task is popped and the call is delegated to
    ``disco.func.map_input_stream`` with the same arguments.
    """
    from disco.util import schemesplit
    import disco.func
    from triv.io import datasources, task

    datasources.load()

    # Note: Task is a global set by disco. It goes onto the context stack so
    # that the modules that need it are able to import it.
    task.push(Task)

    found = datasources.input_stream_for(stream, size, url, params)
    if found:
        return found

    # We don't handle the given url; see if vanilla disco modules can.
    # The task is normally cleared when we're done iterating.
    task.pop()
    return disco.func.map_input_stream(stream, size, url, params)
def __test_warc_mime_type(self):
    """Request an input stream for the first segment with default params.

    NOTE(review): ``segment`` is not bound inside this method, so it has to
    be supplied by an enclosing scope -- confirm where it comes from. The
    returned stream is not checked.
    """
    default_params = Params()
    input_stream = datasources.input_stream_for(
        None, None, segment[0], default_params)
def __test_mock_url_stream(self):
    """Check the first record read from the stream for the first url.

    NOTE(review): ``urls`` is not bound inside this method, so it has to be
    supplied by an enclosing scope -- confirm where it comes from.
    """
    stream = datasources.input_stream_for(None, 0, urls[0], None)
    first_record = stream.next()
    expected = (0, ['1', '2', '3'])
    self.assertSequenceEqual(first_record, expected)