from unittest.mock import patch

from etl import pipe
from etl.extractors import EchoExtractor, HttpJSONExtractor
from etl.loaders import StdoutLoader


def test_pipeline(self):
    data = [2, 4, 6]
    with patch('requests.request') as mock_request:
        mock_request.return_value.status_code = 200
        mock_request.return_value.json.return_value = data

        class IncrementTransformer:
            def __init__(self, by):
                self.by = by

            def __call__(self, record):
                return record + self.by

        extract_stream = pipe.instream(
            extractor=HttpJSONExtractor,
            extractor_config={'url': 'http://blah.com'})
        transformer_stream1 = pipe.midstream(
            transformer=IncrementTransformer,
            transformer_config={'by': 3})
        transformer_stream2 = pipe.midstream(
            transformer=IncrementTransformer,
            transformer_config={'by': 1})
        loader_stream = pipe.outstream(StdoutLoader)
        pipe.flow(extract_stream, loader_stream,
                  transformer_stream1, transformer_stream2)


def test_extractor_http_json(self):
    data = [{'foo': 'bar'}]
    with patch('requests.request') as mock_request:
        mock_request.return_value.status_code = 200
        mock_request.return_value.json.return_value = data
        instream = pipe.instream(
            extractor=HttpJSONExtractor,
            extractor_config={'url': 'http://blah.com'})
        for i, o in zip(data, instream):
            assert i == o


def test_extractor(self):
    # Use the same list for the expected data and the extractor config so
    # every echoed record is compared, not just the first one.
    data = [{'foo': 'bar'}, {'lorem': 'ipsum'}, {45: 193}]
    instream = pipe.instream(extractor=EchoExtractor,
                             extractor_config={'data': data})
    for i, o in zip(data, instream):
        assert i == o
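# The tests above rely on the extractor contract: an extractor is a callable
# class that instream instantiates with extractor_config and then iterates
# over. The sketch below illustrates what an EchoExtractor-style extractor
# could look like; it is an illustration of the contract, not the library's
# actual implementation, and the exact hand-off between instream and the
# extractor is assumed.
class SketchEchoExtractor:
    def __init__(self, data):
        self.data = data

    def __call__(self):
        # Hand records over to instream as an iterator, one at a time.
        return iter(self.data)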
from etl import pipe
from etl.extractors import HttpJSONExtractor
from etl.loaders import CSVLoader
import os

# Initialise instream that fetches data from an API.
#
# instream by itself doesn't fetch data; it needs an extractor to do that.
# Extractors are callable classes that hold the logic of reading data from a
# source and handing it over to instream in the form of an iterator.
#
# Our source data is a JSON list of users from
# https://jsonplaceholder.typicode.com/users.
# etl-pipeline has a default extractor for reading JSON from a REST API:
# HttpJSONExtractor. To initialise instream, we pass HttpJSONExtractor and
# provide the parameters needed to initialise it.
data_source_api = 'https://jsonplaceholder.typicode.com/users'
instream = pipe.instream(extractor=HttpJSONExtractor,
                         extractor_config={'url': data_source_api})

# Initialise outstream to load data coming from instream into a CSV file.
#
# Just like instream, outstream doesn't load data by itself; it needs a loader
# to do that. Loaders are callable classes that hold the logic of loading data
# into storage. outstream provides loaders with data, one record at a time.
#
# We will load the incoming data into a CSV file.
# etl-pipeline has a default loader for loading data into a CSV file.
# To initialise outstream, we pass the CSVLoader class and provide the
# parameters needed to initialise it.
filepath = "%s/simple_transfer.csv" % os.path.dirname(__file__)
headers = [
    'id', 'name', 'username', 'email', 'address', 'phone', 'website', 'company'
]
# The loader_config keys here are assumed by analogy with CSVExtractor's
# 'csv_file_path' config; check the CSVLoader signature in your version of
# etl-pipeline.
outstream = pipe.outstream(loader=CSVLoader,
                           loader_config={'csv_file_path': filepath,
                                          'headers': headers})
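# Finally, run the pipeline. pipe.flow() takes the instream and outstream
# (plus any midstreams; none are needed here) and moves records from source
# to sink, matching the pipe.flow(instream, outstream, *midstreams) call shown
# in the test suite above.
pipe.flow(instream, outstream)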
from etl import pipe
from etl.extractors import CSVExtractor
from etl.transformers import CsvToDictTransformer
from etl.loaders import MongodbLoader
import os

# Initialise instream that fetches data from a local CSV file.
#
# instream by itself doesn't fetch data; it needs an extractor to do that.
# Extractors are callable classes that hold the logic of reading data from a
# source and handing it over to instream in the form of an iterator.
#
# Our source data is a CSV list of users in a local CSV file.
# etl-pipeline has a default extractor for reading CSV files.
# To initialise instream, we pass CSVExtractor and provide the parameters
# needed to initialise it.
source_filepath = "%s/csv_to_mongo_source.csv" % os.path.dirname(__file__)
instream = pipe.instream(extractor=CSVExtractor,
                         extractor_config={'csv_file_path': source_filepath})

# Initialise midstream that transforms a CSV record into a dict.
#
# midstream by itself doesn't transform data; it needs a transformer to do
# that. Transformers are callable classes that hold the logic of transforming
# incoming data and handing it over to midstream.
#
# etl-pipeline has a default transformer for transforming a CSV record into a
# dict. To initialise midstream, we pass CsvToDictTransformer and provide the
# parameters needed to initialise it.
headers = ['id', 'first_name', 'last_name', 'email', 'gender', 'ip_address']
midstream = pipe.midstream(transformer=CsvToDictTransformer,
                           transformer_config={'headers': headers})

# Initialise outstream to load data coming from instream into a MongoDB
# collection.
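# A sketch of the remaining steps. The MongodbLoader configuration is not
# shown above, so the connection/database/collection keys below are assumed,
# not confirmed against the library; check the MongodbLoader signature in your
# version of etl-pipeline.
outstream = pipe.outstream(loader=MongodbLoader,
                           loader_config={
                               'connection_uri': 'mongodb://localhost:27017',  # assumed key
                               'database': 'etl_demo',                         # assumed key
                               'collection': 'users',                          # assumed key
                           })

# Run the pipeline: records flow instream -> midstream -> outstream.
# pipe.flow(instream, outstream, *midstreams) matches the call in the test
# suite above.
pipe.flow(instream, outstream, midstream)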