def test_datetime_transform(self):
    """date-time strings are normalized and integers parse per the configured mode."""
    schema = {'type': 'string', 'format': 'date-time'}
    raw = '2017-01-01T00:00:00Z'
    normalized = '2017-01-01T00:00:00.000000Z'

    # String input is normalized regardless of integer-parsing mode.
    self.assertEqual(
        normalized,
        transform(raw, schema, NO_INTEGER_DATETIME_PARSING))

    # Integers are interpreted as unix seconds in this mode.
    self.assertEqual(
        '1970-01-02T00:00:00.000000Z',
        transform(86400, schema, UNIX_SECONDS_INTEGER_DATETIME_PARSING))
    self.assertEqual(
        normalized,
        transform(raw, schema, UNIX_SECONDS_INTEGER_DATETIME_PARSING))

    # Same integer, interpreted as unix milliseconds.
    self.assertEqual(
        '1970-01-01T00:01:26.400000Z',
        transform(86400, schema, UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING))
    self.assertEqual(
        normalized,
        transform(raw, schema, UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING))

    # Unparseable values come back as None from the low-level helper.
    trans = Transformer(NO_INTEGER_DATETIME_PARSING)
    self.assertIsNone(trans._transform_datetime('cat'))
    self.assertIsNone(trans._transform_datetime(0))
    trans.integer_datetime_fmt = UNIX_SECONDS_INTEGER_DATETIME_PARSING
    self.assertIsNone(trans._transform_datetime('cat'))
def test_null_object_transform(self):
    """Nullable object properties pass through both None and empty dicts."""
    schema = {
        'type': 'object',
        'properties': {
            'addrs': {
                'type': ['null', 'object'],
                'properties': {'city': {'type': 'string'}},
            },
        },
    }

    for payload in ({'addrs': None}, {'addrs': {}}):
        self.assertDictEqual(payload, transform(payload, schema))
def test_multi_type_array_transform(self):
    """A multi-type schema transforms arrays item-wise and leaves integers alone."""
    schema = {
        'type': ['null', 'array', 'integer'],
        'items': {'type': 'date-time', 'format': 'date-time'},
    }

    # Array branch: each item is normalized as a date-time.
    self.assertEqual(
        ['2017-01-01T00:00:00.000000Z'],
        transform(['2017-01-01'], schema))

    # Integer branch: value passes through untouched.
    self.assertEqual(23, transform(23, schema))
def test_multi_type_object_transform(self):
    """A multi-type schema transforms dict properties and leaves strings alone."""
    schema = {
        'type': ['null', 'object', 'string'],
        'properties': {'whatever': {'type': 'date-time', 'format': 'date-time'}},
    }

    # Object branch: the property is normalized as a date-time.
    self.assertDictEqual(
        {'whatever': '2017-01-01T00:00:00.000000Z'},
        transform({'whatever': '2017-01-01'}, schema))

    # String branch: value passes through untouched.
    self.assertEqual('justastring', transform('justastring', schema))
def test_anyof_datetime(self):
    """anyOf[null, date-time string] normalizes strings and admits None."""
    schema = {
        'anyOf': [
            {'type': 'null'},
            {'format': 'date-time', 'type': 'string'},
        ]
    }

    self.assertEqual(
        '2016-03-10T18:47:20.000000Z',
        transform('2016-03-10T18:47:20Z', schema))
    self.assertIsNone(transform(None, schema))
def test_multi_type_array_transform(self):
    """Array items are date-time normalized; plain integers pass through."""
    schema = {
        'type': ['null', 'array', 'integer'],
        'items': {'type': 'date-time', 'format': 'date-time'},
    }

    cases = [
        (['2017-01-01'], ['2017-01-01T00:00:00.000000Z']),
        (23, 23),
    ]
    for given, wanted in cases:
        self.assertEqual(wanted, transform(given, schema))
def test_nested_transform(self):
    """Integer coercion applies to properties nested under an array of objects."""
    item_schema = {
        'type': 'object',
        'properties': {
            'addr1': {'type': 'string'},
            'city': {'type': 'string'},
            'state': {'type': 'string'},
            'amount': {'type': 'integer'},
        },
    }
    schema = {
        'type': 'object',
        'properties': {'addrs': {'type': 'array', 'items': item_schema}},
    }

    payload = {'addrs': [{'amount': '123'}, {'amount': '456'}]}
    self.assertDictEqual(
        {'addrs': [{'amount': 123}, {'amount': 456}]},
        transform(payload, schema))
def test_pattern_properties_match_multiple(self):
    """Distinct patternProperties route keys to the correct sub-schema."""
    schema = {
        'type': 'object',
        'patternProperties': {
            # Keys ending in "cost" are numbers; everything else is a string.
            '.+?cost': {'type': 'number'},
            '.+(?<!cost)$': {'type': 'string'},
        },
    }

    payload = {'name': 'chicken', 'unit_cost': 1.45, 'SKU': '123456'}
    self.assertEqual(dict(payload), transform(payload, schema))
def test_nested_transform(self):
    """String amounts inside nested array items are coerced to integers."""
    schema = {
        'type': 'object',
        'properties': {
            'addrs': {
                'type': 'array',
                'items': {
                    'type': 'object',
                    'properties': {
                        'addr1': {'type': 'string'},
                        'city': {'type': 'string'},
                        'state': {'type': 'string'},
                        'amount': {'type': 'integer'},
                    },
                },
            },
        },
    }

    given = {'addrs': [{'amount': '123'}, {'amount': '456'}]}
    wanted = {'addrs': [{'amount': 123}, {'amount': 456}]}
    self.assertDictEqual(wanted, transform(given, schema))
def get_all_pull_requests(stream, config, state):
    '''
    https://developer.github.com/v3/pulls/#list-pull-requests

    Emits one record per pull request updated after the stored bookmark and
    advances ``state[PULL_REQUESTS]`` to the newest ``updated_at`` seen.
    Returns the (mutated) state dict.
    '''
    query = urllib.parse.urlencode({
        'state': 'all',
        'sort': 'updated',
        'direction': 'asc'
    })
    repo = config[REPOSITORY]
    pr_state = state.get(PULL_REQUESTS)
    pr_state = dateutil.parser.parse(pr_state) if pr_state else _MIN_TS

    # BUG FIX: previously the bookmark was taken from the loop variable `pr`
    # leaking out of the for-loop (with a NameError fallback for the empty
    # case). Track the running max explicitly instead, matching the pattern
    # used by get_all_issues / get_all_commits / get_all_stargazers.
    ts_state = pr_state
    with metrics.record_counter(PULL_REQUESTS) as counter:
        url = 'https://api.github.com/repos/{}/pulls?{}'.format(repo, query)
        for response in authed_get_all_pages(PULL_REQUESTS, url):
            pull_requests = response.json()
            extraction_time = singer.utils.now()
            for pr in pull_requests:
                updated_at = dateutil.parser.parse(pr['updated_at'])
                if updated_at > pr_state:
                    rec = singer.transform(pr, stream)
                    singer.write_record(PULL_REQUESTS, rec,
                                        time_extracted=extraction_time)
                    counter.increment()
                ts_state = max(ts_state, updated_at)

    # Only write a bookmark once we have actually seen a timestamp; an empty
    # repo with no prior state leaves the bookmark untouched.
    if ts_state > _MIN_TS:
        state[PULL_REQUESTS] = ts_state.isoformat()
    return state
def _sync(self, ctx, path=None, product_id=None):
    """Page through the endpoint, transforming and handing batches to on_batch_complete.

    Stops when on_batch_complete returns falsy or a page comes back empty.
    """
    if path is None:
        path = self.path

    # Product-scoped streams keep a per-product bookmark.
    bookmark_name = ('product_{}.since_date'.format(product_id)
                     if product_id else 'since_date')
    ctx.update_start_date_bookmark([self.tap_stream_id, bookmark_name])

    schema = ctx.catalog.get_stream(self.tap_stream_id).schema.to_dict()

    page = 1
    while True:
        request_opts = {
            "path": path,
            "params": self.get_params(ctx, page),
        }
        response = ctx.client.GET(self.version, request_opts, self.tap_stream_id)
        batch = [transform(row, schema) for row in self.format_response(response)]
        if not self.on_batch_complete(ctx, batch, product_id):
            break
        if not batch:
            break
        page += 1
def get_all_issues(stream, config, state):
    '''
    https://developer.github.com/v3/issues/#list-issues-for-a-repository

    Emits one record per issue and advances ``state[ISSUES]`` to the newest
    ``updated_at`` seen. Returns the (mutated) state dict.
    '''
    repo_path = config[REPOSITORY]
    params = {'sort': 'updated', 'direction': 'asc'}
    if ISSUES in state and state[ISSUES] is not None:
        params['since'] = format(state[ISSUES])
    query = urllib.parse.urlencode(params)
    url = 'https://api.github.com/repos/{}/issues?{}'.format(repo_path, query)

    # BUG FIX: the original computed `ts_state = max(updated_at, _MIN_TS)`
    # inside the loop, overwriting the value each iteration instead of
    # accumulating a running max — the bookmark was simply the *last* issue's
    # timestamp, not the newest. Initialize once and fold with max().
    ts_state = _MIN_TS
    with metrics.record_counter(ISSUES) as counter:
        for response in authed_get_all_pages(ISSUES, url):
            issues = response.json()
            extraction_time = singer.utils.now()
            for issue in issues:
                ts_state = max(dateutil.parser.parse(issue['updated_at']),
                               ts_state)
                rec = singer.transform(issue, stream)
                singer.write_record(ISSUES, rec,
                                    time_extracted=extraction_time)
                counter.increment()

    # No issues seen: keep the existing bookmark (previously handled via an
    # UnboundLocalError fallback).
    if ts_state > _MIN_TS:
        state[ISSUES] = ts_state.isoformat()
    return state
def get_all_commits(stream, config, state):
    '''
    https://developer.github.com/v3/repos/commits/#list-commits-on-a-repository

    Emits one record per commit and advances ``state[COMMITS]`` to the newest
    author date seen. Returns the (mutated) state dict.
    '''
    repo_path = config[REPOSITORY]
    query_string = ''
    if COMMITS in state and state[COMMITS] is not None:
        query_string = '?since={}'.format(state[COMMITS])
    ts_state = _MIN_TS
    url = 'https://api.github.com/repos/{}/commits{}'.format(
        repo_path, query_string)
    with metrics.record_counter(COMMITS) as counter:
        for response in authed_get_all_pages(COMMITS, url):
            commits = response.json()
            extraction_time = singer.utils.now()
            for commit in commits:
                commit_date = dateutil.parser.parse(
                    commit['commit']['author']['date'])
                ts_state = max(commit_date, ts_state)
                rec = singer.transform(commit, stream)
                singer.write_record(COMMITS, rec,
                                    time_extracted=extraction_time)
                counter.increment()

    # BUG FIX: since ts_state is always initialized above, the old
    # UnboundLocalError fallback never fired, so an empty response
    # overwrote the bookmark with clean_tz(_MIN_TS) — moving the bookmark
    # *backwards*. Only advance the bookmark when commits were seen.
    if ts_state > _MIN_TS:
        state[COMMITS] = clean_tz(ts_state)
    return state
def get_all_stargazers(stream, config, state):
    '''
    https://developer.github.com/v3/activity/starring/#list-stargazers

    Emits one record per stargazer (with a flattened ``user_id``) and advances
    ``state[STARGAZERS]`` to the newest ``starred_at`` seen. Returns the
    (mutated) state dict.
    '''
    repo_path = config[REPOSITORY]
    params = {'sort': 'updated', 'direction': 'asc'}
    if state.get(STARGAZERS):
        params['since'] = state[STARGAZERS]
    query = urllib.parse.urlencode(params)
    # The star+json media type is required to get starred_at timestamps.
    stargazers_headers = {'Accept': 'application/vnd.github.v3.star+json'}
    url = 'https://api.github.com/repos/{}/stargazers?{}'.format(
        repo_path, query)
    ts_state = _MIN_TS
    with metrics.record_counter(STARGAZERS) as counter:
        for response in authed_get_all_pages(STARGAZERS, url,
                                             stargazers_headers):
            stargazers = response.json()
            extraction_time = singer.utils.now()
            for stargazer in stargazers:
                starred_at = dateutil.parser.parse(stargazer['starred_at'])
                ts_state = max(starred_at, ts_state)
                rec = singer.transform(stargazer, stream)
                rec['user_id'] = rec['user']['id']
                singer.write_record(STARGAZERS, rec,
                                    time_extracted=extraction_time)
                counter.increment()

    # BUG FIX: ts_state is always bound, so the old UnboundLocalError
    # fallback never fired and an empty response reset the bookmark to
    # _MIN_TS.isoformat(). Only advance the bookmark when stars were seen.
    if ts_state > _MIN_TS:
        state[STARGAZERS] = ts_state.isoformat()
    return state
def test_multi_type_object_transform(self):
    """Object branch normalizes the date-time property; string branch is identity."""
    schema = {
        'type': ['null', 'object', 'string'],
        'properties': {
            'whatever': {'type': 'date-time', 'format': 'date-time'},
        },
    }

    self.assertDictEqual(
        {'whatever': '2017-01-01T00:00:00.000000Z'},
        transform({'whatever': '2017-01-01'}, schema))

    untouched = 'justastring'
    self.assertEqual(untouched, transform(untouched, schema))
def test_null_object_transform(self):
    """Both None and {} are valid values for a ['null', 'object'] property."""
    schema = {
        'type': 'object',
        'properties': {
            'addrs': {
                'type': ['null', 'object'],
                'properties': {'city': {'type': 'string'}},
            },
        },
    }

    none_data = {'addrs': None}
    self.assertDictEqual(none_data, transform(none_data, schema))

    empty_data = {'addrs': {}}
    self.assertDictEqual(empty_data, transform(empty_data, schema))
def test_drops_fields_which_are_unsupported(self):
    """Fields whose metadata marks inclusion as unsupported are removed."""
    schema = {'type': 'object', 'properties': {'name': {'type': 'string'}}}
    field_metadata = {('properties', 'name'): {'inclusion': 'unsupported'}}
    record = {'name': 'chicken'}

    self.assertEqual(
        {},
        transform(record, schema, NO_INTEGER_DATETIME_PARSING,
                  metadata=field_metadata))
def test_keeps_fields_without_metadata(self):
    """A field with no metadata entry of its own survives the transform."""
    schema = {'type': 'object', 'properties': {'name': {'type': 'string'}}}
    # Metadata exists only for an unrelated field ('age'), not for 'name'.
    field_metadata = {('properties', 'age'): {'inclusion': 'automatic'}}
    record = {'name': 'chicken'}

    self.assertEqual(
        {'name': 'chicken'},
        transform(record, schema, NO_INTEGER_DATETIME_PARSING,
                  metadata=field_metadata))
def test_keeps_selected_data_from_dicts(self):
    """Fields explicitly marked selected=True are retained."""
    schema = {'type': 'object', 'properties': {'name': {'type': 'string'}}}
    field_metadata = {('properties', 'name'): {'selected': True}}
    record = {'name': 'chicken'}

    self.assertEqual(
        {'name': 'chicken'},
        transform(record, schema, NO_INTEGER_DATETIME_PARSING,
                  metadata=field_metadata))
def test_keeps_selected_data_from_dicts(self):
    """selected=True metadata keeps the field in the output."""
    schema = {"type": "object", "properties": {"name": {"type": "string"}}}
    metadata = {("properties", "name"): {"selected": True}}
    dict_value = {"name": "chicken"}
    result = transform(dict_value, schema, NO_INTEGER_DATETIME_PARSING,
                       metadata=metadata)
    self.assertEqual({"name": "chicken"}, result)
def test_drops_fields_which_are_unsupported(self):
    """inclusion='unsupported' metadata removes the field from the output."""
    schema = {"type": "object", "properties": {"name": {"type": "string"}}}
    metadata = {("properties", "name"): {"inclusion": "unsupported"}}
    dict_value = {"name": "chicken"}
    result = transform(dict_value, schema, NO_INTEGER_DATETIME_PARSING,
                       metadata=metadata)
    self.assertEqual({}, result)
def test_keeps_fields_without_metadata(self):
    """A field not mentioned in metadata is passed through unchanged."""
    schema = {"type": "object", "properties": {"name": {"type": "string"}}}
    # Only 'age' has metadata; 'name' does not.
    metadata = {("properties", "age"): {"inclusion": "automatic"}}
    dict_value = {"name": "chicken"}
    result = transform(dict_value, schema, NO_INTEGER_DATETIME_PARSING,
                       metadata=metadata)
    self.assertEqual({"name": "chicken"}, result)
def test_drops_no_data_when_not_dict(self):
    """Metadata-based dropping only applies to dicts; scalars pass through."""
    schema = {'type': 'string'}
    string_value = 'hello'
    self.assertEqual(
        string_value,
        transform(string_value, schema, NO_INTEGER_DATETIME_PARSING,
                  metadata={}))
def test_drops_no_data_when_not_dict(self):
    """Non-dict values are untouched even when metadata is supplied."""
    schema = {"type": "string"}
    metadata = {}
    value = "hello"
    result = transform(value, schema, NO_INTEGER_DATETIME_PARSING,
                       metadata=metadata)
    self.assertEqual(value, result)
def write_record(record, state, stream, replication_keys):
    """Transform and emit one record, then bookmark and persist state.

    The bookmark value is extracted from the *raw* record via the
    replication key path; the last replication key is used as the
    bookmark's key name.
    """
    stream_name = stream.tap_stream_id

    # Emit the schema-transformed record.
    singer.write_record(stream_name,
                        singer.transform(record, stream.schema.to_dict()))

    # Advance and persist the bookmark.
    bookmark_value = get_replication_value(record, replication_keys)
    state = singer.write_bookmark(state=state,
                                  tap_stream_id=stream_name,
                                  key=replication_keys[-1],
                                  val=bookmark_value)
    singer.write_state(state)
def sync_file_ids(file_ids, client, state, stream, api, counter):
    """Stream and emit the rows of each exported file in ``file_ids``.

    Pops IDs off ``file_ids`` one at a time, persisting the shrinking list in
    state after each file so an interrupted sync can resume mid-window.
    When the stream has a replication key, rows older than the current
    bookmark are skipped and the bookmark is advanced per emitted row.
    Returns the (mutated) counter.
    """
    # Resolve the starting bookmark only for incremental streams.
    if stream.get("replication_key"):
        start_date = state["bookmarks"][stream["tap_stream_id"]][
            stream["replication_key"]]
    else:
        start_date = None
    while file_ids:
        file_id = file_ids.pop(0)
        try:
            lines = api.stream_file(client, file_id)
        except ApiException as ex:
            # If the file has been deleted, write state with "file_ids" removed and re-raise.
            # Don't advance the bookmark until all files in the window have been synced.
            if ex.resp.status_code == 404:
                state["bookmarks"][stream["tap_stream_id"]].pop(
                    "file_ids", None)
                singer.write_state(state)
                raise Exception((
                    "File ID {} has been deleted, making the sync window invalid. "
                    "Removing partially exported files from state and will resume from bookmark on the next extraction."
                ).format(file_id)) from ex
            raise
        # First line of each file is the CSV header.
        header = parse_header_line(next(lines), stream["tap_stream_id"])
        for line in lines:
            if not line:
                continue
            parsed_line = parse_csv_line(line)
            row = dict(zip(header, parsed_line))
            record = transform(row, stream['schema'])
            if stream.get("replication_key"):
                bookmark = record.get(stream["replication_key"])
                if not bookmark:
                    # There's a chance we get back a bad record here, and we don't want to null the bookmark
                    continue
                # Skip rows older than the bookmark we started from.
                if bookmark and bookmark < start_date:
                    continue
                singer.write_record(stream["tap_stream_id"], record)
                # Advance the bookmark as each row is emitted.
                state["bookmarks"][stream["tap_stream_id"]][
                    stream["replication_key"]] = bookmark
                singer.write_state(state)
            else:
                singer.write_record(stream["tap_stream_id"], record)
            counter.increment()
        # Persist the remaining (shrinking) file list so a restart resumes here.
        state["bookmarks"][stream["tap_stream_id"]]["file_ids"] = file_ids
        singer.write_state(state)
    # Window fully synced: clear the file list from state.
    state["bookmarks"][stream["tap_stream_id"]]["file_ids"] = None
    singer.write_state(state)
    return counter
def test_pattern_properties_match(self):
    """A catch-all patternProperties schema keeps every string key."""
    schema = {
        'type': 'object',
        'patternProperties': {'.+': {'type': 'string'}},
    }

    payload = {'name': 'chicken', 'unit_cost': '1.45', 'SKU': '123456'}
    self.assertEqual(dict(payload), transform(payload, schema))
def test_pattern_properties_match(self):
    """All keys match the '.+' pattern and are passed through as strings."""
    schema = {
        "type": "object",
        "patternProperties": {
            ".+": {"type": "string"},
        },
    }
    dict_value = {"name": "chicken", "unit_cost": "1.45", "SKU": "123456"}
    self.assertEqual(dict(dict_value), transform(dict_value, schema))
def test_drops_nested_object_fields_which_are_unselected(self):
    """selected=False on a nested property removes just that property."""
    addr_properties = {
        'addr1': {'type': 'string'},
        'city': {'type': 'string'},
        'state': {'type': 'string'},
        'amount': {'type': 'integer'},
    }
    schema = {
        'type': 'object',
        'properties': {
            'addr': {'type': 'object', 'properties': addr_properties},
        },
    }
    # Parent is selected; only the nested 'amount' is deselected.
    metadata = {
        ('properties', 'addr'): {'selected': True},
        ('properties', 'addr', 'properties', 'amount'): {'selected': False},
    }

    given = {
        'addr': {
            'addr1': 'address_1',
            'city': 'city_1',
            'state': 'state_1',
            'amount': '123',
        },
    }
    wanted = {
        'addr': {
            'addr1': 'address_1',
            'city': 'city_1',
            'state': 'state_1',
        },
    }
    self.assertDictEqual(
        wanted,
        transform(given, schema, NO_INTEGER_DATETIME_PARSING,
                  metadata=metadata))
def test_pattern_properties_match_multiple(self):
    """Keys are dispatched to whichever of two patterns they match."""
    schema = {
        "type": "object",
        "patternProperties": {
            # "*cost" keys must be numbers; all other keys must be strings.
            ".+?cost": {"type": "number"},
            ".+(?<!cost)$": {"type": "string"},
        },
    }
    dict_value = {"name": "chicken", "unit_cost": 1.45, "SKU": "123456"}
    self.assertEqual(dict(dict_value), transform(dict_value, schema))