def ingest(event) -> Union[Data, Response]:
    item = loads(event['body'])
    repo = item.get('repository', None)
    if repo is None:
        return Response()

    @dataclass
    class GithubMetadata(Metadata):
        event: AnyStr

    def to_timestamp(date):
        return int(isoparse(date).timestamp()) if isinstance(date, str) else int(date)

    return Data(
        metadata=GithubMetadata(
            timestamp=datetime.now().timestamp(),
            event=event['headers']['X-GitHub-Event']),
        data={
            'id': repo.get('id', ''),
            'name': repo.get('name', ''),
            'description': repo.get('description', ''),
            'url': repo.get('url', ''),
            'html_url': repo.get('html_url', ''),
            'owner': repo.get('owner', {}).get('login', ''),
            'created_at': to_timestamp(repo.get('created_at', '')),
            'updated_at': to_timestamp(repo.get('updated_at', '')),
            'pushed_at': to_timestamp(repo.get('pushed_at', '')),
            'language': repo.get('language', ''),
            'forks_count': repo.get('forks_count', ''),
            'stargazers_count': repo.get('stargazers_count', ''),
            'default_branch': repo.get('default_branch', '')
        })
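# Hypothetical example (not part of the repo): the minimal webhook event shape
# the GitHub ingest handler above expects, with made-up values. Assumes
# `from json import dumps`.
example_event = {
    'headers': {'X-GitHub-Event': 'repository'},
    'body': dumps({
        'repository': {
            'id': 1296269,
            'name': 'example-repo',
            'owner': {'login': 'example-user'},
            'created_at': '2020-01-01T00:00:00Z',
            'updated_at': '2020-01-02T00:00:00Z',
            'pushed_at': '2020-01-02T00:00:00Z',
            'language': 'Python',
            'forks_count': 0,
            'stargazers_count': 0,
            'default_branch': 'main'
        }
    })
}
# ingest(example_event) should return a Data object tagged with GithubMetadata.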
def ingest(event) -> Data:
    event_body = event['body']
    body_json = json.loads(event_body)
    return Data(Metadata(timestamp=int(datetime.now().timestamp())), data=body_json)
def ingest(event) -> Data:
    username, password, client, template_id = SSM(with_decryption=True).get(
        'UBW_USERNAME', 'UBW_PASSWORD', 'UBW_CLIENT', 'UBW_TEMPLATE_ID')
    url = SSM().get('UBW_URL')
    soap_client = Client(
        wsdl=f'{url}?QueryEngineService/QueryEngineV200606DotNet')
    res = soap_client.service.GetTemplateResultAsXML(
        input={
            'TemplateId': template_id,
            'TemplateResultOptions': {
                'ShowDescriptions': True,
                'Aggregated': True,
                'OverrideAggregation': False,
                'CalculateFormulas': True,
                'FormatAlternativeBreakColumns': True,
                'RemoveHiddenColumns': False,
                'FirstRecord': -1,
                'LastRecord': -1
            }
        },
        credentials={
            'Username': username,
            'Client': client,
            'Password': password,
        })
    return Data(metadata=Metadata(timestamp=datetime.now().timestamp()),
                data=parse(res['TemplateResult'])['Agresso']['AgressoQE'])
def ingest(event) -> Data:
    api_token = SSM(with_decryption=True).get('github_api_token')
    headers = {'Authorization': f'Bearer {api_token}'}
    res = requests.get(url, headers=headers)
    repos = res.json()

    # Follow GitHub's Link-header pagination, keeping the auth header on
    # every follow-up request.
    while 'next' in res.links:
        res = requests.get(res.links['next']['url'], headers=headers)
        repos.extend(res.json())

    def to_timestamp(date):
        return int(isoparse(date).timestamp()) if isinstance(date, str) else int(date)

    def data_point(repo):
        # TODO: Move hard coding of values to another file?
        return {
            'id': repo['id'],
            'name': repo['name'],
            'description': repo['description'],
            'url': repo['url'],
            'html_url': repo['html_url'],
            'owner': repo['owner']['login'],
            'created_at': to_timestamp(repo['created_at']),
            'updated_at': to_timestamp(repo['updated_at']),
            'pushed_at': to_timestamp(repo['pushed_at']),
            'language': repo['language'],
            'forks_count': repo['forks_count'],
            'stargazers_count': repo['stargazers_count'],
            'default_branch': repo['default_branch']
        }

    return Data(metadata=Metadata(timestamp=datetime.now().timestamp()),
                data=[data_point(repo) for repo in repos])
def ingest(event) -> Union[Data, Response]:
    def remove_emoji_modifiers(emoji):
        return re.sub(r'::skin-tone-.', '', emoji)

    def get_channel_name(channel):
        slack_token = SSM(with_decryption=True).get('slack_app_token')
        res = requests.get('https://slack.com/api/channels.info',
                           headers={'Authorization': f'Bearer {slack_token}'},
                           params={'channel': channel})
        return res.json().get('channel', {}).get('name', None)

    data = loads(event['body'])
    slack_event = data['event']
    channel = slack_event.get('channel',
                              slack_event.get('item', {}).get('channel', ''))
    event_type = slack_event['type']

    # Ignore anything that is not a user-generated message or reaction in a
    # public channel (channel ids start with 'C').
    if not channel.startswith('C') or \
            slack_event.get('subtype', '') == 'bot_message' or \
            event_type not in ['reaction_added', 'message']:
        return Response()

    channel_name = get_channel_name(channel)

    if event_type == 'reaction_added':
        event_data = [{
            'event_type': event_type,
            'channel': channel,
            'channel_name': channel_name,
            'event_ts': data['event_time'],
            'team_id': data['team_id'],
            'emoji': remove_emoji_modifiers(slack_event['reaction']),
        }]
    else:
        emoji_list = [
            remove_emoji_modifiers(inner_element.get('name'))
            for block in slack_event.get('blocks', [])
            for element in block.get('elements', [])
            for inner_element in element.get('elements', [])
            if inner_element.get('type', '') == 'emoji'
        ]
        event_data = [{
            'event_type': event_type,
            'channel': channel,
            'channel_name': channel_name,
            'event_ts': data['event_time'],
            'team_id': data['team_id'],
            'emoji': emoji,
        } for emoji in emoji_list]

    if not event_data:
        return Response()

    return Data(metadata=Metadata(timestamp=datetime.now().timestamp()),
                data=event_data)
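# Hypothetical example (not part of the repo): a reaction_added payload of the
# shape the Slack ingest handler above reads. Values are made up; assumes
# `from json import dumps`.
example_event = {
    'body': dumps({
        'team_id': 'T00000000',
        'event_time': 1577836800,
        'event': {
            'type': 'reaction_added',
            'reaction': 'thumbsup::skin-tone-2',
            'item': {'channel': 'C00000000'}
        }
    })
}
# Feeding this to ingest(...) should yield one 'thumbsup' data point (the
# skin-tone modifier is stripped); note that get_channel_name still calls the
# Slack API, so a test would need to stub it out.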
def make_queue_event(data: schema.Data):
    s3_bucket.Object('/data/test.json').put(
        Body=data.to_json().encode('utf-8'))
    return {
        'Records': [{
            'body': '/data/test.json',
            'messageAttributes': {
                's3FileName': {
                    'stringValue': '/data/test.json'
                }
            }
        }]
    }
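# Hypothetical usage sketch (not part of the repo): wrap a Data object in a
# fake SQS event so a downstream processing handler can be exercised in a
# test. `schema.Metadata` and the test name are assumptions.
def test_make_queue_event_points_to_uploaded_file():
    data = schema.Data(
        metadata=schema.Metadata(timestamp=int(datetime.now().timestamp())),
        data=[{'id': 1}])
    queue_event = make_queue_event(data)
    attributes = queue_event['Records'][0]['messageAttributes']
    assert attributes['s3FileName']['stringValue'] == '/data/test.json'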
def ingest(event) -> Data:
    timestamp_now = datetime.now().timestamp()
    d = [{
        'alias': 'olanor',
        'test': 'This is a test message',
        'id': 1,
        'time_precise': str(datetime.now().timestamp())
    }, {
        'alias': 'karnor',
        'test': 'This is also a test message',
        'id': 2,
        'time_precise': str(datetime.now().timestamp())
    }]
    return Data(metadata=Metadata(timestamp=int(timestamp_now)), data=d)
def ingest(event) -> Data:
    register_or_update_webhooks()
    password, username = SSM(with_decryption=True).get('jira_sales_password',
                                                       'jira_sales_username')
    search_url = SSM(with_decryption=False).get('jira_sales_search_url')
    res = get(search_url,
              auth=HTTPBasicAuth(username, password),
              json={
                  'jql': "project = SALG and status != 'Rejected'",
                  'fields': 'labels, status, created, updated'
              })
    data = [{
        'issue': item['key'],
        'customer': item['fields']['labels'][0] if len(item['fields']['labels']) > 0 else '',
        'issue_status': item['fields']['status']['name'],
        'created': int(isoparse(item['fields']['created']).timestamp()),
        'updated': int(isoparse(item['fields']['updated']).timestamp())
    } for item in res.json().get('issues', [])]
    return Data(metadata=Metadata(timestamp=datetime.now().timestamp()),
                data=data)
def ingest(event) -> Data:
    body = loads(event['body'])
    event_type, item = body['webhookEvent'].split(':')[-1], body['issue']

    @dataclass
    class JiraMetadata(Metadata):
        event_type: AnyStr

    return Data(
        metadata=JiraMetadata(timestamp=datetime.now().timestamp(),
                              event_type=event_type),
        data={
            'issue': item['key'],
            'customer': item['fields']['labels'][0] if len(item['fields']['labels']) > 0 else '',
            'issue_status': item['fields']['status']['name'],
            'created': int(isoparse(item['fields']['created']).timestamp()),
            'updated': int(isoparse(item['fields']['updated']).timestamp())
        })
def put(self, data: Data, path: str = ''):
    data_json = data.to_json().encode('utf-8')
    return self.put_raw(data_json, ext='json', path=path)
def ingest(event) -> Data:
    consumer_key, consumer_secret, access_token, access_secret = SSM(
        with_decryption=True).get('twitter_comsumer_key',
                                  'twitter_comsumer_secret',
                                  'twitter_access_token',
                                  'twitter_access_secret')
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_secret)
    api = API(auth)

    search_args = [
        'knowit', '"knowit objectnet"', '"knowit amende"', '"knowit solutions"',
        '"knowit experience"', '"knowit insight"', 'knowitab', 'knowitnorge',
        'knowit norge', '"knowit stavanger"', 'knowit bergen', 'knowit oslo',
        'knowit sverige', 'knowit norway', 'knowit sweden', 'knowit finland',
        'knowitx'
    ]
    knowit_accounts = ['knowitnorge', 'knowitab', 'KnowitSuomi', 'knowitx']

    def search_data():
        search_result = np.hstack([[
            item for item in Cursor(api.search,
                                    q=arg,
                                    lang='no' if arg == 'knowit' else None,
                                    tweet_mode='extended').items()
            if item.user.screen_name not in knowit_accounts
        ] for arg in search_args])
        return [{
            'tweet_id': item.id,
            'created_at': int(item.created_at.timestamp()),
            'text': item.full_text,
            'is_retweet': item.full_text.startswith('RT @'),
            'favorite_count': item.favorite_count,
            'retweet_count': item.retweet_count,
            'language': item.lang,
            'hashtags': as_separated_list(item.entities['hashtags'], 'text'),
            'place': item.place.full_name if item.place else None,
            'reply_to': item.in_reply_to_screen_name
            if item.in_reply_to_screen_name
            and item.in_reply_to_screen_name in knowit_accounts else None
        } for item in search_result]

    def timeline_data():
        timeline_result = np.hstack([[
            item for item in Cursor(api.user_timeline,
                                    screen_name=account,
                                    tweet_mode='extended').items()
        ] for account in knowit_accounts])
        return [{
            'tweet_id': item.id,
            'created_at': int(item.created_at.timestamp()),
            'user_screen_name': item.user.screen_name,
            'text': item.full_text,
            'is_retweet': item.full_text.startswith('RT @'),
            'favorite_count': item.favorite_count,
            'retweet_count': item.retweet_count,
            'language': item.lang,
            'hashtags': as_separated_list(item.entities['hashtags'], 'text'),
            'mentions': as_separated_list(item.entities['user_mentions'],
                                          'screen_name'),
            'user_name': item.user.name
        } for item in timeline_result]

    def account_data():
        account_result = [
            api.get_user(screen_name=account) for account in knowit_accounts
        ]
        return [{
            'user_id': item.id,
            'screen_name': item.screen_name,
            'name': item.name,
            'statuses_count': item.statuses_count,
            'followers_count': item.followers_count,
            'favourites_count': item.favourites_count,
            'friends_count': item.friends_count,
            'listed_count': item.listed_count
        } for item in account_result]

    return Data(metadata=Metadata(timestamp=datetime.now().timestamp()),
                data={
                    'search': search_data(),
                    'timeline': timeline_data(),
                    'accounts': account_data()
                })
def ingest(event) -> Data:
    def ubw_record_filter(record):
        if "tab" not in record or "reg_period" not in record:
            return False
        # Only the "B" documents are completed; the rest should be ignored.
        if record["tab"] != "B":
            return False
        # Only upload docs that are older than 4 weeks.
        year, week = record["reg_period"][0:4], record["reg_period"][4:]
        cur_year, cur_week = datetime.now().isocalendar()[0:2]
        number_of_weeks = int(year) * 52 + int(week)
        current_number_of_weeks = cur_year * 52 + cur_week
        if number_of_weeks > current_number_of_weeks - 4:
            return False
        return True

    username, password, client, template_id = SSM(
        with_decryption=True
    ).get('UBW_USERNAME', 'UBW_PASSWORD', 'UBW_CLIENT', 'UBW_TEMPLATE_ID')
    url = SSM().get('UBW_URL')
    soap_client = Client(
        wsdl=f'{url}?QueryEngineService/QueryEngineV200606DotNet')
    res = soap_client.service.GetTemplateResultAsXML(
        input={
            'TemplateId': template_id,
            'TemplateResultOptions': {
                'ShowDescriptions': True,
                'Aggregated': True,
                'OverrideAggregation': False,
                'CalculateFormulas': True,
                'FormatAlternativeBreakColumns': True,
                'RemoveHiddenColumns': False,
                'FirstRecord': -1,
                'LastRecord': -1
            },
            'SearchCriteriaPropertiesList': {
                'SearchCriteriaProperties': [{
                    'ColumnName': 'timecode',
                    'Description': 'Tidskode',
                    'RestrictionType': '!()',
                    'FromValue': "'X9'",
                    'DataType': 10,
                    'DataLength': 25,
                    'DataCase': 2,
                    'IsParameter': True,
                    'IsVisible': False,
                    'IsPrompt': False
                }]
            },
        },
        credentials={
            'Username': username,
            'Client': client,
            'Password': password,
        })
    ubw_data = parse(res['TemplateResult'])['Agresso']['AgressoQE']
    return Data(
        metadata=Metadata(timestamp=datetime.now().timestamp()),
        data=[rec for rec in ubw_data if ubw_record_filter(rec)]
    )
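# Hypothetical illustration (not part of the repo) of the reg_period cutoff in
# ubw_record_filter above: '202005' means year 2020, week 05, which maps to
# 2020 * 52 + 5 = 105045 "absolute" weeks; a record is kept only if that value
# is at least 4 below the current week's value. The helper name is made up.
def is_older_than_four_weeks(reg_period: str, now=None) -> bool:
    year, week = int(reg_period[0:4]), int(reg_period[4:])
    cur_year, cur_week = (now or datetime.now()).isocalendar()[0:2]
    return year * 52 + week <= cur_year * 52 + cur_week - 4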
def ingest(event) -> Data:
    return Data(metadata=Metadata(timestamp=datetime.now().timestamp()),
                data=None)
def ingest(event) -> Data:
    def scrape_archive_urls():
        def scrape_url(url, pattern):
            response = requests.get(url)
            soup = BeautifulSoup(response.content, features='lxml')
            urls = list(
                set([
                    a['href'] for a in soup.find_all(
                        href=lambda href: href and re.match(pattern, href))
                ]))
            return urls if urls else [url]

        blog_year_urls = scrape_url(
            'https://knowitlabs.no/archive',
            r'https:\/\/knowitlabs\.no\/archive\/\d{4}')
        return np.hstack([
            scrape_url(url, r'https:\/\/knowitlabs\.no\/archive\/\d{4}\/\d{2}')
            for url in blog_year_urls
        ])

    def scrape_article_data(url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, features='lxml')

        def map_content(content):
            json_content = json.loads(
                content[content.find('{'):content.rfind('}') + 1])
            user_map = json_content.get('references', {}).get('User', {})
            return [{
                'medium_id': post['id'],
                'author_name': user_map.get(post['creatorId'], {}).get('name', ''),
                'author_username': user_map.get(post['creatorId'], {}).get('username', ''),
                'title': post['title'],
                'created_at': int(float(post['createdAt']) / 1000.0),
                'updated_at': int(float(post['updatedAt']) / 1000.0),
                'first_published_at': int(float(post['firstPublishedAt']) / 1000.0),
                'latest_published_at': int(float(post['latestPublishedAt']) / 1000.0),
                'word_count': post['virtuals']['wordCount'],
                'reading_time': post['virtuals']['readingTime'],
                'total_claps': post['virtuals']['totalClapCount'],
                'total_unique_claps': post['virtuals']['recommends'],
                'language': post['detectedLanguage'],
                'url': f'https://knowitlabs.no/{post["uniqueSlug"]}',
                'comments_count': post['virtuals']['responsesCreatedCount']
            } for post in json_content.get('references', {}).get('Post', {}).values()]

        contents = [
            tag.string for tag in soup.find_all(
                'script',
                string=lambda s: s and s.startswith('// <![CDATA[\nwindow["obvInit"]({'))
        ]
        return np.hstack([map_content(content) for content in contents])

    return Data(metadata=Metadata(timestamp=datetime.now().timestamp()),
                data=np.hstack([
                    scrape_article_data(url) for url in scrape_archive_urls()
                ]))
def ingest(event) -> Data:
    objectnet_id = SSM(with_decryption=False).get('cv_partner_objectnet_id')
    sor_id = SSM(with_decryption=False).get('cv_partner_sor_id')
    api_token = SSM(with_decryption=True).get('cv_partner_api_token')
    res = requests.get(
        f'{url}/search?office_ids[]={objectnet_id}&office_ids[]={sor_id}&offset=0&size={offset_size}',
        headers={'Authorization': f'Bearer {api_token}'})
    data_json = res.json()

    empty_content_in_path(bucket=environ.get('PRIVATE_BUCKET'),
                          prefix=environ.get('PRIVATE_PREFIX'))
    empty_content_in_path(bucket=environ.get('PUBLIC_BUCKET'),
                          prefix=environ.get('PUBLIC_PREFIX'))

    def write_cv_doc_to_private_bucket(person, language: str = 'no', ext: str = 'pdf'):
        new_key = f'cv_{language}_{ext}'
        filename = f'{environ.get("PRIVATE_PREFIX")}/{uuid4()}.{ext}'
        http_request = {
            'requestUrl': get_cv_link(person['cv']['user_id'],
                                      person['cv']['id'],
                                      language=language,
                                      ext=ext),
            'header': {
                'Authorization': f'Bearer {api_token}'
            },
        }
        save_document(http_request, filename=filename, filetype=ext, private=True)
        return {new_key: filename}

    def write_cv_image_to_public_bucket(person, ext: str = 'jpg'):
        new_key = 'image_key'
        filename = f'{environ.get("PUBLIC_PREFIX")}/{uuid4()}.{ext}'
        http_request = {'requestUrl': person['cv']['image']['thumb']['url']}
        save_document(http_request, filename=filename, filetype=ext, private=False)
        return {new_key: filename}

    def get_cv_link(user_id, cv_id, language: str = 'no', ext: str = 'pdf'):
        return url_v1 + f'/cvs/download/{user_id}/{cv_id}/{language}/{ext}/'

    def get_person(person):
        d = {
            'user_id': person['cv']['user_id'],
            'default_cv_id': person['cv']['id'],
            'cv_link': get_cv_link(person['cv']['user_id'],
                                   person['cv']['id'],
                                   language='{LANG}',
                                   ext='{FORMAT}')
        }
        d.update(write_cv_image_to_public_bucket(person))
        d.update(write_cv_doc_to_private_bucket(person, language='no', ext='pdf'))
        d.update(write_cv_doc_to_private_bucket(person, language='int', ext='pdf'))
        d.update(write_cv_doc_to_private_bucket(person, language='no', ext='docx'))
        d.update(write_cv_doc_to_private_bucket(person, language='int', ext='docx'))
        return d

    def get_cv(user_id, cv_id):
        cv = requests.get(url + f'/cvs/{user_id}/{cv_id}',
                          headers={'Authorization': f'Bearer {api_token}'})
        return cv.json()

    def get_list_of_users(data):
        list_of_users = []
        for person in data['cvs']:
            user = get_person(person)
            user['cv'] = get_cv(user['user_id'], user['default_cv_id'])
            list_of_users.append(user)
        return list_of_users

    return Data(metadata=Metadata(timestamp=datetime.now().timestamp()),
                data=get_list_of_users(data_json))