class GitEvents(object):
    client = Client(settings.GITHUB_HOST + '/api/v3/events')
    # the etag for the last call to github
    etag = ''
    # the last event id to be pulled from github
    last_event = None

    def get_page_of_events(self, page=1, etag=True):
        """
        return a page (1-10) of events. if etag is True, will check etag version
        """
        headers = {'If-None-Match': self.etag} if etag else {}
        resp = self.client.get(headers=headers, params={"page": page})
        if etag:
            self.etag = resp.headers.get('ETag', self.etag)
        if resp.status_code == 200:
            return resp.json()
        elif resp.status_code == 304:
            return []

    def get_changed_page_urls(self):
        """
        return the urls for all pages changed since the last time
        get_changed_page_urls was called. Uses a combination of etag and the
        last synced event id to minimize (hopefully eliminate) duplication.
        """
        data = self.get_page_of_events()
        if not data:
            return data
        newest_last_event = int(data[0]['id'])
        intermediate_last_event = int(data[-1]['id'])
        # on the first run last_event is None, so fetch all ten pages
        last_event = self.last_event if self.last_event is not None else 0
        for page in range(2, 11):
            if intermediate_last_event <= last_event:
                break
            # guard against non-200/304 responses, which return None
            data += self.get_page_of_events(page=page, etag=False) or []
            intermediate_last_event = int(data[-1]['id'])
        # get the pages changed for GollumEvents (wiki edits; the GitHub API
        # uses CamelCase event type names) that happened after the last sync
        page_lists = [
            event['pages'] for event in data
            if event['type'] == 'GollumEvent' and int(event['id']) > last_event
        ]
        # each event can have multiple pages changed, so flatten the lists of pages
        pages = [item for sublist in page_lists for item in sublist]
        urls = [page['html_url'] for page in pages]
        urls = list(set(urls))  # dedup
        # update the last_event counter
        self.last_event = newest_last_event
        return urls
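# a minimal usage sketch (not part of the original): one GitEvents instance is
# re-used so the etag and last_event cursor persist between polls; the
# 60-second interval is arbitrary
import time

events = GitEvents()
while True:
    for url in events.get_changed_page_urls():
        print('wiki page changed: %s' % url)
    time.sleep(60)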
def update_index():
    es = Client('http://localhost:9200')
    # drop and recreate the "dc" index from the schema file
    es.dc.DELETE()
    with open(schema_file) as f:
        es.dc.POST(data=f.read())
    # stream the bulk index file to elasticsearch in ~10MB chunks
    with open(index_file) as f:
        while True:
            lines = f.readlines(10000000)
            if not lines:
                break
            resp = es._bulk.POST(data=''.join(lines))
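# a sketch (assumed, not from the original) of producing the bulk file that
# update_index streams: elasticsearch's _bulk endpoint expects one action line
# followed by one source line per document, newline-delimited; the index and
# type names here are placeholders
import json

def write_bulk_file(path, docs, index='dc', doc_type='doc'):
    with open(path, 'w') as f:
        for i, doc in enumerate(docs):
            f.write(json.dumps({'index': {'_index': index, '_type': doc_type, '_id': i}}) + '\n')
            f.write(json.dumps(doc) + '\n')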
def GitHub(user_model=None, access_token=None):
    """
    return a UniversalClient client for GitHub, authenticated with the given
    access_token. If access_token is not passed, will look for the
    access_token associated with the user_model.
    """
    if not access_token:
        try:
            access_token = user_model.social_auth.get(provider='github').tokens
        except Exception:  # no user_model, or no linked github account
            return None
    session = rauth.OAuth2Session(gh_client_key, gh_client_secret, access_token)
    return Client(gh_host, oauth=session, dataFilter=jsonFilter)
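# hypothetical usage: fetch the authenticated user's profile ('/user' is
# GitHub's endpoint for the current user; the token is a placeholder)
gh = GitHub(access_token='<token>')
if gh is not None:
    profile = gh.user.GET().json()
    print(profile['login'])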
def GitHubEnterprise(user_model=None, access_token=None):
    """
    return a UniversalClient client for GitHub Enterprise, authenticated with
    the given access_token. If access_token is not passed, will look for the
    access_token associated with the user_model.
    """
    if not access_token:
        if not user_model.is_authenticated():
            return None
        access_token = user_model.social_auth.get(
            provider='github-enterprise').tokens
    session = rauth.OAuth2Session(ghe_client_key, ghe_client_secret, access_token)
    return Client(ghe_host + '/api/v3', oauth=session, dataFilter=jsonFilter)
from server import settings
from universalclient import Client
import urllib3

from server import utils
import itertools
from gh_wiki import index as gh_wiki
from gh_readme import index as gh_readme
from gh_pages import index as gh_pages
from gh_issues import index as gh_issues

# gh_settings is undefined in the original snippet; assumed to mirror the
# GHE module's pattern (settings.GITHUB.get('GHE', {}))
gh_settings = settings.GITHUB.get('GH', {})

headers = {
    'keep_alive': True,
    'user_agent': 'cfpb-tiresias',
}
gh_api_client = Client(gh_settings.get('API'))
if 'AUTH' in gh_settings:
    gh_api_client = gh_api_client.auth(gh_settings['AUTH'])
    headers['basic_auth'] = '%s:%s' % gh_settings['AUTH']

gh_pool = urllib3.connection_from_url(gh_settings.get('WEB'), maxsize=50, block=True)
gh_api_pool = urllib3.connection_from_url(
    gh_settings.get('API'), maxsize=50, block=True,
    headers=urllib3.util.make_headers(**headers))
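# illustrative use of the shared pool (the repo path is made up): the pool
# keeps up to 50 connections alive to the configured GitHub API host
import json

resp = gh_api_pool.urlopen('GET', '/repos/cfpb/tiresias')
repo = json.loads(resp.data)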
# encoding: utf-8

from __future__ import absolute_import

from functools import wraps
import datetime

from dateutil.tz import tzutc
from django.conf import settings
from universalclient import Client

# Meetup API
MEETUP = Client("http://api.meetup.com").setArgs(params={"key": settings.MEETUP_API_KEY})

# Upcoming events
UPCOMING_EVENTS = MEETUP._('2').events.setArgs(params={"group_urlname": "dcpython"})

# Past events
PAST_EVENTS = MEETUP._('2').events.setArgs(params={"group_urlname": "dcpython", "status": "past"})


# Via https://github.com/pythonkc/pythonkc-meetups/blob/master/pythonkc_meetups/parsers.py#L102
def parse_datetime_ms(utc_timestamp_ms, utc_offset_ms=None):
    """
    Create a timezone-aware ``datetime.datetime`` from the given UTC
    timestamp (in milliseconds), if provided. If an offset is given, it is
    applied to the datetime returned.

    Parameters
    ----------
    utc_timestamp_ms
        UTC timestamp in milliseconds.
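# the original body is cut off above; an illustrative conversion using the
# imports already present (datetime, tzutc), not the author's code
def _parse_datetime_ms_sketch(utc_timestamp_ms, utc_offset_ms=None):
    dt = datetime.datetime.fromtimestamp(utc_timestamp_ms / 1000.0, tz=tzutc())
    if utc_offset_ms is not None:
        dt += datetime.timedelta(milliseconds=utc_offset_ms)
    return dt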
from django.conf import settings
from universalclient import Client, jsonFilter
import requests
import json

kratos = Client(settings.KRATOS_URL, dataFilter=jsonFilter,
                auth=('admin', settings.KRATOS_ADMIN_PWD),
                headers={'Content-Type': 'application/json'})


def register_kratos(request, response, user, **kwargs):
    if not user or not user.gh_id:
        return
    social_auth = user.social_auth.get(provider='github').extra_data
    kratos_data = kratos.users.get(params={'gh': user.gh_id}).json()
    if kratos_data.get('error') == 'not_found':
        kratos_user = {
            "data": {
                "username": user.username,
                "contractor": user.contractor,
            },
            "roles": ["gh|user", "kratos|enabled"],
            "rsrcs": {
                "gh": {
                    "username": social_auth['username'],
                    "id": social_auth['id'],
                },
            },
        }
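        # hypothetical continuation (the original snippet ends above): send the
        # new record to the service; the endpoint shape is an assumption
        resp = kratos.users.POST(data=json.dumps(kratos_user))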
from universalclient import Client, jsonFilter
import json, click, re, os

DIR = os.path.abspath(os.path.dirname(__file__))
lims_data_path = os.path.join(DIR, 'lims_data.json')

lims = Client('http://lims.dccouncil.us/api/v1/',
              headers={'Content-Type': 'application/json'},
              dataFilter=jsonFilter)
leg_details = lims.Legislation.Details._('{}')
leg_search = lims.Legislation.AdvancedSearch._('100/{}').data(
    {"LegislationStatus": "130"})

out = json.load(open(lims_data_path))

# offset = int(len(out)/100)
# while True:
#     leg_data = leg_search.POST(offset).json()
#     print('getting:', offset*100, '-', offset*100+len(leg_data))
#     with click.progressbar(leg_data) as leg_data_bar:
#         for leg_datum in leg_data_bar:
#             leg_number = leg_datum['LegislationNumber']
#             leg = leg_details.GET(leg_number).json()
#             out[leg_number] = leg
#     offset += 1
#     json.dump(out, open(lims_data_path, 'w'), sort_keys=True, indent=2)
#     if not leg_data:
#         print('completed')
#         break
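# illustrative one-off fetch (the bill number is made up): the _('{}') segment
# in leg_details is filled by the positional argument to GET, as in the
# commented-out loop above, so this requests Legislation/Details/B22-0001
leg = leg_details.GET('B22-0001').json()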
from universalclient import Client, jsonFilter
import urllib3
from server import settings
from gevent import subprocess
from server import schemas
import json
import bs4
import re

es_client = Client(settings.ES_HOST, dataFilter=jsonFilter)
es_pool = urllib3.connection_from_url(
    settings.ES_HOST, maxsize=50, block=True,
    headers=urllib3.util.make_headers(keep_alive=True))

history_index = 'history'
search_index = 'search'
search_client = es_client.search
history_client = es_client.history


def save_indexed_version(gh_type, repo_name, typ, version):
    # document ids are "<gh_type>/<repo_name>" with slashes percent-encoded
    doc_id = (gh_type + '/' + repo_name).replace('/', '%2F')
    body = json.dumps({'version': version})
    url = '/%s/%s/%s/_update' % (history_index, typ, doc_id)
    resp = es_pool.urlopen('POST', url, body=body)
    if resp.status == 500:
        url = '/%s/%s/%s' % (history_index, typ, doc_id)
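# hypothetical call (repo and version values are made up): record that the
# wiki of a repo has been indexed at a particular git version
save_indexed_version('gh', 'cfpb/tiresias', 'wiki', 'abc123')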
def GitHubAdmin(credentials=None):
    credentials = credentials or gh_admin_auth
    return Client(gh_host, auth=credentials, dataFilter=jsonFilter)
from server import settings
import helpers
from universalclient import Client
import time
from datetime import datetime

jira_api_client = Client(settings.JIRA_HOST).rest.api._(2)

jira_fields = 'assignee,creator,updated,project,status,summary,labels,description,comment'
max_results = 500


def index():
    """
    sync all jira issues
    """
    offset = 0
    issues = []
    start = time.mktime(datetime.now().timetuple())
    # Grab all data via API calls, 500 issues at a time
    # TODO gevent solution
    while True:
        resp = jira_api_client.search.params(
            fields=jira_fields,
            startAt=offset,
            maxResults=max_results,
        ).get().json()
        issues += resp['issues']
        if resp['total'] > len(issues):
            offset += max_results
        else:
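            # hypothetical continuation (the original snippet is cut off):
            # resp['total'] <= len(issues) means every page has been fetched
            break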
from server import settings
ghe_settings = settings.GITHUB.get('GHE', {})

from universalclient import Client
import urllib3
from server import utils
from wiki import index as wiki
from readme import index as readme
from gh_pages import index as gh_pages
from gh_issues import index as gh_issues

ghe_api_client = Client(ghe_settings.get('API')).api.v3
ghe_api_pool = urllib3.connection_from_url(
    ghe_settings.get('API'), maxsize=50, block=True,
    headers=urllib3.util.make_headers(keep_alive=True))
ghe_pool = urllib3.connection_from_url(ghe_settings.get('WEB'), maxsize=50, block=True)


def get_repos():
    if not ghe_settings:
        return []
    return [
        repo['full_name'] for repo in utils.iter_get_url(
            ghe_settings['API_PATH'] + '/repositories', ghe_api_pool)
        if not repo['fork']
    ]
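# hypothetical one-off call: each _() appends a path segment, so this expands
# to GET <API>/api/v3/repos/octocat/hello-world (owner and repo are made up)
repo = ghe_api_client.repos._('octocat')._('hello-world').GET().json()
print(repo['full_name'], repo['fork'])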
from gevent import monkey
from gevent.pool import Pool

pool = Pool(50)
# patches stdlib (including socket and ssl modules) to cooperate with other greenlets
monkey.patch_all()

import re
import urllib2
from os import path
from os.path import join as path_join

# these two imports are missing from the original snippet; assumed to match
# the sibling modules
from universalclient import Client
from server import settings

DIR = path.dirname(path.realpath(__file__))
LOG = path_join(DIR, '..', 'client', 'dist', 'log')

es_client = Client(settings.ES_HOST)
gh_client = Client(settings.GITHUB_HOST)
gh_api_client = gh_client.api.v3

whitespace_re = re.compile(r'(\W|\n)+')


def extract_text_from_html(soup):
    text_nodes = soup.findAll(text=True)
    text_with_newlines = ' '.join(text_nodes)
    text = whitespace_re.sub(' ', text_with_newlines)
    return text


def _get_soup(url, id):
    """
    return a generator that, given a url, gets the content, parses it, and
    returns a tuple of the url, the repo name, and the soup of the tag with
    the given id
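# illustrative run of extract_text_from_html (bs4 assumed installed)
import bs4

soup = bs4.BeautifulSoup('<p>Hello,\n <b>world</b>!</p>', 'html.parser')
print(extract_text_from_html(soup))  # prints 'Hello world ' (non-word runs collapse to single spaces)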