# NOTE(review): this definition is immediately shadowed by an identical
# `index_repos_parallel` defined right below it — only the later one is
# ever callable. One of the two copies should be deleted.
def index_repos_parallel(session, es):
    # Page through the Bitbucket repos endpoint, bulk-index each page of
    # repos into Elasticsearch, then start one process per fetched page
    # to index that page's files.
    logging.info("Indexing repositories")
    page_num = 1
    # NOTE(review): chained assignment makes `repos` and `repos_list` alias
    # the SAME list object until `repos` is rebound inside the loop.
    repos = repos_list = []
    while True:
        page_params = {"page": str(page_num)}
        try:
            response = session.get(bb_config['repos_endpoint'], params=page_params)
        except ConnectionError:
            # NOTE(review): the error is logged but execution falls through to
            # `response.status_code` below — NameError if the very first request
            # fails, or a stale `response` from the previous page otherwise.
            logging.error("Connection error! at page " + str(page_num))
        if response.status_code == requests.codes.ok:
            repos = response.json()['values']
            # Python 2 debug print — leftover diagnostic output.
            print len(repos)
            if len(repos) == 0:
                # Empty page: all repositories have been consumed.
                break
            repos_bulk = []
            for repo in repos:
                repo = enhance_repo(session, repo)
                # One bulk action per repo, targeting the configured index.
                action = {}
                action.update({"_source": repo})
                action.update({"_index" : es_config['index']})
                action.update({"_type" : 'repo'})
                repos_bulk.append(action)
            helpers.bulk(es, repos_bulk)
            repos_list.append(repos)
            logging.info(str(len(repos)) + " repos were just indexed")
            page_num += 1
        elif response.status_code == 400:
            # presumably the API answers 400 past the last page — TODO confirm
            break
        else:
            logging.info("Indexing repos stopped with response code " + str(response.status_code))
            break
    # Spawn one worker process per page of repos collected above.
    for num in range(len(repos_list)):
        Process(target=parallel_index_files, args=(repos_list[num], num)).start()
        logging.info("Started process num: " + str(num))
def index_repos_parallel(session, es):
    """Index every Bitbucket repository into Elasticsearch, page by page,
    then start one process per fetched page to index that page's files.

    session -- requests-style session used against the Bitbucket API
    es      -- Elasticsearch client handed to helpers.bulk
    """
    logging.info("Indexing repositories")
    page_num = 1
    # Fix: the original `repos = repos_list = []` aliased both names to the
    # same list object; only one accumulator is actually needed here.
    repos_list = []
    while True:
        page_params = {"page": str(page_num)}
        try:
            response = session.get(bb_config['repos_endpoint'], params=page_params)
        except ConnectionError:
            logging.error("Connection error! at page " + str(page_num))
            # Fix: the original fell through and read `response` anyway,
            # raising NameError on a first-page failure (or reusing the
            # previous page's stale response). Stop paging instead.
            break
        if response.status_code == requests.codes.ok:
            repos = response.json()['values']
            # Replaces a leftover Python 2 `print` debug statement.
            logging.debug("Fetched " + str(len(repos)) + " repos on page " + str(page_num))
            if len(repos) == 0:
                # Empty page: all repositories have been consumed.
                break
            repos_bulk = []
            for repo in repos:
                repo = enhance_repo(session, repo)
                # One bulk action per repo, targeting the configured index.
                repos_bulk.append({
                    "_source": repo,
                    "_index": es_config['index'],
                    "_type": 'repo',
                })
            helpers.bulk(es, repos_bulk)
            repos_list.append(repos)
            logging.info(str(len(repos)) + " repos were just indexed")
            page_num += 1
        elif response.status_code == 400:
            # presumably the API answers 400 past the last page — TODO confirm
            break
        else:
            logging.info("Indexing repos stopped with response code " + str(response.status_code))
            break
    # Spawn one worker process per page of repos collected above.
    for num, page_repos in enumerate(repos_list):
        Process(target=parallel_index_files, args=(page_repos, num)).start()
        logging.info("Started process num: " + str(num))
def update_repos(session, es, since):
    # Re-index every Bitbucket repository modified after `since`
    # (a time.struct_time): existing repos are updated in place in
    # Elasticsearch, unseen repos are indexed along with their files,
    # and finally update_files() is run over everything that changed.
    page_num = 1
    updated_repos = []
    size = 0
    while True:
        page_params = {"page": str(page_num)}
        repos = session.get(bb_config['repos_endpoint'], params=page_params).json()
        if 'values' not in repos:
            # A response without 'values' normally means we paged past the
            # end. If it happens on the first call (size == 0) or before the
            # expected page count is exhausted, treat it as a config error.
            # NOTE(review): the `page_num * 10` heuristic assumes a page size
            # of 10 repos — verify against the Bitbucket API's pagelen.
            if (size == 0) or (page_num * 10 < size):
                logging.error("Error in calling " + bb_config['repos_endpoint'])
                logging.error("Please check your bitbucket.conf file")
                # NOTE(review): exit(1) terminates the whole process from
                # inside a library function — consider raising instead.
                exit(1)
            logging.info("Checked all repos")
            break
        else:
            size = repos['size']
            repos = repos['values']
            for repo in repos:
                # 'updated_on' carries fractional seconds; drop them so the
                # timestamp matches the strptime format.
                repo_updated_on = time.strptime(repo["updated_on"].split(".")[0], '%Y-%m-%dT%H:%M:%S')
                if (since < repo_updated_on):
                    repo = enhance_repo(session, repo)
                    # Look the repo up by exact full_name to decide between
                    # update and fresh insert.
                    old_repo = es.search(index=es_config['index'], body={"query":{ "match_phrase":{"full_name": repo['full_name'] }}})
                    # if the repo already exists, update it
                    if len(old_repo['hits']['hits']) > 0:
                        logging.info(repo["full_name"] + " - Repo already exists, updating it")
                        repo_id = old_repo['hits']['hits'][0]['_id']
                        es.index(index=es_config['index'], doc_type="repo", id=repo_id, body=repo)
                        updated_repos.append(repo)
                    # if not, index it and index its files
                    else:
                        es.index(index=es_config['index'], doc_type="repo", body=repo)
                        index_files(session, es, repo)
            page_num += 1
    logging.info(str(len(updated_repos)) + " updated repos were found")
    # Only repos that already existed (and were updated) get their files
    # refreshed here; new repos were handled by index_files above.
    update_files(session, es, updated_repos, since)