def setUpClass(cls):
    """Populate the test backends with sample commits, projects and tags."""
    cls.con = index.Connector(index='repoxplorertest')
    cls.conp = index.Connector(
        index='repoxplorertest', index_suffix='projects')
    commits_backend = Commits(cls.con)
    commits_backend.add_commits(COMMITS)
    cls.db = set_projects_definition(cls.conp)
    tags_backend = Tags(index.Connector(
        index='repoxplorertest', index_suffix='tags'))
    # Two sample tags pointing at the same repo/date, differing in sha/name
    sample_repo = 'https://github.com/nakata/monkey.git:monkey'
    sample_tags = []
    for sha, name in (
            ('3597334f2cb10772950c97ddf2f6cc17b184', 'tag1'),
            ('3597334f2cb10772950c97ddf2f6cc17b1845', 'tag2')):
        sample_tags.append({
            'sha': sha,
            'date': 1410456005,
            'repo': sample_repo,
            'name': name,
        })
    tags_backend.add_tags(sample_tags)
class RepoIndexer():
    """Index commits and tags of one Git repository branch into the backend.

    The indexer maintains a local bare clone, computes the delta between the
    upstream commit history and what is already referenced in the index, then
    creates, updates or dereferences commit/tag documents accordingly.
    """

    def __init__(self, name, uri, parsers=None, con=None, config=None):
        """Prepare the backends, the local git store and the parser list.

        :param name: short repository name
        :param uri: remote repository URI
        :param parsers: optional list of regex strings applied to commit
            messages; compiled lazily in index()
        :param con: optional pre-built index.Connector to reuse
        :param config: optional config forwarded to configuration.set_config()
        """
        if config:
            configuration.set_config(config)
        if not con:
            self.con = index.Connector()
        else:
            self.con = con
        self.c = Commits(self.con)
        self.t = Tags(self.con)
        if not os.path.isdir(conf.git_store):
            os.makedirs(conf.git_store)
        self.name = name
        self.uri = uri
        # Branch-independent identifier of this repository
        self.base_id = '%s:%s' % (self.uri, self.name)
        self.seen_refs_path = os.path.join(conf.db_path, SEEN_REFS_CACHED)
        if not parsers:
            self.parsers = []
        else:
            self.parsers = parsers
        self.parsers_compiled = False
        # Local bare clone location; '/' in the URI is not filesystem safe
        self.local = os.path.join(conf.git_store, self.name,
                                  self.uri.replace('/', '_'))
        if not os.path.isdir(self.local):
            os.makedirs(self.local)
        # Credential helper shipped with repoxplorer, looked up in sys.prefix
        self.credentials_helper_path = os.path.join(
            sys.prefix, 'bin', 'repoxplorer-git-credentials-helper')

    def __str__(self):
        # NOTE: valid only after set_branch() has defined self.ref_id
        return 'Git indexer of %s' % self.ref_id

    def save_seen_ref_in_cache(self):
        # Keep a cache of each ref that has been indexed.
        # It is used later to discover seen refs no longer in projects.yaml;
        # in that case a removal from the backend will be performed.
        logger.debug("Save ref %s into seen_refs file" % self.ref_id)
        if not os.path.isfile(self.seen_refs_path):
            data = set()
        else:
            try:
                data = cPickle.load(file(self.seen_refs_path))
            except Exception:
                # Protect against corrupted file
                data = set()
        data.add(self.ref_id)
        # NOTE(review): py2-era file()/text-mode pickle usage
        cPickle.dump(data, file(self.seen_refs_path, 'w'))

    def set_branch(self, branch):
        """Select the branch to index and record the resulting ref id."""
        self.branch = branch
        self.ref_id = '%s:%s:%s' % (self.uri, self.name, self.branch)
        self.save_seen_ref_in_cache()

    def git_init(self):
        """Create the local bare clone and register its origin remote."""
        logger.debug("Git init for %s:%s in %s" % (self.uri, self.name,
                                                   self.local))
        run(["git", "init", "--bare", "."], self.local)
        # NOTE(review): substring check against the full `remote -v` output;
        # a remote URI containing the word 'origin' would also match.
        if "origin" not in run(["git", "remote", "-v"], self.local):
            run(["git", "remote", "add", "origin", self.uri], self.local)

    def git_fetch_branch(self):
        """Fetch the selected branch from origin into the local store."""
        logger.debug("Fetch %s %s:%s" % (self.name, self.uri, self.branch))
        run([
            "git", "-c",
            "credential.helper=%s" % self.credentials_helper_path,
            "fetch", "-nk", "origin",
            "+%s:%s" % (self.branch, self.branch)
        ], self.local)

    def get_refs(self):
        """Store the (sha, refname) pairs advertised by origin in self.refs."""
        refs = run([
            "git", "-c",
            "credential.helper=%s" % self.credentials_helper_path,
            "ls-remote", "origin"
        ], self.local).splitlines()
        self.refs = []
        for r in refs:
            # ls-remote output is '<sha>\t<refname>'
            self.refs.append(r.split('\t'))

    def get_heads(self):
        # NOTE(review): under Python 3 filter() returns a lazy iterator,
        # which is_branch_fully_indexed() later indexes into — py2-era code.
        self.heads = filter(lambda x: x[1].startswith('refs/heads/'),
                            self.refs)

    def get_tags(self):
        # Same py2 filter() caveat as get_heads()
        self.tags = filter(lambda x: x[1].startswith('refs/tags/'),
                           self.refs)

    def git_get_commit_obj(self):
        """Read the full list of commit shas from the local clone."""
        self.commits = get_all_shas(self.local)

    def run_workers(self, shas, workers):
        """Extract and index commits in parallel, in chunks of 1000 shas.

        :param shas: list of commit shas to process (consumed in place)
        :param workers: pool size; 0 means auto (cpu_count - 1, at least 1)
        """
        BULK_CHUNK = 1000
        to_process = []
        if workers == 0:
            # Default value (auto)
            workers = mp.cpu_count() - 1 or 1
        while True:
            try:
                # IndexError here means fewer than BULK_CHUNK shas remain
                shas[BULK_CHUNK]
                to_process.append(shas[:BULK_CHUNK])
                del shas[:BULK_CHUNK]
            except IndexError:
                # Add the rest
                to_process.append(shas)
                break
        options = [(self.local, self.ref_id, stp) for stp in to_process]
        worker_pool = mp.Pool(workers)
        worker_pool.map(process_commits, options)
        worker_pool.terminate()
        worker_pool.join()

    def is_branch_fully_indexed(self):
        """Return True if the branch tip commit already references ref_id."""
        branch = [
            head for head in self.heads
            if head[1].endswith(self.branch)
        ][0]
        branch_tip_sha = branch[0]
        cmt = self.c.get_commit(branch_tip_sha, silent=True)
        if cmt and self.ref_id in cmt['repos']:
            return True
        return False

    def get_current_commit_indexed(self):
        """ Fetch from the index commits mentioned for
        this repo and branch.
        """
        self.already_indexed = [
            c['_id'] for c in self.c.get_commits(repos=[self.ref_id],
                                                 scan=True)
        ]
        logger.debug(
            "%s: In the DB - repo history is composed of %s commits." %
            (self.name, len(self.already_indexed)))

    def compute_to_index_to_delete(self):
        """ Compute the list of commits (sha) to index and the
        list to delete from the index.
        """
        logger.debug("%s: Upstream - repo history is composed of %s commits."
                     % (self.name, len(self.commits)))
        self.to_delete = set(self.already_indexed) - set(self.commits)
        self.to_index = set(self.commits) - set(self.already_indexed)
        logger.debug("%s: Indexer will reference %s commits."
                     % (self.name, len(self.to_index)))
        logger.debug("%s: Indexer will dereference %s commits."
                     % (self.name, len(self.to_delete)))

    def compute_to_create_to_update(self):
        """Split self.to_index into (to_create, to_update) via a backend
        multi-get: commits already found only need their repos field updated.
        """
        if self.to_index:
            res = self.c.get_commits_by_id(list(self.to_index))
            to_update = [
                c['_source'] for c in res['docs'] if c['found'] is True
            ]
            to_create = [c['_id'] for c in res['docs']
                         if c['found'] is False]
            return to_create, to_update
        return [], []

    def index_tags(self):
        """Synchronize upstream tags with the tags index for this repo."""
        def c_tid(t):
            # Composite identity of a tag document: sha + short name + repo
            return "%s%s%s" % (t['sha'],
                               t['name'].replace('refs/tags/', ''),
                               t['repo'])
        if not self.tags:
            logger.debug('%s: no tags detected for this repository' %
                         (self.name))
            return
        logger.debug('%s: %s tags exist upstream' %
                     (self.name, len(self.tags)))
        tags = self.t.get_tags([self.base_id])
        existing = dict([(c_tid(t['_source']), t['_id']) for t in tags])
        logger.debug('%s: %s tags already referenced' %
                     (self.name, len(existing)))
        # Some commits may be not found because it is possible the branches
        # has not been indexed.
        commits = [
            c['_source'] for c in
            self.c.get_commits_by_id([t[0] for t in self.tags])['docs']
            if c['found']
        ]
        # sha -> committer_date, used to stamp the tag documents
        lookup = dict([(c['sha'], c['committer_date']) for c in commits])
        # Tags referenced in the index but no longer present upstream
        to_delete = [
            v for k, v in existing.items()
            if k not in [
                "%s%s%s" % (sha,
                            name.replace('refs/tags/', '').replace('^{}', ''),
                            self.base_id)
                for sha, name in self.tags
            ]
        ]
        docs = []
        for sha, name in self.tags:
            if sha in lookup:
                doc = {}
                # '^{}' marks a peeled annotated tag in ls-remote output
                doc['name'] = name.replace('refs/tags/', '').replace('^{}', '')
                doc['sha'] = sha
                doc['date'] = lookup[sha]
                doc['repo'] = self.base_id
                if c_tid(doc) in existing:
                    continue
                docs.append(doc)
        if docs:
            logger.info('%s: %s tags will be indexed' %
                        (self.name, len(docs)))
            self.t.add_tags(docs)
        if to_delete:
            logger.info('%s: %s tags will be deleted' %
                        (self.name, len(to_delete)))
            self.t.del_tags(to_delete)

    def index(self, extract_workers=1):
        """Apply the computed commit delta to the backend.

        :param extract_workers: worker pool size for commit extraction
            (0 means auto — see run_workers)
        """
        # Compile the parsers (only once per indexer instance)
        if self.parsers:
            if not self.parsers_compiled:
                raw_parsers = copy.deepcopy(self.parsers)
                self.parsers = []
                for parser in raw_parsers:
                    self.parsers.append(re.compile(parser))
                logger.debug("%s: Prepared %s regex parsers for commit msgs"
                             % (self.name, len(self.parsers)))
                self.parsers_compiled = True
        # check whether a commit should be completly deleted or
        # updated by removing the repo from the repos field
        if self.to_delete:
            delete_commits(self.c, self.name, self.to_delete, self.ref_id)
        # check whether a commit should be created or
        # updated by adding the repo into the repos field
        if self.to_index:
            to_create, to_update = self.compute_to_create_to_update()
            if to_create:
                logger.info("%s: %s commits will be created ..."
                            % (self.name, len(to_create)))
                self.run_workers(to_create, extract_workers)
            if to_update:
                logger.info(
                    "%s: %s commits already indexed and need to be updated"
                    % (self.name, len(to_update)))
                for c in to_update:
                    c['repos'].append(self.ref_id)
                self.c.update_commits(to_update)
class RepoIndexer():
    """Index commits and tags of one Git repository branch into the backend.

    Python 3 revision of the indexer: uses pickle/open for the seen-refs
    cache, resolves the git credential helper from configuration or the
    installation prefix, and can attach an extra meta ref to each commit.
    """

    def __init__(self, name, uri, parsers=None, con=None, meta_ref=None):
        """Prepare the backends, the local git store and the parser list.

        :param name: short repository name
        :param uri: remote repository URI
        :param parsers: optional list of regex strings applied to commit
            messages; compiled lazily in index()
        :param con: optional pre-built index.Connector to reuse
        :param meta_ref: optional extra ref label attached to indexed commits
        """
        if not con:
            self.con = index.Connector()
        else:
            self.con = con
        self.c = Commits(self.con)
        # Tags live in a dedicated index suffix of the same index
        self.t = Tags(index.Connector(
            index=self.con.index, index_suffix='tags'))
        if not os.path.isdir(conf.git_store):
            os.makedirs(conf.git_store)
        self.name = name
        self.uri = uri
        # Branch-independent identifier of this repository
        self.base_id = '%s:%s' % (self.uri, self.name)
        self.seen_refs_path = os.path.join(conf.db_path, SEEN_REFS_CACHED)
        if meta_ref:
            self.meta_ref = 'meta_ref: %s' % meta_ref
        else:
            self.meta_ref = None
        if not parsers:
            self.parsers = []
        else:
            self.parsers = parsers
        self.parsers_compiled = False
        # Local bare clone location; '/' in the URI is not filesystem safe
        self.local = os.path.join(conf.git_store, self.name,
                                  self.uri.replace('/', '_'))
        if not os.path.isdir(self.local):
            os.makedirs(self.local)
        # Prefer an explicitly configured credential helper, but only if it
        # is an absolute path to an existing file
        self.credentials_helper_path = getattr(
            conf, 'git_credential_helper_path', None)
        if not (self.credentials_helper_path and
                self.credentials_helper_path.startswith('/') and
                os.path.isfile(self.credentials_helper_path)):
            if self.credentials_helper_path:
                logger.warning(
                    'Configured git_credential_helper %s not found' % (
                        self.credentials_helper_path))
            self.credentials_helper_path = None
        # Look at the default installation paths
        if not self.credentials_helper_path:
            self.credentials_helper_path = os.path.join(
                sys.prefix, 'bin', 'repoxplorer-git-credentials-helper')
            if not os.path.isfile(self.credentials_helper_path):
                # Last resort: search PATH
                self.credentials_helper_path = shutil.which(
                    'repoxplorer-git-credentials-helper')
            if not self.credentials_helper_path:
                logger.warning(
                    'Default repoxplorer-git-credential-helper command '
                    'not found')

    def __str__(self):
        # NOTE: valid only after set_branch() has defined self.ref_id
        return 'Git indexer of %s' % self.ref_id

    def save_seen_ref_in_cache(self):
        # Keep a cache of each ref that has been indexed.
        # It is used later to discover seen refs no longer in projects.yaml;
        # in that case a removal from the backend will be performed.
        logger.debug("Save ref %s into seen_refs file" % self.ref_id)
        if not os.path.isfile(self.seen_refs_path):
            data = set()
        else:
            try:
                data = pickle.load(open(self.seen_refs_path, 'rb'))
            except Exception:
                # Protect against corrupted file
                data = set()
        data.add(self.ref_id)
        pickle.dump(data, open(self.seen_refs_path, 'wb'))

    def set_branch(self, branch):
        """Select the branch to index and record the resulting ref id."""
        self.branch = branch
        self.ref_id = '%s:%s:%s' % (self.uri, self.name, self.branch)
        self.save_seen_ref_in_cache()

    def git_init(self):
        """Create the local bare clone and register its origin remote."""
        logger.debug("Git init for %s:%s in %s" % (
            self.uri, self.name, self.local))
        run(["git", "init", "--bare", "."], self.local)
        # Parse `remote -v` output and match only the remote NAME column
        remotes = run(["git", "remote", "-v"], self.local)
        remote_names = [line.split()[0] for line in remotes.splitlines()]
        if "origin" not in remote_names:
            run(["git", "remote", "add", "origin", self.uri], self.local)

    def git_fetch_branch(self):
        """Fetch the selected branch from origin into the local store."""
        logger.debug("Fetch %s %s:%s" % (self.name, self.uri, self.branch))
        run(["git", "-c",
             "credential.helper=%s" % self.credentials_helper_path,
             "fetch", "-nk", "origin",
             "+%s:%s" % (self.branch, self.branch)], self.local)

    def get_refs(self):
        """Store the (sha, refname) pairs advertised by origin in self.refs."""
        refs = run([
            "git", "-c",
            "credential.helper=%s" % self.credentials_helper_path,
            "ls-remote", "origin"], self.local).splitlines()
        self.refs = []
        for r in refs:
            # ls-remote output is '<sha>\t<refname>'
            self.refs.append(r.split('\t'))

    def get_heads(self):
        # Concrete list (not a lazy filter) so it can be indexed later
        self.heads = [x for x in self.refs
                      if x[1].startswith('refs/heads/')]

    def get_tags(self):
        self.tags = [x for x in self.refs
                     if x[1].startswith('refs/tags/')]

    def git_get_commit_obj(self):
        """Read the full list of commit shas from the local clone."""
        self.commits = get_all_shas(self.local)

    def run_workers(self, shas, workers):
        """Extract and index commits in parallel, in chunks of 1000 shas.

        :param shas: list of commit shas to process (consumed in place)
        :param workers: pool size; 0 means auto (cpu_count - 1, at least 1)
        """
        BULK_CHUNK = 1000
        to_process = []
        if workers == 0:
            # Default value (auto)
            workers = mp.cpu_count() - 1 or 1
        while True:
            try:
                # IndexError here means fewer than BULK_CHUNK shas remain
                shas[BULK_CHUNK]
                to_process.append(shas[:BULK_CHUNK])
                del shas[:BULK_CHUNK]
            except IndexError:
                # Add the rest
                to_process.append(shas)
                break
        # Commits are tagged with the branch ref and, optionally, meta ref
        ref_ids = [self.ref_id]
        if self.meta_ref:
            ref_ids.append(self.meta_ref)
        options = [
            (self.local, ref_ids, stp) for stp in to_process]
        worker_pool = mp.Pool(workers)
        worker_pool.map(process_commits, options)
        worker_pool.terminate()
        worker_pool.join()

    def is_branch_fully_indexed(self):
        """Return True if the most recent indexed commit is the branch tip."""
        branch = [head for head in self.heads
                  if head[1].endswith(self.branch)][0]
        branch_tip_sha = branch[0]
        _, _, cmts_list = self.c.get_commits(repos=[self.ref_id], limit=1)
        if not cmts_list:
            return False
        cmt = cmts_list[0]
        if branch_tip_sha != cmt['sha']:
            return False
        return True

    def get_current_commits_indexed(self):
        """ Fetch from the index commits mentioned for
        this repo and branch.
        """
        self.already_indexed = [c['_id'] for c in
                                self.c.get_commits(repos=[self.ref_id],
                                                   scan=True)]
        logger.debug(
            "%s: In the DB - repo history is composed of %s commits." % (
                self.name, len(self.already_indexed)))

    def compute_to_index_to_delete(self):
        """ Compute the list of commits (sha) to index and the
        list to delete from the index.
        """
        logger.debug(
            "%s: Upstream - repo history is composed of %s commits." % (
                self.name, len(self.commits)))
        self.to_delete = set(self.already_indexed) - set(self.commits)
        self.to_index = set(self.commits) - set(self.already_indexed)
        logger.debug(
            "%s: Indexer will reference %s commits." % (
                self.name, len(self.to_index)))
        logger.debug(
            "%s: Indexer will dereference %s commits." % (
                self.name, len(self.to_delete)))

    def compute_to_create_to_update(self):
        """Split self.to_index into (to_create, to_update) via a backend
        multi-get: commits already found only need their repos field updated.
        """
        if self.to_index:
            res = self.c.get_commits_by_id(list(self.to_index))
            to_update = [c['_source'] for c in res['docs']
                         if c['found'] is True]
            to_create = [c['_id'] for c in res['docs']
                         if c['found'] is False]
            return to_create, to_update
        return [], []

    def index_tags(self):
        """Synchronize upstream tags with the tags index for this repo."""
        def c_tid(t):
            # Composite identity of a tag document: sha + short name + repo
            return "%s%s%s" % (t['sha'],
                               t['name'].replace('refs/tags/', ''),
                               t['repo'])
        if not self.tags:
            logger.debug('%s: no tags detected for this repository' % (
                self.name))
            return
        logger.debug('%s: %s tags exist upstream' % (
            self.name, len(self.tags)))
        tags = self.t.get_tags([self.base_id])
        existing = dict([(c_tid(t['_source']), t['_id']) for t in tags])
        logger.debug('%s: %s tags already referenced' % (
            self.name, len(existing)))
        # Some commits may be not found because it is possible the branches
        # has not been indexed.
        commits = [c['_source'] for c in self.c.get_commits_by_id(
            [t[0] for t in self.tags])['docs'] if c['found']]
        # sha -> committer_date, used to stamp the tag documents
        lookup = dict([(c['sha'], c['committer_date']) for c in commits])
        # Tags referenced in the index but no longer present upstream
        to_delete = [v for k, v in existing.items()
                     if k not in
                     ["%s%s%s" % (sha,
                                  name.replace('refs/tags/', '').replace(
                                      '^{}', ''),
                                  self.base_id)
                      for sha, name in self.tags]]
        docs = []
        for sha, name in self.tags:
            if sha in lookup:
                doc = {}
                # '^{}' marks a peeled annotated tag in ls-remote output
                doc['name'] = name.replace('refs/tags/', '').replace('^{}', '')
                doc['sha'] = sha
                doc['date'] = lookup[sha]
                doc['repo'] = self.base_id
                if c_tid(doc) in existing:
                    continue
                docs.append(doc)
        if docs:
            logger.info('%s: %s tags will be indexed' % (
                self.name, len(docs)))
            self.t.add_tags(docs)
        if to_delete:
            logger.info('%s: %s tags will be deleted' % (
                self.name, len(to_delete)))
            self.t.del_tags(to_delete)

    def index(self, extract_workers=1):
        """Apply the computed commit delta to the backend.

        :param extract_workers: worker pool size for commit extraction
            (0 means auto — see run_workers)
        """
        # Compile the parsers (only once per indexer instance)
        if self.parsers:
            if not self.parsers_compiled:
                raw_parsers = copy.deepcopy(self.parsers)
                self.parsers = []
                for parser in raw_parsers:
                    self.parsers.append(re.compile(parser))
                logger.debug(
                    "%s: Prepared %s regex parsers for commit msgs" % (
                        self.name, len(self.parsers)))
                self.parsers_compiled = True
        # check whether a commit should be completly deleted or
        # updated by removing the repo from the repos field
        if self.to_delete:
            delete_commits(self.c, self.name, self.to_delete, self.ref_id)
        # check whether a commit should be created or
        # updated by adding the repo into the repos field
        if self.to_index:
            to_create, to_update = self.compute_to_create_to_update()
            if to_create:
                logger.info("%s: %s commits will be created ..." % (
                    self.name, len(to_create)))
                self.run_workers(to_create, extract_workers)
            if to_update:
                logger.info(
                    "%s: %s commits already indexed and need to be updated"
                    % (
                        self.name, len(to_update)))
                for c in to_update:
                    c['repos'].append(self.ref_id)
                self.c.update_commits(to_update)