def retrieve_files_in_repo(self, repo_name):
    """Retrieve the expected files of one repository and commit the result.

    Builds a ``Repository`` from *repo_name* and returns early (after
    logging) when it is already flagged as retrieved or has no contents
    URL. Otherwise every entry yielded by
    ``self._retriever.traverse(repo.contents_url)`` whose path is accepted
    by ``Repository.expects_file`` has its content retrieved and is added
    to the repository. ``repo.find_packages()`` runs only when at least one
    file was added. Finally the repository is flagged as retrieved and its
    changes are committed, whether or not any file was added.

    Args:
        repo_name: Name passed to the ``Repository`` constructor.

    Raises:
        AssertionError: If ``self.is_running()`` returns a false value
            while entries are being traversed.
    """
    repo = Repository(repo_name)

    # Skip if repository was done already
    if repo.retrieved:
        self._logger.info(
            self._repo_fmt.format(label='Already done:',
                                  full_name=repo.name,
                                  id=repo.id,
                                  url=repo.url))
        return

    # Skip if repository has no contents URL
    if not repo.contents_url:
        self._logger.info(
            self._repo_fmt.format(label='No contents URL found:',
                                  full_name=repo.name,
                                  id=repo.id,
                                  url=repo.url))
        return

    # Do retrieving contents from GitHub
    self._logger.info(
        self._repo_fmt.format(label='Retrieving:',
                              full_name=repo.name,
                              id=repo.id,
                              url=repo.url))

    added = False
    for entry in self._retriever.traverse(repo.contents_url):
        # Raised explicitly rather than via `assert`, which is compiled
        # away under `python -O` and would leave this loop unstoppable.
        # The exception type is unchanged for callers.
        if not self.is_running():
            raise AssertionError('is_running() returned false; aborting')

        if not Repository.expects_file(entry.path):
            self._logger.info(' (-) %s' % entry.path)
            continue

        self._logger.info(' (+) %s' % entry.path)
        self._retriever.retrieve_content(entry)
        repo.add_file(entry.path, entry.decoded_content)
        added = True

    if added:
        # Find packages if files found
        self._logger.info(' --> Finding packages...')
        repo.find_packages()
    else:
        # Do nothing if no file found
        self._logger.info(' --> No expected files found.')

    # Save repository
    self._logger.info(' --> Saving repository...')
    repo.set_retrieved(True)
    repo.commit_changes()
def search_repos_in_slice(self, time_slice):
    """Search one time slice and register every repository not yet known.

    Starts a search via ``self._search.search(created=time_slice)`` and
    walks ``self._search.traverse()``. Results for which
    ``Repository.exists(repo.full_name)`` is true are only logged. For the
    others a new ``Repository`` is created, given the result's id, URL and
    contents URL, and committed; its full name is then put on
    ``self._repos`` when that attribute is not ``None``.

    Args:
        time_slice: Value forwarded as the ``created`` argument of the
            search and interpolated into the log message.

    Raises:
        AssertionError: If ``self.is_running()`` returns a false value
            while results are being traversed.
    """
    self._logger.info('Searching time slice: %s' % time_slice)
    self._search.search(created=time_slice)

    for repo in self._search.traverse():
        # Raised explicitly rather than via `assert`, which is compiled
        # away under `python -O` and would leave this loop unstoppable.
        # The exception type is unchanged for callers.
        if not self.is_running():
            raise AssertionError('is_running() returned false; aborting')

        if Repository.exists(repo.full_name):
            self._logger.info(
                self._repo_fmt.format(label='Existed', **repo.__dict__))
            continue

        self._logger.info(
            self._repo_fmt.format(label='Found', **repo.__dict__))

        # Newly create repo in database
        newrepo = Repository(repo.full_name)
        newrepo.set_id(repo.id)
        newrepo.set_url(repo.url)
        newrepo.set_contents_url(repo.contents_url)
        newrepo.commit_changes()

        # Queue repository for later retrieving
        if self._repos is not None:
            self._repos.put(repo.full_name)