def _process_data(repo, uncompressed_pack, progress):
    logger.info('Dirtying objects for %s' % repo)
    # First pass: record each object's type and mark it dirty.
    type_mapper = {}
    for obj in uncompressed_pack.iterobjects():
        type_mapper[obj.id] = obj._type
        dirty = _objectify(id=obj.id, type=obj._type)
        dirty.mark_dirty(True)
        dirty.add_repository(repo)
        dirty.save()
    logger.info('Constructed object type map of size %s (%d bytes) for %s' %
                (len(type_mapper), type_mapper.__sizeof__(), repo))
    models.flush()
    logger.info('Now processing objects for %s' % repo)
    # Second pass: do the real per-object processing.
    for obj in uncompressed_pack.iterobjects():
        _process_object(repo=repo, obj=obj, progress=progress,
                        type_mapper=type_mapper)
    logger.info('Cleaning objects for %s' % repo)
    # Finally, clear the dirty bit on everything we touched.
    for id, type in type_mapper.iteritems():
        dirty = _objectify(id=id, type=type)
        dirty.mark_dirty(False)
        dirty.save()
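# Usage sketch for _process_data (illustrative only): the pack is expected
# to support iterobjects(), as dulwich's Pack does. The path, URL, and
# progress callback below are hypothetical assumptions, not values taken
# from this codebase. Pack() takes the basename shared by the .pack/.idx pair.
def _example_process_pack():
    from dulwich.pack import Pack
    pack = Pack('/tmp/anygit-fetch/pack-1234abcd')  # hypothetical basename
    repo = models.Repository.get_by_attributes(url='git://example.com/x.git')
    _process_data(repo, pack, progress=lambda msg: logger.info(msg))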
def do_request(self):
    urls = request.params.get("url", "").strip()
    if not urls:
        helpers.error("You did not provide a URL.")
    for url in urls.split("\n"):
        url = models.Repository.canonicalize(url.strip())
        if not url:
            continue
        if models.Repository.exists(url=url):
            repo = models.Repository.get_by_attributes(url=url)
            if repo.approved:
                helpers.flash("Someone has already requested indexing of %s, "
                              "so no worries." % url)
            elif not url.startswith("git://"):
                helpers.flash("That repo (%s) has already been requested. At "
                              "the moment, anygit only supports git protocol "
                              "(git://) repositories. Once we've added support "
                              "for this repo's protocol, we'll index it." % url)
            elif not fetch.check_validity(repo):
                helpers.error("That's odd... someone already asked for %s, but "
                              "it looks to us like we can't talk to that repo. "
                              "Is there a typo in there? If not, please email "
                              "[email protected]." % url)
            else:
                repo.approved = True
                repo.save()
                helpers.flash("Someone had requested %s before, but it was "
                              "down then. Looks like it's back up now. We'll "
                              "get right to it." % url)
        else:
            repo = models.Repository.create(url=url)
            if not url.startswith("git://"):
                helpers.flash("Successfully requested %s for future indexing. "
                              "However, please note that only git protocol "
                              "(git://) repositories are currently supported "
                              "by anygit." % url)
            # Make sure we can talk to it
            elif not fetch.check_validity(repo):
                helpers.error("Could not talk to %s; are you sure it's a "
                              "valid URL?" % url)
            else:
                repo.approved = True
                repo.save()
                helpers.flash("Successfully requested %s for indexing." % url)
    models.flush()
    redirect_to("/")
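# For context: a plausible shape for fetch.check_validity (hypothetical
# sketch, not this project's actual implementation) is to ask the remote
# for its refs and treat any failure as "can't talk to it". `git ls-remote`
# exits nonzero when the remote is unreachable.
def _example_check_validity(repo):
    import os
    import subprocess
    with open(os.devnull, 'w') as devnull:
        return subprocess.call(['git', 'ls-remote', repo.url],
                               stdout=devnull, stderr=devnull) == 0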
def fetch_and_index(repo, recover_mode=False, packfile=None, batch=None,
                    unpack=False):
    check_for_die_file()
    if isinstance(repo, basestring):
        repo = models.Repository.get(repo)
    repo.refresh()
    # There's a race condition here where two indexing processes might try
    # to index the same repo. Since indexing is idempotent, that's not
    # harmful beyond wasting resources, but we check anyway to minimize
    # the damage.
    if repo.indexing:
        logger.error('Repo is already being indexed')
        return
    logger.info('Beginning to index: %s' % repo)
    now = datetime.datetime.now()
    data_path = None
    try:
        # Don't let other processes try to index in parallel.
        repo.indexing = True
        repo.dirty = True
        repo.save()
        models.flush()
        state = {}
        while True:
            data_path = fetch(repo, recover_mode=recover_mode,
                              packfile=packfile, batch=batch, state=state)
            index_data(data_path, repo, is_path=True, unpack=unpack)
            if not state.get('has_extra'):
                break
            logger.info('Still more remote heads, running again...')
        repo.count = repo.count_objects()
        repo.last_index = now
        repo.been_indexed = True
        repo.approved = True
        repo.dirty = False
        # Finally, clobber the old remote heads.
        repo.set_remote_heads(repo.new_remote_heads)
        repo.set_new_remote_heads([])
        repo.save()
        refresh_all_counts(all=False)
    except DeadRepo:
        logger.error('Marking %s as dead' % repo)
        repo.approved = 0
        repo.save()
    except KeyboardInterrupt:
        logger.info('^C pressed; exiting thread')
        raise
    except Exception, e:
        logger.error('Had a problem indexing %s: %s' %
                     (repo, traceback.format_exc()))
    finally:
        if not packfile and data_path:
            try:
                os.unlink(data_path)
            except IOError, e:
                logger.error('Could not remove tmpfile %s: %s' % (data_path, e))
        # Release the indexing lock whether or not we succeeded.
        repo.indexing = False
        repo.save()
        models.flush()
        logger.info('Done with %s' % repo)

def refresh_all_counts(all=None):
    aggregator = models.Aggregate.get()
    aggregator.refresh_all_counts(all=all)
    aggregator.save()
    models.flush()

def fetch_and_index_threaded(repo):
    models.setup()
    try:
        return fetch_and_index(repo)
    except DieFile:
        # TODO: do something to terminate the controller process too
        sys.exit(1)
    except:
        logger.error(traceback.format_exc())
        raise

def index_all(last_index=None, threads=1):
    repos = list(models.Repository.get_indexed_before(last_index))
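# A hypothetical sketch (not the project's actual implementation) of how
# index_all might fan the repo list out across `threads` workers, reusing
# fetch_and_index_threaded for each repo:
def _example_index_in_threads(repos, threads):
    import threading

    def worker(chunk):
        for repo in chunk:
            fetch_and_index_threaded(repo)

    # Stripe the repo list so each worker gets every threads-th repo.
    workers = [threading.Thread(target=worker, args=(repos[i::threads],))
               for i in xrange(threads)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()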
def create(url):
    canonical_url = models.Repository.canonicalize(url)
    r = models.Repository.get_or_create(url=canonical_url)
    r.approved = 'spidered'
    r.save()
    models.flush()