def sync_sharekit_metadata():
    """
    Sync Sharekit metadata deltas into the current DatasetVersion's collections.

    Flags the relevant COMPLETE Sharekit Harvest rows as syncing (so concurrent
    runs skip them), fetches metadata changed since each harvest's
    latest_update_at, applies the parsed seeds to the matching Collection and
    finally releases the syncing flag. Returns None; aborts early (with a
    warning) when there is no active Dataset or the lock cannot be acquired.
    """
    # Select which data to sync this run
    latest_active_dataset = Dataset.objects.filter(is_active=True).last()
    if not latest_active_dataset:
        return
    dataset_version = DatasetVersion.objects.get_current_version()
    harvest_queryset = Harvest.objects.filter(
        dataset=latest_active_dataset,
        source__repository=Repositories.SHAREKIT,
        stage=HarvestStages.COMPLETE  # prevents syncing materials half way a full harvest
    )
    # First we acquire a permanent lock on Harvests,
    # because if latest_update_at is a while ago this command will run a long time.
    # We don't want to keep all those syncing changes waiting in that one transaction.
    # NOTE(review): .select_for_update(nowait=True).update(...) issues a plain UPDATE;
    # Django only applies FOR UPDATE to SELECT queries — confirm the nowait lock
    # behaves as intended here.
    try:
        with atomic():
            harvest_queryset.filter(is_syncing=False).select_for_update(
                nowait=True).update(is_syncing=True)
    except DatabaseError:
        logger.warning(
            "Did not acquire lock on Harvester when syncing Sharekit metadata")
        return
    # Now that we're the only ones starting the sync we execute it
    for harvest in harvest_queryset.filter(is_syncing=True):
        # Check that a non-valid harvest source didn't slip through the lock
        if harvest.stage != HarvestStages.COMPLETE:
            # Fixed: was logging.warning (root logger); use the module logger
            # like the rest of this function does.
            logger.warning(
                "Encountered a non-complete harvest source during sync")
            continue
        # Recording which time will become latest_update_at
        current_time = make_aware(datetime.now())
        # Getting metadata from Sharekit and stop immediately if anything went wrong
        send_config = create_config("http_resource", {
            "resource": harvest.source.repository,
            "continuation_limit": 10000,
        })
        set_specification = harvest.source.spec
        scc, err = send(set_specification,
                        f"{harvest.latest_update_at:%Y-%m-%dT%H:%M:%SZ}",
                        config=send_config,
                        method="get")
        if len(err) or not len(scc):
            continue
        # Now parse the metadata and update current Collection for this Harvest
        seeds = get_harvest_seeds(Repositories.SHAREKIT,
                                  set_specification,
                                  harvest.latest_update_at,
                                  include_no_url=True)
        # NOTE(review): assumes a Collection with this name already exists;
        # .last() returning None would raise below — confirm upstream guarantees.
        collection = dataset_version.collection_set.filter(
            name=harvest.source.spec).last()
        for seeds_batch in ibatch(seeds, batch_size=32):
            collection.update(seeds_batch, "external_id")
        # Last but not least we update the harvest update time to get a different delta later
        harvest.latest_update_at = current_time
        harvest.save()
    # And we release the syncing lock
    with atomic():
        harvest_queryset.filter(is_syncing=True).select_for_update().update(
            is_syncing=False)
def test_send(self):
    # Test makes equivalent call of HttpResourceProcessor.fetch.delay("test")
    # Each case: (query, expected success count, expected error count)
    cases = [
        ("test", 1, 0),     # plain call
        ("success", 1, 0),  # similar but with a cached result
        ("404", 0, 1),      # error response
        ("500", 0, 1),      # server error response
    ]
    for query, expected_successes, expected_errors in cases:
        scc, err = send(query=query,
                        method=self.method,
                        config=self.config,
                        session=self.session)
        self.check_results(scc, expected_successes)
        self.check_results(err, expected_errors)
def harvest_seeds(self, harvest, current_time):
    """
    Fetch seed metadata for one harvest source and stamp it as harvested.

    Raises CommandError (with a per-status error count) when any resource
    failed; otherwise records current_time on the harvest and returns a
    (success_count, error_count) tuple.
    """
    since = f"{harvest.latest_update_at:%Y-%m-%dT%H:%M:%SZ}"
    send_config = create_config("http_resource", {
        "resource": harvest.source.repository,
        "continuation_limit": 10000,
    })
    set_specification = harvest.source.spec
    scc, err = send(set_specification, since, config=send_config, method="get")
    if err:
        # Summarize the failing resources by HTTP status for the error message
        resource_model = apps.get_model(harvest.source.repository)
        error_counter = Counter(
            resource.status
            for resource in resource_model.objects.filter(id__in=err)
        )
        raise CommandError(
            f"Failed to harvest seeds from {harvest.source.name}: {error_counter}"
        )
    harvest.harvested_at = current_time
    harvest.save()
    return len(scc), len(err)
def handle(self, *args, **options):
    """
    Harvest Edurep seed metadata over OAI-PMH for every NEW harvest in a freeze.

    Raises EdurepHarvest.DoesNotExist when the freeze has no NEW harvests and
    CommandError when any OAI-PMH call fails. Returns a summary string.
    With --dummy no preparation or harvested_at bookkeeping is done.
    """
    freeze_name = options["freeze"]
    dummy = options["dummy"]
    if not dummy:
        self.prepare_harvest(freeze_name)

    harvest_queryset = EdurepHarvest.objects.filter(
        freeze__name=freeze_name,
        stage=HarvestStages.NEW)
    if not harvest_queryset.exists():
        raise EdurepHarvest.DoesNotExist(
            f"There are no NEW EdurepHarvest objects for '{freeze_name}'")

    self.header("EDUREP SEEDS HARVEST", options)

    # Calling the Edurep OAI-PMH interface and get the Edurep meta data about learning materials
    self.info("Fetching metadata for sources ...")
    send_config = create_config("http_resource", {
        "resource": "edurep.EdurepOAIPMH",
        "continuation_limit": 1000,
    })
    current_time = now()
    successes = defaultdict(int)
    fails = defaultdict(int)
    total = harvest_queryset.count()
    for harvest in self.progress(harvest_queryset, total=total):
        spec = harvest.source.collection_name
        scc, err = send(spec,
                        f"{harvest.latest_update_at:%Y-%m-%d}",
                        config=send_config,
                        method="get")
        if len(err):
            raise CommandError(
                "Failed to harvest seeds from Edurep OAI-PMH")
        successes[spec] += len(scc)
        fails[spec] += len(err)
        if not dummy:
            harvest.harvested_at = current_time
            harvest.save()

    self.info('Failed OAI-PMH calls: ', fails)
    self.info('Successful OAI-PMH calls: ', successes)
    success_count = sum(successes.values())
    fail_count = sum(fails.values())
    return f'OAI-PMH: {success_count}/{success_count+fail_count}'
def handle(self, *args, **options):
    """
    Build a Wikipedia category corpus and persist a fitted CountVectorizer.

    Fetches category member pages, extracts wikitext into Article objects under
    a Corpus named after the (namespace-stripped, sorted) categories, then fits
    a vocabulary over the cleaned texts and dumps it as a .pkl file.
    """
    language = options["language"]
    category_namespace = self.CATEGORY_NAMESPACES[language]
    categories = options["categories"]
    stripped = sorted(
        category.replace(category_namespace, "") for category in categories)
    corpus_name = "-".join(stripped)

    # Fetch and extract page data for every requested category
    results = []
    for category in categories:
        category_name = category.replace(category_namespace, "")
        send_config = create_config("http_resource", {
            "resource": "pol_harvester.wikipediacategorymembers",
            "wiki_country": language,
            "continuation_limit": 100
        })
        scc, err = send(category, config=send_config, method="get")
        print(f"Send {category_name}:", scc, err)
        resources = WikipediaCategoryMembers.objects.filter(id__in=scc)
        extract_config = {
            "objective": {
                "@": "$.query.pages",
                "pageid": "$.pageid",
                "title": "$.title",
                "categories": "$.categories",
                "wikidata": "$.pageprops.wikibase_item",
                "wikitext": "$.revisions.0.slots.main.*"
            }
        }
        extractor = ExtractProcessor(config=extract_config)
        for resource in resources:
            results += extractor.extract_from_resource(resource)

    # Store cleaned articles in the corpus (pages without wikitext are skipped)
    corpus, created = Corpus.objects.get_or_create(
        name=corpus_name, identifier="pageid", schema={})
    articles = []
    for result in results:
        if not result["wikitext"]:
            continue
        plain = mwparserfromhell.parse(result["wikitext"]).strip_code()
        result["text"] = self.clean_text(plain, category_namespace)
        articles.append(
            Article(properties=result, collection=corpus, schema={}))
    corpus.add(articles, reset=True)

    # Fit the vocabulary and dump it under the datagrowth data directory
    vectorizer = CountVectorizer()
    documents = [
        self.clean_text(doc.properties["text"], category_namespace)
        for doc in corpus.documents.all()
    ]
    vectorizer.fit_transform(documents)
    dst = os.path.join(datagrowth_settings.DATAGROWTH_DATA_DIR,
                       "custom_vocabulary", language)
    os.makedirs(dst, exist_ok=True)
    joblib.dump(vectorizer, os.path.join(dst, corpus_name + ".pkl"))
def dispatch_resource(self, config, *args, **kwargs):
    # Thin wrapper: forward everything to send, injecting the config
    # and the HTTP method the config specifies.
    method = config.method
    return send(*args, config=config, method=method, **kwargs)
def test_send_inserted_session_provider(self, get_resource_link_mock):
    # A string session should be resolved through the provider mechanism
    send("test", method=self.method, config=self.config,
         session="ProcessorMock")
    (config, session), _ = get_resource_link_mock.call_args
    self.assertTrue(session.from_provider)
def test_send_inserted_session(self):
    # A custom session class should be used and leave its user agent
    # on the stored resource's request headers.
    successes, errors = send(query="test",
                             method=self.method,
                             config=self.config,
                             session=MockRequestsWithAgent)
    self.check_results(successes, 1)
    self.check_results(errors, 0)
    resource = HttpResourceMock.objects.get(id=successes[0])
    self.assertIn("user-agent", resource.head)
def test_send_continuation(self):
    # With continuation allowed, a "next" query should yield two resources
    self.config.continuation_limit = 10
    successes, errors = send(query="next",
                             method=self.method,
                             config=self.config,
                             session=self.session)
    self.check_results(successes, 2)
    self.check_results(errors, 0)
def test_send_continuation_prohibited(self):
    # Without a continuation_limit only the first resource is fetched
    successes, errors = send(query="next",
                             method=self.method,
                             config=self.config,
                             session=self.session)
    self.check_results(successes, 1)
    self.check_results(errors, 0)