Example No. 1
    def harvest_seeds(self, harvest, current_time):
        send_config = create_config("http_resource", {
            "resource": harvest.source.repository,
            "continuation_limit": 10000,
        })

        set_specification = harvest.source.spec
        scc, err = send(set_specification,
                        f"{harvest.latest_update_at:%Y-%m-%dT%H:%M:%SZ}",
                        config=send_config,
                        method="get")

        if len(err):
            Resource = apps.get_model(harvest.source.repository)
            error_counter = Counter([
                error.status for error in Resource.objects.filter(id__in=err)
            ])
            raise CommandError(
                f"Failed to harvest seeds from {harvest.source.name}: {error_counter}"
            )

        harvest.harvested_at = current_time
        harvest.save()

        return len(scc), len(err)
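
A note on the error path above: the Counter simply tallies the HTTP status codes of the failed resources, so the raised CommandError message ends up looking roughly like the illustrative snippet below (status codes made up).

    from collections import Counter

    error_counter = Counter([500, 502, 500])
    print(f"Failed to harvest seeds from example source: {error_counter}")
    # Failed to harvest seeds from example source: Counter({500: 2, 502: 1})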
Example No. 2
def sync_sharekit_metadata():
    # Select which data to sync this run
    latest_active_dataset = Dataset.objects.filter(is_active=True).last()
    if not latest_active_dataset:
        return
    dataset_version = DatasetVersion.objects.get_current_version()
    harvest_queryset = Harvest.objects.filter(
        dataset=latest_active_dataset,
        source__repository=Repositories.SHAREKIT,
        stage=HarvestStages.COMPLETE  # prevents syncing materials halfway through a full harvest
    )
    # First we acquire a permanent lock on Harvests,
    # because if latest_update_at is a while ago, this command can run for a long time.
    # We don't want to keep all those syncing changes waiting in that one transaction.
    try:
        with atomic():
            harvest_queryset.filter(is_syncing=False).select_for_update(
                nowait=True).update(is_syncing=True)
    except DatabaseError:
        logger.warning(
            "Did not acquire lock on Harvester when syncing Sharekit metadata")
        return
    # Now that we're the only ones starting the sync we execute it
    for harvest in harvest_queryset.filter(is_syncing=True):
        # Check that an invalid harvest source didn't slip through the lock
        if harvest.stage != HarvestStages.COMPLETE:
            logger.warning(
                "Encountered a non-complete harvest source during sync")
            continue
        # Recording which time will become latest_update_at
        current_time = make_aware(datetime.now())
        # Get metadata from Sharekit and stop immediately if anything goes wrong
        send_config = create_config("http_resource", {
            "resource": harvest.source.repository,
            "continuation_limit": 10000,
        })
        set_specification = harvest.source.spec
        scc, err = send(set_specification,
                        f"{harvest.latest_update_at:%Y-%m-%dT%H:%M:%SZ}",
                        config=send_config,
                        method="get")
        if len(err) or not len(scc):
            continue
        # Now parse the metadata and update current Collection for this Harvest
        seeds = get_harvest_seeds(Repositories.SHAREKIT,
                                  set_specification,
                                  harvest.latest_update_at,
                                  include_no_url=True)
        collection = dataset_version.collection_set.filter(
            name=harvest.source.spec).last()
        for seeds_batch in ibatch(seeds, batch_size=32):
            collection.update(seeds_batch, "external_id")
        # Last but not least we update the harvest update time to get a different delta later
        harvest.latest_update_at = current_time
        harvest.save()
    # And we release the syncing lock
    with atomic():
        harvest_queryset.filter(is_syncing=True).select_for_update().update(
            is_syncing=False)
Example No. 3
    def transcribe_video_resources(self, video_download_ids, seeds):
        no_paths_count, invalid_paths_count, success_count, error_count = 0, 0, 0, 0
        kaldi_file_paths = defaultdict(list)
        # Preprocess the videos
        for video_download_resource in YouTubeDLResource.objects.filter(
                id__in=video_download_ids, status=0):
            # Make sure that the video has a valid audio file
            _, data = video_download_resource.content
            file_path = data.get("file_path", None)
            if not file_path:
                no_paths_count += 1
                continue
            if not os.path.exists(file_path):
                invalid_paths_count += 1
                continue
            # Try to transcribe the file based on metadata
            video_url = video_download_resource.variables()["url"]
            # TODO: determine the Kaldi model from metadata
            # It's possible to pass the URL through get_edurep_basic_resources
            # With the Tika and File resources it's much easier to determine a proper language
            seed = seeds[video_url]
            title = seed.get("title", None)
            kaldi_model = get_kaldi_model_from_snippet(title)
            kaldi_file_paths[kaldi_model].append(file_path)
        no_language_count = len(kaldi_file_paths.pop(None, []))
        # Actual transcribing
        for kaldi_model, paths in kaldi_file_paths.items():
            config = create_config("shell_resource", {"resource": kaldi_model})
            sccs, errs = run_serie(self.progress([[path] for path in paths]),
                                   [{} for _ in paths],
                                   config=config)
            success_count += len(sccs)
            error_count += len(errs)
        return no_paths_count, invalid_paths_count, no_language_count, success_count, error_count
Example No. 4
    def extract_seeds(self, set_specification, latest_update):
        queryset = self.get_queryset().filter(
            set_specification=set_specification,
            since__date__gte=latest_update.date(),
            status=200,
            is_extracted=False)

        oaipmh_objective = {
            "@": EdurepDataExtraction.get_oaipmh_records,
            "external_id": EdurepDataExtraction.get_oaipmh_external_id,
            "state": EdurepDataExtraction.get_oaipmh_record_state
        }
        oaipmh_objective.update(EDUREP_EXTRACTION_OBJECTIVE)
        extract_config = create_config("extract_processor",
                                       {"objective": oaipmh_objective})
        prc = ExtractProcessor(config=extract_config)

        results = []
        for harvest in queryset:
            seed_resource = {
                "resource":
                f"{harvest._meta.app_label}.{harvest._meta.model_name}",
                "id": harvest.id,
                "success": True
            }
            try:
                for seed in prc.extract_from_resource(harvest):
                    seed["seed_resource"] = seed_resource
                    results.append(seed)
            except ValueError as exc:
                logger.warning("Invalid XML:", exc, harvest.uri)
        return results
Example No. 5
    def download_seed_videos(self, video_seeds):
        config = create_config("http_resource",
                               {"resource": "pol_harvester.YouTubeDLResource"})
        return run_serie(  # TODO: make this parallel
            self.progress([[seed["url"]] for seed in video_seeds]),
            [{} for _ in video_seeds],
            config=config)
Example No. 6
    def process_batch(self, batch):

        config = create_config(self.resource_type, self.config.retrieve_data)
        app_label, resource_model = config.resource.split(".")
        resource_type = ContentType.objects.get_by_natural_key(app_label, resource_model)

        updates = []
        creates = []
        for process_result in batch.processresult_set.all():
            args, kwargs = process_result.document.output(config.args, config.kwargs)
            successes, fails = self.dispatch_resource(config, *args, **kwargs)
            results = successes + fails
            if not len(results):
                continue
            result_id = results.pop(0)
            process_result.result_type = resource_type
            process_result.result_id = result_id
            updates.append(process_result)
            for result_id in results:
                # TODO: create docs here where necessary
                creates.append(
                    self.ProcessResult(document=process_result.document, batch=batch,
                                       result_id=result_id, result_type=resource_type)
                )
        # Persist all results once the whole batch has been processed
        self.ProcessResult.objects.bulk_create(creates)
        self.ProcessResult.objects.bulk_update(updates, ["result_type", "result_id"])
Example No. 7
    def extract_seeds(self, latest_update):
        queryset = self.get_queryset() \
            .filter(since__date__gte=latest_update.date(), status=200)

        metadata_objective = {
            "@": "$.items",
            "external_id": "$.uuid",
            "state": BuasMetadataExtraction.get_record_state
        }
        metadata_objective.update(BuasMetadataExtraction.OBJECTIVE)
        extract_config = create_config("extract_processor", {
            "objective": metadata_objective
        })
        prc = ExtractProcessor(config=extract_config)

        results = []
        for harvest in queryset:
            seed_resource = {
                "resource": f"{harvest._meta.app_label}.{harvest._meta.model_name}",
                "id": harvest.id,
                "success": True
            }
            for seed in prc.extract_from_resource(harvest):
                seed["seed_resource"] = seed_resource
                results.append(seed)
        return results
Example No. 8
def get_edurep_query_seeds(query):
    queryset = EdurepSearch.objects.filter(request__contains=query)

    api_objective = {
        "@": EdurepDataExtraction.get_api_records,
        "external_id": EdurepDataExtraction.get_api_external_id,
        "state": EdurepDataExtraction.get_api_record_state
    }
    api_objective.update(EDUREP_EXTRACTION_OBJECTIVE)
    extract_config = create_config("extract_processor",
                                   {"objective": api_objective})
    prc = ExtractProcessor(config=extract_config)

    results = []
    for search in queryset.filter(status=200):
        try:
            results += list(prc.extract_from_resource(search))
        except ValueError as exc:
            err.warning("Invalid XML:", exc, search.uri)
    seeds = {}
    for seed in sorted(results, key=lambda rsl: rsl["publisher_date"] or ""):
        # Some records in Edurep do not have any known URL
        # As we can't possibly process those we ignore them (silently)
        # If we want to fix this it should happen on Edurep's or Sharekit's side
        # We informed Kirsten van Veelo and Martine Teirlinck about the situation.
        if not seed["url"]:
            continue
        # We adjust URLs of seeds if the source files are not at the URL
        # We should improve data extraction to always get source files
        if seed["mime_type"] == "application/x-Wikiwijs-Arrangement":
            seed["package_url"] = seed["url"]
            seed["url"] += "?p=imscp"
        # And deduplicate entire seeds based on URL
        seeds[seed["url"]] = seed
    return seeds.values()
Example No. 9
    def test_purge_after(self):
        instance = self.get_test_instance()
        instance.config = create_config("global", {
            "purge_after": {"days": 30}
        })
        instance.clean()
        self.assertIsNotNone(instance.purge_at)
        self.assertEqual(instance.purge_at.date() - date.today(), timedelta(days=30))
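
The test above only pins down behaviour: after clean(), purge_at must lie purge_after in the future. A minimal sketch of a clean() that would satisfy it, assuming purge_after is a dict of timedelta keyword arguments as in the test; this is an illustration, not the actual model code.

    from datetime import timedelta

    from django.utils.timezone import now

    class PurgeSketch:
        """Illustrative stand-in, not the real model: config is any object with a purge_after dict."""

        def __init__(self, config):
            self.config = config
            self.purge_at = None

        def clean(self):
            purge_after = getattr(self.config, "purge_after", None)
            if purge_after:
                # e.g. {"days": 30} means: purge 30 days from now
                self.purge_at = now() + timedelta(**purge_after)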
Example No. 10
    def test_create_config(self):
        test_config = create_config("name", {
            "test": "public",
            "_test2": "protected",
            "_test3": "protected 2"
        })
        self.assertIsNone(test_config._defaults)
        self.assertIsInstance(test_config, ConfigurationType)
        self.assertEqual(test_config.test, "public")
        self.assertEqual(test_config.test2, "protected")
        self.assertEqual(test_config.test3, "protected 2")
        self.assertEqual(test_config._test2, "protected")
        self.assertEqual(test_config._test3, "protected 2")
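
What these assertions document is the key-naming rule of create_config: a leading underscore marks a key as "protected", but the value stays readable both with and without the underscore (test2 and _test2 return the same thing). A tiny stand-in class that reproduces just that lookup rule; it is not the real datagrowth ConfigurationType.

    class ConfigSketch:

        def __init__(self, values):
            # Store every key without its leading underscores
            self._store = {key.lstrip("_"): value for key, value in values.items()}

        def __getattr__(self, name):
            try:
                return self._store[name.lstrip("_")]
            except KeyError:
                raise AttributeError(name)

    config = ConfigSketch({"test": "public", "_test2": "protected"})
    assert config.test == "public"
    assert config.test2 == "protected"
    assert config._test2 == "protected"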
Example No. 11
    def test_create_config_registered_defaults(self):
        register_defaults("name", {"test4": "namespaced default"})
        test_config = create_config("name", {
            "test": "public",
            "_test2": "protected",
            "_test3": "protected 2"
        })
        self.assertIsNone(test_config._defaults)
        self.assertIsInstance(test_config, ConfigurationType)
        self.assertEqual(test_config._namespace, "name")
        self.assertEqual(test_config.test4, "namespaced default")
        self.assertEqual(test_config._defaults,
                         DATAGROWTH_DEFAULT_CONFIGURATION)
Example No. 13
    def test_create_config_registered_defaults(self):
        register_defaults("name", {
            "test4": "namespaced default"
        })
        test_config = create_config("name", {
            "test": "public",
            "_test2": "protected",
            "_test3": "protected 2"
        })
        self.assertIsNone(test_config._defaults)
        self.assertIsInstance(test_config, ConfigurationType)
        self.assertEqual(test_config._namespace, "name")
        self.assertEqual(test_config.test4, "namespaced default")
        self.assertEqual(test_config._defaults, DEFAULT_CONFIGURATION)
Example No. 14
def get_edurep_oaipmh_seeds(set_specification,
                            latest_update,
                            include_deleted=True):
    queryset = EdurepOAIPMH.objects\
        .filter(set_specification=set_specification, since__date__gte=latest_update.date(), status=200)

    oaipmh_objective = {
        "@": EdurepDataExtraction.get_oaipmh_records,
        "external_id": EdurepDataExtraction.get_oaipmh_external_id,
        "state": EdurepDataExtraction.get_oaipmh_record_state
    }
    oaipmh_objective.update(EDUREP_EXTRACTION_OBJECTIVE)
    extract_config = create_config("extract_processor",
                                   {"objective": oaipmh_objective})
    prc = ExtractProcessor(config=extract_config)

    results = []
    for harvest in queryset:
        try:
            results += list(prc.extract_from_resource(harvest))
        except ValueError as exc:
            err.warning("Invalid XML:", exc, harvest.uri)
    seeds = []
    for seed in results:
        # Some records in Edurep do not have any known URL
        # As we can't possibly process those we ignore them (silently)
        # If we want to fix this it should happen on Edurep's or Sharekit's side
        # We informed Kirsten van Veelo and Martine Teirlinck about the situation.
        if seed["state"] == "active" and not seed["url"]:
            continue
        # We adjust URLs of seeds if the source files are not at the URL
        # We should improve data extraction to always get source files
        if seed["mime_type"] == "application/x-Wikiwijs-Arrangement" and seed.get(
                "url", None):
            seed["package_url"] = seed["url"]
            seed["url"] += "?p=imscp"
        # We deduplicate based on the external_id, a UID issued by Edurep
        seeds.append(seed)
    # Now we'll mark any invalid seeds as deleted to make sure they disappear
    # Invalid seeds have no (usable) copyright or are of insufficient education level
    for seed in seeds:
        if not seed["copyright"] or seed["copyright"] == "no":
            seed["state"] = "deleted"
        if seed["lowest_educational_level"] < 1:  # lower level than MBO
            seed["state"] = "deleted"
    # And we return the seeds based on whether to include deleted or not
    return seeds if include_deleted else \
        [result for result in seeds if result.get("state", "active") == "active"]
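
For reference, a hypothetical call of the function above; the set specification string and the cut-off date are made-up values, not something prescribed by the harvester.

    from datetime import datetime

    seeds = get_edurep_oaipmh_seeds("surfsharekit", datetime(2020, 2, 10), include_deleted=False)
    for seed in seeds:
        print(seed["external_id"], seed["state"], seed["url"])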
Example No. 15
    def handle(self, *args, **options):

        freeze_name = options["freeze"]
        dummy = options["dummy"]

        if not dummy:
            self.prepare_harvest(freeze_name)

        harvest_queryset = EdurepHarvest.objects.filter(
            freeze__name=freeze_name, stage=HarvestStages.NEW)
        if not harvest_queryset.exists():
            raise EdurepHarvest.DoesNotExist(
                f"There are no NEW EdurepHarvest objects for '{freeze_name}'")

        self.header("EDUREP SEEDS HARVEST", options)

        # Call the Edurep OAI-PMH interface and get the Edurep metadata about learning materials
        self.info("Fetching metadata for sources ...")
        send_config = create_config("http_resource", {
            "resource": "edurep.EdurepOAIPMH",
            "continuation_limit": 1000,
        })
        current_time = now()
        successes = defaultdict(int)
        fails = defaultdict(int)
        for harvest in self.progress(harvest_queryset,
                                     total=harvest_queryset.count()):
            set_specification = harvest.source.collection_name
            scc, err = send(set_specification,
                            f"{harvest.latest_update_at:%Y-%m-%d}",
                            config=send_config,
                            method="get")
            if len(err):
                raise CommandError(
                    "Failed to harvest seeds from Edurep OAI-PMH")
            successes[set_specification] += len(scc)
            fails[set_specification] += len(err)
            if not dummy:
                harvest.harvested_at = current_time
                harvest.save()
        self.info('Failed OAI-PMH calls: ', fails)
        self.info('Successful OAI-PMH calls: ', successes)
        success_count = sum(successes.values())
        fail_count = sum(fails.values())
        return f'OAI-PMH: {success_count}/{success_count+fail_count}'
Example No. 16
    def merge_batch(self, batch):

        pipeline_phase = self.config.pipeline_phase
        config = create_config("extract_processor", self.config.contribute_data)
        contribution_processor = config.extractor
        contribution_property = config.to_property

        while True:

            documents = []
            for process_result in batch.processresult_set.filter(result_id__isnull=False):
                result = process_result.result
                # Write results to the pipeline
                process_result.document.pipeline[pipeline_phase] = {
                    "success": result.success,
                    "resource": f"{result._meta.app_label}.{result._meta.model_name}",
                    "id": result.id
                }
                documents.append(process_result.document)
                # Write data to the Document
                extractor_name, method_name = Processor.get_processor_components(contribution_processor)
                extractor_class = Processor.get_processor_class(extractor_name)
                extractor = extractor_class(config)
                extractor_method = getattr(extractor, method_name)
                contributions = list(extractor_method(result))
                if not len(contributions):
                    continue
                contribution = contributions.pop(0)
                # TODO: create docs here where necessary
                if contribution_property is None:
                    process_result.document.properties.update(contribution)
                else:
                    process_result.document.properties[contribution_property] = contribution

            # We'll be locking the Documents for update to prevent accidental overwrite of parallel results
            with transaction.atomic():
                try:
                    list(self.Document.objects.filter(id__in=[doc.id for doc in documents]).select_for_update())
                except transaction.DatabaseError:
                    continue
                self.Document.objects.bulk_update(documents, ["pipeline", "properties"])
                break
Example No. 17
    def extract_seeds(self, latest_update):
        latest_update = latest_update.replace(microsecond=0)
        queryset = self.get_queryset().filter(since__gte=latest_update,
                                              status=200,
                                              is_extracted=False)

        extract_config = create_config("extract_processor",
                                       {"objective": self._create_objective()})
        prc = HanzeResourceObjectExtraction(config=extract_config)

        results = []
        for harvest in queryset:
            seed_resource = {
                "resource":
                f"{harvest._meta.app_label}.{harvest._meta.model_name}",
                "id": harvest.id,
                "success": True
            }
            for seed in prc.extract_from_resource(harvest):
                seed["seed_resource"] = seed_resource
                results.append(seed)
        return results
Example No. 18
def edit_document_webhook(request, channel, secret):
    # Webhook validation
    if str(secret) != settings.HARVESTER_WEBHOOK_SECRET:
        return HttpResponse(status=403,
                            reason="Webhook not allowed in this environment")
    if request.META[
            "HTTP_X_FORWARDED_FOR"] not in settings.SHAREKIT_WEBHOOK_ALLOWED_IPS:
        capture_message(
            f"edit_document_webhook called from invalid IP: {request.META['HTTP_X_FORWARDED_FOR']}",
            level="warning")
        return HttpResponse(status=403,
                            reason="Webhook not allowed from source")
    try:
        data = json.loads(request.body)
    except json.decoder.JSONDecodeError:
        return HttpResponse(status=400, reason="Invalid JSON")
    # Patches data coming from Sharekit to be consistent
    if isinstance(data["attributes"], list):
        data["attributes"] = {}
    # Processing of incoming data
    extract_config = create_config(
        "extract_processor",
        {"objective": create_objective(root="$", include_is_restricted=False)})
    prc = SharekitMetadataExtraction(config=extract_config)
    seed = next(prc.extract("application/json", data))
    seed["is_restricted"] = channel == "edusourcesprivate"
    prepare_seed(seed)
    # Commit changes to the database
    dataset_version = DatasetVersion.objects.get_current_version()
    collection = dataset_version.collection_set.filter(name=channel).last()
    collection.update([seed], "external_id")
    # Finish webhook request
    logger = HarvestLogger(dataset_version.dataset.name,
                           "edit_document_webhook", {})
    logger.report_material(seed["external_id"],
                           title=seed["title"],
                           url=seed["url"])
    return HttpResponse("ok")
Example No. 19
    def handle(self, *args, **options):

        freeze = Freeze.objects.get(name=options["freeze"])

        videos = [(
            doc.reference,
            doc.properties["url"],
        ) for doc in freeze.documents.filter(
            reference__in=HBOVPK_TEST_REFERENCES)]
        successes = []
        errors = []

        for ref, url in tqdm(videos):
            try:
                download = YouTubeDLResource().run(url)
            except DGShellError:
                print("Download does not exist")
                continue
            if not download.success:
                print("Download error")
                continue
            _, data = download.content
            file_path = data.get("file_path", None)
            if not file_path:
                print("Download missing file in output")
                continue

            config = create_config("shell_resource", {
                "resource": "pol_harvester.kaldinlresource",
                "reference": ref
            })
            if not os.path.exists(file_path):
                print("Download missing file")
                continue
            sccs, errs = run(file_path, config=config)
            successes += sccs
            errors += errs
Example No. 20
    def handle(self, *args, **options):

        language = options["language"]
        category_namespace = self.CATEGORY_NAMESPACES[language]
        categories = options["categories"]
        corpus_name = "-".join(
            sorted([
                category.replace(category_namespace, "")
                for category in categories
            ]))

        results = []
        for category in categories:
            category_name = category.replace(category_namespace, "")

            send_config = create_config(
                "http_resource", {
                    "resource": "pol_harvester.wikipediacategorymembers",
                    "wiki_country": language,
                    "continuation_limit": 100
                })
            scc, err = send(category, config=send_config, method="get")
            print(f"Send {category_name}:", scc, err)
            resources = WikipediaCategoryMembers.objects.filter(id__in=scc)

            extract_config = {
                "objective": {
                    "@": "$.query.pages",
                    "pageid": "$.pageid",
                    "title": "$.title",
                    "categories": "$.categories",
                    "wikidata": "$.pageprops.wikibase_item",
                    "wikitext": "$.revisions.0.slots.main.*"
                }
            }
            prc = ExtractProcessor(config=extract_config)
            for resource in resources:
                results += prc.extract_from_resource(resource)

        corpus, created = Corpus.objects.get_or_create(name=corpus_name,
                                                       identifier="pageid",
                                                       schema={})
        articles = []
        for result in results:
            if not result["wikitext"]:
                continue
            result["text"] = self.clean_text(
                mwparserfromhell.parse(result["wikitext"]).strip_code(),
                category_namespace)
            articles.append(
                Article(properties=result, collection=corpus, schema={}))
        corpus.add(articles, reset=True)

        vectorizer = CountVectorizer()
        vectorizer.fit_transform([
            self.clean_text(doc.properties["text"], category_namespace)
            for doc in corpus.documents.all()
        ])
        dst = os.path.join(datagrowth_settings.DATAGROWTH_DATA_DIR,
                           "custom_vocabulary", language)
        os.makedirs(dst, exist_ok=True)
        joblib.dump(vectorizer, os.path.join(dst, corpus_name + ".pkl"))
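
Downstream code would presumably reload the dumped vectorizer with joblib; a hypothetical example, where the path is a placeholder for <DATAGROWTH_DATA_DIR>/custom_vocabulary/<language>/<corpus_name>.pkl as written by the command above.

    import joblib

    # Placeholder path mirroring the dump location used in handle()
    vectorizer = joblib.load("data/custom_vocabulary/nl/some-corpus.pkl")
    counts = vectorizer.transform(["cleaned article text"])
    print(counts.shape)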