Пример #1
0
    def setUp(self):
        """Prepare objectives, processors, parsed mocks and resource pairs.

        For each of HTML, XML and JSON this stores an objective dict and an
        ExtractProcessor configured with it, together with the mock data the
        processor is meant to run on. It finishes by pairing a mocked resource
        per content type with the processor that should handle it.
        """
        super(TestCase, self).setUp()

        # Positionally aligned with test_resources_data and
        # test_resources_extractions below.
        self.content_types = [
            "text/html",
            "text/xml",
            "application/json",
            "nothing/quantum",
        ]

        # HTML: objective, processor and parsed mock document.
        self.html_obj = {
            "@": "soup.find_all('a')",
            "text": "el.text",
            "link": "el['href']",
            "#page": "soup.find('title').text",
        }
        self.html_prc = ExtractProcessor(config={"objective": self.html_obj})
        self.soup = BeautifulSoup(MOCK_HTML, "html5lib")

        # XML: objective, processor and parsed mock document.
        self.xml_obj = {
            "@": "soup.find_all('result')",
            "text": "el.find('label').text",
            "link": "el.find('url').text",
            "#page": "soup.find('title').text",
        }
        self.xml_prc = ExtractProcessor(config={"objective": self.xml_obj})
        self.xml = BeautifulSoup(MOCK_XML, "lxml")

        # JSON: objective, processor and the two mock payloads.
        self.json_obj = {
            "@": "$.records",
            "#unicode": "$.unicode.0",
            "#goal": "$.dict.dict.test",
            "id": "$.id",
            "record": "$.record",
        }
        self.json_prc = ExtractProcessor(config={"objective": self.json_obj})
        self.json_records = MOCK_DATA_WITH_RECORDS
        self.json_dict = MOCK_DATA_WITH_KEYS

        # Input data and expected extraction per content type; the last
        # content type has neither.
        self.test_resources_data = [
            self.soup,
            self.xml,
            self.json_records,
            None,
        ]
        self.test_resources_extractions = [
            MOCK_SCRAPE_DATA,
            MOCK_SCRAPE_DATA,
            MOCK_JSON_DATA,
            None,
        ]

        # Pair every mocked resource with a processor; the html processor is
        # reused for the last content type.
        processors = [
            self.html_prc,
            self.xml_prc,
            self.json_prc,
            self.html_prc,
        ]
        resource_pairs = []
        for content_type, data, processor in zip(
                self.content_types, self.test_resources_data, processors):
            resource = Mock(content=(content_type, data))
            resource_pairs.append((resource, processor))
        self.test_resources = resource_pairs
Пример #2
0
    def extract_seeds(self, set_specification, latest_update):
        """Extract seeds from not yet extracted harvests of one set.

        Harvests are taken from ``self.get_queryset()`` and kept when they
        match ``set_specification``, have a ``since`` date on or after the
        date of ``latest_update``, have status 200 and ``is_extracted=False``.

        :param set_specification: value the harvests' ``set_specification``
            has to equal
        :param latest_update: object with a ``date()`` method; only that
            date is used for filtering
        :return: list of extracted seeds; each seed gets a "seed_resource"
            entry describing the harvest it came from
        """
        queryset = self.get_queryset().filter(
            set_specification=set_specification,
            since__date__gte=latest_update.date(),
            status=200,
            is_extracted=False)

        oaipmh_objective = {
            "@": EdurepDataExtraction.get_oaipmh_records,
            "external_id": EdurepDataExtraction.get_oaipmh_external_id,
            "state": EdurepDataExtraction.get_oaipmh_record_state
        }
        oaipmh_objective.update(EDUREP_EXTRACTION_OBJECTIVE)
        extract_config = create_config("extract_processor",
                                       {"objective": oaipmh_objective})
        prc = ExtractProcessor(config=extract_config)

        results = []
        for harvest in queryset:
            # All seeds of one harvest share this same dict instance.
            seed_resource = {
                "resource":
                f"{harvest._meta.app_label}.{harvest._meta.model_name}",
                "id": harvest.id,
                "success": True
            }
            try:
                for seed in prc.extract_from_resource(harvest):
                    seed["seed_resource"] = seed_resource
                    results.append(seed)
            except ValueError as exc:
                # A ValueError is treated as invalid XML: the harvest is
                # skipped, but seeds appended before the error are kept.
                # The message needs one placeholder per argument, otherwise
                # logging fails to format it and the warning is lost.
                logger.warning("Invalid XML: %s %s", exc, harvest.uri)
        return results
Пример #3
0
 def test_application_json_dict(self):
     """Check application_json output when the objective's "@" is "$.keys".

     The shared json objective is modified in place before the processor
     is built, and the extraction is run against ``self.json_dict``.
     """
     self.json_obj["@"] = "$.keys"
     config = {"objective": self.json_obj}
     extraction = ExtractProcessor(config=config).application_json(
         self.json_dict
     )
     # Compare the content first; this consumes the generator.
     self.assertEqual(list(extraction), MOCK_JSON_DATA)
     self.assertIsInstance(
         extraction,
         GeneratorType,
         "Extractors are expected to return generators."
     )
Пример #4
0
    def extract_seeds(self, latest_update):
        """Extract seeds from harvests updated since ``latest_update``.

        Harvests are taken from ``self.get_queryset()`` and kept when their
        ``since`` date is on or after the date of ``latest_update`` and
        their status is 200.

        :param latest_update: object with a ``date()`` method; only that
            date is used for filtering
        :return: list of extracted seeds; each seed gets a "seed_resource"
            entry describing the harvest it came from
        """
        harvests = self.get_queryset().filter(
            since__date__gte=latest_update.date(),
            status=200,
        )

        # Base objective; entries from the class level OBJECTIVE are merged
        # in afterwards and win on key collisions.
        objective = {
            "@": "$.items",
            "external_id": "$.uuid",
            "state": BuasMetadataExtraction.get_record_state,
        }
        objective.update(BuasMetadataExtraction.OBJECTIVE)
        processor = ExtractProcessor(
            config=create_config("extract_processor", {"objective": objective})
        )

        seeds = []
        for harvest in harvests:
            meta = harvest._meta
            # All seeds of one harvest share this same dict instance.
            origin = {
                "resource": f"{meta.app_label}.{meta.model_name}",
                "id": harvest.id,
                "success": True,
            }
            for seed in processor.extract_from_resource(harvest):
                seed["seed_resource"] = origin
                seeds.append(seed)
        return seeds
Пример #5
0
def get_edurep_query_seeds(query):
    """Extract seeds from stored Edurep searches matching ``query``.

    Searches whose ``request`` contains ``query`` and whose status is 200
    are run through an ExtractProcessor. Seeds without a URL are dropped and
    the remaining seeds are deduplicated on URL.

    :param query: value the ``request`` of an EdurepSearch has to contain
    :return: a dict values view of seeds keyed by URL; when several seeds
        share a URL, the one sorted last on "publisher_date" is kept
    """
    queryset = EdurepSearch.objects.filter(request__contains=query)

    api_objective = {
        "@": EdurepDataExtraction.get_api_records,
        "external_id": EdurepDataExtraction.get_api_external_id,
        "state": EdurepDataExtraction.get_api_record_state
    }
    api_objective.update(EDUREP_EXTRACTION_OBJECTIVE)
    extract_config = create_config("extract_processor",
                                   {"objective": api_objective})
    prc = ExtractProcessor(config=extract_config)

    results = []
    for search in queryset.filter(status=200):
        try:
            # list() materializes the full extraction before extending, so a
            # search that raises contributes no seeds at all.
            results += list(prc.extract_from_resource(search))
        except ValueError as exc:
            # A ValueError is treated as invalid XML and the search is
            # skipped. The message needs one placeholder per argument,
            # otherwise logging fails to format it and the warning is lost.
            err.warning("Invalid XML: %s %s", exc, search.uri)
    seeds = {}
    # Falsy publisher dates sort as "" so they come first and get
    # overwritten by seeds with a date that share the same URL.
    for seed in sorted(results, key=lambda rsl: rsl["publisher_date"] or ""):
        # Some records in Edurep do not have any known URL
        # As we can't possibly process those we ignore them (silently)
        # If we want to fix this it should happen on Edurep's or Sharekit's side
        # We informed Kirsten van Veelo and Martine Teirlinck about the situation.
        if not seed["url"]:
            continue
        # We adjust url's of seeds if the source files are not at the URL
        # We should improve data extraction to always get source files
        if seed["mime_type"] == "application/x-Wikiwijs-Arrangement":
            seed["package_url"] = seed["url"]
            seed["url"] += "?p=imscp"
        # And deduplicate entire seeds based on URL
        seeds[seed["url"]] = seed
    return seeds.values()
Пример #6
0
 def get_xml_processor(self, callables=False):
     """Build an ExtractProcessor with an XML objective.

     :param callables: when truthy the "@", "link" and "#page" entries of
         the objective are ExtractTextImplementation callables, otherwise
         they are expression strings; "text" is always an expression string
     :return: an ExtractProcessor configured with that objective
     """
     if callables:
         elements = ExtractTextImplementation.get_xml_elements
         link_value = ExtractTextImplementation.get_xml_link
         page_value = ExtractTextImplementation.get_page_text
     else:
         elements = "soup.find_all('result')"
         link_value = "el.find('url').text"
         page_value = "soup.find('title').text"
     config = {
         "objective": {
             "@": elements,
             "text": "el.find('label').text",
             "link": link_value,
             "#page": page_value,
         }
     }
     return ExtractProcessor(config=config)
Пример #7
0
def get_edurep_oaipmh_seeds(set_specification,
                            latest_update,
                            include_deleted=True):
    """Extract seeds from stored Edurep OAI-PMH harvests of one set.

    Harvests are kept when they match ``set_specification``, have a
    ``since`` date on or after the date of ``latest_update`` and have
    status 200. Active seeds without a URL are dropped. Seeds without a
    usable copyright or with a too low educational level get the state
    "deleted".

    :param set_specification: value the harvests' ``set_specification``
        has to equal
    :param latest_update: object with a ``date()`` method; only that date
        is used for filtering
    :param include_deleted: when falsy only seeds whose state is "active"
        (or missing) are returned
    :return: list of seeds
    """
    queryset = EdurepOAIPMH.objects\
        .filter(set_specification=set_specification, since__date__gte=latest_update.date(), status=200)

    oaipmh_objective = {
        "@": EdurepDataExtraction.get_oaipmh_records,
        "external_id": EdurepDataExtraction.get_oaipmh_external_id,
        "state": EdurepDataExtraction.get_oaipmh_record_state
    }
    oaipmh_objective.update(EDUREP_EXTRACTION_OBJECTIVE)
    extract_config = create_config("extract_processor",
                                   {"objective": oaipmh_objective})
    prc = ExtractProcessor(config=extract_config)

    results = []
    for harvest in queryset:
        try:
            # list() materializes the full extraction before extending, so a
            # harvest that raises contributes no seeds at all.
            results += list(prc.extract_from_resource(harvest))
        except ValueError as exc:
            # A ValueError is treated as invalid XML and the harvest is
            # skipped. The message needs one placeholder per argument,
            # otherwise logging fails to format it and the warning is lost.
            err.warning("Invalid XML: %s %s", exc, harvest.uri)
    seeds = []
    for seed in results:
        # Some records in Edurep do not have any known URL
        # As we can't possibly process those we ignore them (silently)
        # If we want to fix this it should happen on Edurep's or Sharekit's side
        # We informed Kirsten van Veelo and Martine Teirlinck about the situation.
        if seed["state"] == "active" and not seed["url"]:
            continue
        # We adjust url's of seeds if the source files are not at the URL
        # We should improve data extraction to always get source files
        if seed["mime_type"] == "application/x-Wikiwijs-Arrangement" and seed.get(
                "url", None):
            seed["package_url"] = seed["url"]
            seed["url"] += "?p=imscp"
        # NOTE: no deduplication happens here; every remaining seed is kept,
        # including seeds that share an external_id.
        seeds.append(seed)
    # Now we'll mark any invalid seeds as deleted to make sure they disappear
    # Invalid seeds lack a usable copyright (falsy or "no")
    # or are of insufficient education level
    for seed in seeds:
        if not seed["copyright"] or seed["copyright"] == "no":
            seed["state"] = "deleted"
        if seed["lowest_educational_level"] < 1:  # lower level than MBO
            seed["state"] = "deleted"
    # And we return the seeds based on whether to include deleted or not
    return seeds if include_deleted else \
        [result for result in seeds if result.get("state", "active") == "active"]
Пример #8
0
 def get_json_processor(self,
                        callables=False,
                        object_values=False,
                        from_dict=False):
     """Build an ExtractProcessor with a JSON objective.

     :param callables: when truthy the "@", "#unicode" and "id" entries of
         the objective are ExtractJSONImplementation callables, otherwise
         they are path strings
     :param object_values: selects the "$.keys" variant of "@" (unless
         from_dict is set) and is passed on to the processor config as
         "extract_from_object_values"
     :param from_dict: selects the "$.records.0" variant of "@"; takes
         precedence over object_values for the "@" entry
     :return: an ExtractProcessor configured with that objective
     """
     # Pick the (path, callable) pair for "@"; from_dict wins over
     # object_values and plain records are the fallback.
     if from_dict:
         at_path, at_callable = "$.records.0", ExtractJSONImplementation.get_dict
     elif object_values:
         at_path, at_callable = "$.keys", ExtractJSONImplementation.get_keys_nodes
     else:
         at_path, at_callable = "$.records", ExtractJSONImplementation.get_nodes

     if callables:
         at = at_callable
         unicode_value = ExtractJSONImplementation.get_json_unicode
         id_value = ExtractJSONImplementation.get_json_id
     else:
         at = at_path
         unicode_value = "$.unicode.0"
         id_value = "$.id"

     config = {
         "objective": {
             "@": at,
             "#unicode": unicode_value,
             "#goal": "$.dict.dict.test",
             "id": id_value,
             "record": "$.record",
         },
         "extract_from_object_values": object_values,
     }
     return ExtractProcessor(config=config)