Exemplo n.º 1
0
def get_romania_curata():
    """Attach scraped "Romania Curata" data to matching ``Person`` rows.

    Loads the scraper output and a table of manual name exceptions from
    JSON, then for every scraped ``(name, fortune)`` pair:

    1. If the normalized scraped name has an entry in the exceptions table,
       the person named there is updated directly and fuzzy matching is
       skipped for that entry.
    2. Otherwise every word order of the scraped name is compared against
       the person names that have not been matched yet, and the best
       candidate is accepted when its similarity score is above 93 (on a
       0-100 scale).

    A matched person gets ``romania_curata`` set to the fortune items joined
    with newlines. All changes are committed once, at the end.
    """
    from os import path
    from difflib import SequenceMatcher as sm
    from itertools import permutations
    import json
    from mptracker.nlp import normalize

    sql_names = [person.name for person in models.Person.query.all()]
    # Normalize every database name once instead of once per scraped name.
    # NOTE(review): assumes normalize() is deterministic -- confirm.
    normalized_sql = {name: normalize(name) for name in sql_names}

    with open(path.relpath("mptracker/scraper/scraper_curata_out.json"),
              'r', encoding='utf-8') as f:
        scraper_result = json.load(f)

    with open(path.relpath(
            'mptracker/scraper/romania_curata_exceptions.json'),
            'r', encoding='utf-8') as f:
        person_exceptions = json.load(f)

    def matching_score(first_name, second_name):
        """Return the difflib similarity of two strings scaled to 0-100."""
        return sm(None, first_name, second_name).ratio() * 100

    def add_person(name, fortune):
        """Store ``fortune`` on the person called ``name``.

        Returns True when such a person exists and was updated, False
        otherwise.
        """
        person = (
            models.Person.query
                .filter_by(name=name)
                .first()
        )
        if person is None:
            return False
        person.romania_curata = "\n".join(fortune)
        print("Found a match for ", name.encode('utf-8'))
        # The name may already have been consumed by an earlier match
        # (e.g. two scraped entries resolving to the same person); an
        # unconditional remove() would raise ValueError and abort the run.
        if name in sql_names:
            sql_names.remove(name)
        return True

    for name, fortune in scraper_result:
        name_scraper = normalize(name)

        # A manual exception is authoritative: once it has been applied, do
        # not let the fuzzy matcher hand the same fortune to someone else.
        if name_scraper in person_exceptions:
            if add_person(person_exceptions[name_scraper], fortune):
                continue

        # Word-order variants depend only on the scraped name, so build
        # them once rather than once per database name.
        candidates = [
            " ".join(perm)
            for perm in permutations(name_scraper.split(" "))
        ]

        best_score, best_name = 0, None
        for temporary_sqlname in sql_names:
            name_sql = normalized_sql[temporary_sqlname]
            for candidate in candidates:
                current_matching = matching_score(candidate, name_sql)
                # Strict comparison keeps the first candidate on ties.
                if best_score < current_matching:
                    best_score = current_matching
                    best_name = temporary_sqlname

        if best_score > 93:
            add_person(best_name, fortune)

    models.db.session.commit()
Exemplo n.º 2
0
def get_romania_curata():
    """Attach scraped "Romania Curata" data to matching ``Person`` rows.

    Loads the scraper output and a table of manual name exceptions from
    JSON, then for every scraped ``(name, fortune)`` pair:

    1. If the normalized scraped name has an entry in the exceptions table,
       the person named there is updated directly and fuzzy matching is
       skipped for that entry.
    2. Otherwise every word order of the scraped name is compared against
       the person names that have not been matched yet, and the best
       candidate is accepted when its similarity score is above 93 (on a
       0-100 scale).

    A matched person gets ``romania_curata`` set to the fortune items joined
    with newlines. All changes are committed once, at the end.
    """
    from os import path
    from difflib import SequenceMatcher as sm
    from itertools import permutations
    import json
    from mptracker.nlp import normalize

    sql_names = [person.name for person in models.Person.query.all()]
    # Normalize every database name once instead of once per scraped name.
    # NOTE(review): assumes normalize() is deterministic -- confirm.
    normalized_sql = {name: normalize(name) for name in sql_names}

    with open(path.relpath("mptracker/scraper/scraper_curata_out.json"),
              'r',
              encoding='utf-8') as f:
        scraper_result = json.load(f)

    with open(path.relpath('mptracker/scraper/romania_curata_exceptions.json'),
              'r',
              encoding='utf-8') as f:
        person_exceptions = json.load(f)

    def matching_score(first_name, second_name):
        """Return the difflib similarity of two strings scaled to 0-100."""
        return sm(None, first_name, second_name).ratio() * 100

    def add_person(name, fortune):
        """Store ``fortune`` on the person called ``name``.

        Returns True when such a person exists and was updated, False
        otherwise.
        """
        person = models.Person.query.filter_by(name=name).first()
        if person is None:
            return False
        person.romania_curata = "\n".join(fortune)
        print("Found a match for ", name.encode('utf-8'))
        # The name may already have been consumed by an earlier match
        # (e.g. two scraped entries resolving to the same person); an
        # unconditional remove() would raise ValueError and abort the run.
        if name in sql_names:
            sql_names.remove(name)
        return True

    for name, fortune in scraper_result:
        name_scraper = normalize(name)

        # A manual exception is authoritative: once it has been applied, do
        # not let the fuzzy matcher hand the same fortune to someone else.
        if name_scraper in person_exceptions:
            if add_person(person_exceptions[name_scraper], fortune):
                continue

        # Word-order variants depend only on the scraped name, so build
        # them once rather than once per database name.
        candidates = [
            " ".join(perm)
            for perm in permutations(name_scraper.split(" "))
        ]

        best_score, best_name = 0, None
        for temporary_sqlname in sql_names:
            name_sql = normalized_sql[temporary_sqlname]
            for candidate in candidates:
                current_matching = matching_score(candidate, name_sql)
                # Strict comparison keeps the first candidate on ties.
                if best_score < current_matching:
                    best_score = current_matching
                    best_name = temporary_sqlname

        if best_score > 93:
            add_person(best_name, fortune)

    models.db.session.commit()
Exemplo n.º 3
0
    def test_library_import(self):
        """
        Import a known good library archive into an existing library and
        check that the archive's contents fully replace that library's old
        blocks, while a second library keeps its own blocks.
        """
        def make_vertical(parent):
            # Unpublished "vertical" block owned by the test user.
            return ItemFactory.create(
                category="vertical",
                parent_location=parent.location,
                user_id=self.user.id,
                publish_item=False,
            )

        def child_url_names(key):
            # Re-read the library from the store to see its current children.
            refreshed = self.store.get_library(key)
            return [
                self.store.get_item(child).url_name
                for child in refreshed.children
            ]

        # Library whose blocks are expected to be overwritten by the import.
        library = LibraryFactory.create(modulestore=self.store)
        lib_key = library.location.library_key
        test_block = make_vertical(library)
        test_block2 = make_vertical(library)

        # Library and blocks that the import must leave alone.
        unchanged_lib = LibraryFactory.create()
        unchanged_key = unchanged_lib.location.library_key
        test_block3 = make_vertical(unchanged_lib)
        test_block4 = make_vertical(unchanged_lib)

        # Starting state: each library holds exactly its two blocks.
        children = child_url_names(lib_key)
        self.assertEqual(len(children), 2)
        self.assertIn(test_block.url_name, children)
        self.assertIn(test_block2.url_name, children)

        children = child_url_names(unchanged_key)
        self.assertEqual(len(children), 2)
        self.assertIn(test_block3.url_name, children)
        self.assertIn(test_block4.url_name, children)

        temp_root = tempfile.mkdtemp(dir=settings.DATA_DIR)
        extract_dir = path(temp_root)
        # the extract_dir needs to be passed as a relative dir to
        # import_library_from_xml
        extract_dir_relative = path.relpath(extract_dir, settings.DATA_DIR)

        try:
            archive = path(TEST_DATA_DIR) / 'imports' / 'library.HhJfPD.tar.gz'
            with tarfile.open(archive) as tar:
                safetar_extractall(tar, extract_dir)
            source_dirs = [extract_dir_relative / 'library']
            library_items = import_library_from_xml(
                self.store,
                self.user.id,
                settings.GITHUB_REPO_ROOT,
                source_dirs,
                load_error_modules=False,
                static_content_store=contentstore(),
                target_id=lib_key,
            )
        finally:
            # Always clean up the extraction directory.
            shutil.rmtree(extract_dir)

        # The target library now holds the imported blocks only.
        self.assertEqual(lib_key, library_items[0].location.library_key)
        children = child_url_names(lib_key)
        self.assertEqual(len(children), 3)
        self.assertNotIn(test_block.url_name, children)
        self.assertNotIn(test_block2.url_name, children)

        # The other library still holds its original two blocks.
        children = child_url_names(unchanged_key)
        self.assertEqual(len(children), 2)
        self.assertIn(test_block3.url_name, children)
        self.assertIn(test_block4.url_name, children)
Exemplo n.º 4
0
    def test_library_import(self):
        """
        Import a known good library archive and verify that its contents
        have completely replaced the target library's previous contents,
        without affecting the blocks of a second library.
        """
        # Keyword arguments shared by every block created below.
        block_kwargs = {
            "category": "vertical",
            "user_id": self.user.id,
            "publish_item": False,
        }

        def url_names_in(library_key):
            # Fetch the library afresh and list the url_name of each child.
            current = self.store.get_library(library_key)
            names = []
            for child in current.children:
                names.append(self.store.get_item(child).url_name)
            return names

        # Create some blocks to overwrite
        library = LibraryFactory.create(modulestore=self.store)
        lib_key = library.location.library_key
        test_block = ItemFactory.create(
            parent_location=library.location, **block_kwargs)
        test_block2 = ItemFactory.create(
            parent_location=library.location, **block_kwargs)

        # Create a library and blocks that should remain untouched.
        unchanged_lib = LibraryFactory.create()
        unchanged_key = unchanged_lib.location.library_key
        test_block3 = ItemFactory.create(
            parent_location=unchanged_lib.location, **block_kwargs)
        test_block4 = ItemFactory.create(
            parent_location=unchanged_lib.location, **block_kwargs)

        # Before the import, each library contains exactly its own blocks.
        children = url_names_in(lib_key)
        self.assertEqual(len(children), 2)
        for block in (test_block, test_block2):
            self.assertIn(block.url_name, children)

        children = url_names_in(unchanged_key)
        self.assertEqual(len(children), 2)
        for block in (test_block3, test_block4):
            self.assertIn(block.url_name, children)

        extract_dir = path(tempfile.mkdtemp(dir=settings.DATA_DIR))
        # the extract_dir needs to be passed as a relative dir to
        # import_library_from_xml
        extract_dir_relative = path.relpath(extract_dir, settings.DATA_DIR)

        try:
            tarball = path(TEST_DATA_DIR) / 'imports' / 'library.HhJfPD.tar.gz'
            with tarfile.open(tarball) as tar:
                safetar_extractall(tar, extract_dir)
            import_options = {
                "load_error_modules": False,
                "static_content_store": contentstore(),
                "target_id": lib_key,
            }
            library_items = import_library_from_xml(
                self.store,
                self.user.id,
                settings.GITHUB_REPO_ROOT,
                [extract_dir_relative / 'library'],
                **import_options
            )
        finally:
            # Remove the extraction directory whether or not the import worked.
            shutil.rmtree(extract_dir)

        # After the import, the old blocks are gone from the target library.
        self.assertEqual(lib_key, library_items[0].location.library_key)
        children = url_names_in(lib_key)
        self.assertEqual(len(children), 3)
        for block in (test_block, test_block2):
            self.assertNotIn(block.url_name, children)

        # The second library is unchanged.
        children = url_names_in(unchanged_key)
        self.assertEqual(len(children), 2)
        for block in (test_block3, test_block4):
            self.assertIn(block.url_name, children)