Python name_similarity示例，utils.osg.name_similarity Python示例

示例#1

0

显示文件

    def check_for_wikipedia_links(self):
        """
        Search Wikipedia for inspirations that do not yet have a Wikipedia link in their Media field.

        Inspirations that carry an 'Included' key, or whose 'Media' list already contains an
        English Wikipedia URL, are skipped. For every other inspiration the search results are
        filtered step by step and, if a candidate page remains, the URL of the page whose title
        is most similar to the inspiration name is appended to 'Media' and printed.

        :return: None; results are written into self.inspirations and printed.
        """
        if not self.inspirations:
            print('inspirations not yet loaded')
            return
        for inspiration in self.inspirations.values():
            if 'Included' in inspiration:
                continue
            if any('https://en.wikipedia.org/wiki/' in x
                   for x in inspiration.get('Media', [])):
                continue
            name = inspiration['Name']
            # all similarity comparisons are done case-insensitively
            folded_name = str.casefold(name)

            # search in wikipedia
            results = osg_wikipedia.search(name)

            # throw out all disambiguation pages and everything mentioning 'film'
            results = [
                r for r in results
                if not any(x in r for x in ('disambiguation', 'film'))
            ]

            # throw out those too dissimilar
            results = [
                r for r in results
                if osg.name_similarity(folded_name, str.casefold(r)) > 0.6
            ]

            # get pages for the remaining
            pages = osg_wikipedia.pages(results)

            # throw out those that are no video games
            pages = [
                page for page in pages if any('video games' in category
                                              for category in page.categories)
            ]
            if not pages:
                continue

            # keep only the page whose title is MOST similar to the name
            # (an ascending sort followed by taking the first element would
            # select the least similar page instead)
            best_page = max(
                pages,
                key=lambda page: osg.name_similarity(
                    folded_name, str.casefold(page.title)))

            url = best_page.url
            inspiration['Media'] = inspiration.get('Media', []) + [url]
            print('{} : {}'.format(name, url))
        print('finished checking for Wikipedia links')

示例#2

0

显示文件

文件： maintenance_developers.py 项目： mdtrooper/opensourcegames

 def check_for_duplicates(self):
     """
     Print every pair of developer names that osg.name_similarity rates above 0.8.
     """
     if not self.developers:
         print('developers not yet loaded')
         return
     remaining = list(self.developers.keys())
     # take names off the front one at a time and compare each only with
     # the names still behind it, so every pair is visited exactly once
     while remaining:
         current = remaining.pop(0)
         for candidate in remaining:
             score = osg.name_similarity(current, candidate)
             if score > 0.8:
                 print(' {} - {} is similar'.format(current, candidate))
     print('duplicates checked')

示例#3

0

显示文件

 def check_for_duplicates(self):
     """
     Print every pair of developer names whose case-folded forms are rated
     above 0.85 by osg.name_similarity, then print the CPU time used.
     """
     if not self.developers:
         print('developers not yet loaded')
         return
     started = time.process_time()
     names = list(self.developers.keys())
     for position in range(len(names)):
         current = names[position]
         # only look at names after the current one; earlier pairs were already compared
         for candidate in names[position + 1:]:
             score = osg.name_similarity(str.casefold(current),
                                         str.casefold(candidate))
             if score > 0.85:
                 print(' {} - {} is similar'.format(current, candidate))
     elapsed = time.process_time() - started
     print('duplicates checked (took {:.1f}s)'.format(elapsed))

示例#4

0

显示文件

文件： maintenance_inspirations.py 项目： mdtrooper/opensourcegames

 def check_for_duplicates(self):
     """
     Print every pair of inspiration names that osg.name_similarity rates above 0.8.

     Pairs in which both names start with the same entry of valid_duplicates are skipped.
     """
     if not self.inspirations:
         print('inspirations not yet loaded')
         return
     pending = list(self.inspirations.keys())
     # consume names from the front; each is compared only with the names after it
     while pending:
         current = pending.pop(0)
         for candidate in pending:
             shares_allowed_prefix = False
             for prefix in valid_duplicates:
                 if current.startswith(prefix) and candidate.startswith(prefix):
                     shares_allowed_prefix = True
                     break
             if shares_allowed_prefix:
                 continue
             if osg.name_similarity(current, candidate) > 0.8:
                 print(' {} - {} is similar'.format(current, candidate))
     print('duplicates checked')

示例#5

0

显示文件

文件： maintenance_inspirations.py 项目： minthantsin/opensourcegames

 def check_for_duplicates(self):
     """
     Print every pair of inspiration names whose case-folded forms are rated
     above 0.9 by osg.name_similarity, then print the CPU time used.

     Pairs in which both names start with the same entry of valid_duplicates are skipped.
     """
     if not self.inspirations:
         print('inspirations not yet loaded')
         return
     started = time.process_time()
     names = list(self.inspirations.keys())
     for position in range(len(names)):
         current = names[position]
         # names before this position have already been paired with the current one
         for candidate in names[position + 1:]:
             allowed = any(
                 current.startswith(prefix) and candidate.startswith(prefix)
                 for prefix in valid_duplicates)
             if not allowed and osg.name_similarity(
                     str.casefold(current), str.casefold(candidate)) > 0.9:
                 print(' {} - {} is similar'.format(current, candidate))
     elapsed = time.process_time() - started
     print('duplicates checked took {:.1f}s'.format(elapsed))

示例#6

0

显示文件

    def check_inconsistencies(self):
        """
        Print a report of likely inconsistencies among the loaded entries.

        The checks, in order:
        - entries using the keyword "first-person" written with U+2010 HYPHEN
        - pairs of keywords that osg.name_similarity rates above 0.8
        - referenced code dependencies that are neither a known dependency
          without entry nor a framework/library/game engine entry (or alias)
        - entries with a "Play" field but without "Web" as platform
        - entries in a web language (JavaScript/TypeScript/PHP/CoffeeScript)
          without "Web" as platform
        - entries with "space" in the title but not as keyword
        - entries whose title starts with "j" followed by a character that
          equals its own upper-case form, but without Java as code language
        - entries that list the same keyword more than once

        :return: None; all findings are printed.
        """
        # local import so the block does not depend on the file's import header
        from collections import Counter

        if not self.entries:
            print('entries not yet loaded')
            return
        # get all keywords and print similar keywords
        # 'first\u2010person' uses U+2010 HYPHEN rather than the ASCII '-'
        unicode_hyphen_keyword = b'first\xe2\x80\x90person'.decode()
        keywords = []
        for entry in self.entries:
            keywords.extend(entry['Keyword'])
            if unicode_hyphen_keyword in entry['Keyword']:
                print(entry['File'])
        keywords = [x.value for x in keywords]

        # reduce those starting with "multiplayer"
        keywords = [
            x if not x.startswith('multiplayer') else 'multiplayer'
            for x in keywords
        ]

        # check unique keywords (count once instead of list.count per keyword)
        keyword_counts = Counter(keywords)
        unique_keywords = list(set(keywords))
        for index, name in enumerate(unique_keywords):
            for other_name in unique_keywords[index + 1:]:
                if osg.name_similarity(name, other_name) > 0.8:
                    print(' Keywords {} ({}) - {} ({}) are similar'.format(
                        name, keyword_counts[name], other_name,
                        keyword_counts[other_name]))

        # get all names of frameworks and library also using osg.code_dependencies_aliases
        valid_dependencies = list(
            c.general_code_dependencies_without_entry.keys())
        for entry in self.entries:
            if any((x in ('framework', 'library', 'game engine')
                    for x in entry['Keyword'])):
                name = entry['Title']
                if name in c.code_dependencies_aliases:
                    valid_dependencies.extend(
                        c.code_dependencies_aliases[name])
                else:
                    valid_dependencies.append(name)

        # count all referenced code dependencies
        referenced_dependencies = Counter(
            dependency.value
            for entry in self.entries
            for dependency in entry.get('Code dependency', []))

        # delete those that are valid dependencies
        referenced_dependencies = [(k, v)
                                   for k, v in referenced_dependencies.items()
                                   if k not in valid_dependencies]

        # sort by number
        referenced_dependencies.sort(key=lambda x: x[1], reverse=True)

        # print out
        print('Code dependencies not included as entry')
        for dep in referenced_dependencies:
            print('{} ({})'.format(*dep))

        # if there is the "Play" field, it should have "Web" as Platform
        for entry in self.entries:
            name = entry['File']
            if 'Play' in entry:
                if 'Platform' not in entry:
                    print(
                        'Entry "{}" has "Play" field but not "Platform" field, add it with "Web"'
                        .format(name))
                elif 'Web' not in entry['Platform']:
                    print(
                        'Entry "{}" has "Play" field but not "Web" in "Platform" field'
                        .format(name))

        # javascript/typescript/php as language but not web as platform?
        ignored = ('0_ad.md', 'aussenposten.md', 'between.md', 'caesaria.md',
                   'cavepacker.md', 'citybound.md', 'gorillas.md', 'ika.md',
                   'inexor.md', 'maniadrive.md', 'oolite.md', 'freevikings.md',
                   'rolisteam.md', 'rpgboss.md', 'ruby-warrior.md',
                   'snelps.md', 'tenes_empanadas_graciela.md', 'thrive.md')
        for entry in self.entries:
            name = entry['File']
            if name in ignored:
                continue
            if any(language in entry['Code language'] for language in (
                    'JavaScript', 'TypeScript', 'PHP',
                    'CoffeeScript')) and ('Platform' not in entry
                                          or 'Web' not in entry['Platform']):
                print(
                    'Entry "{}" has language JavaScript/PHP but not Web as platform.'
                    .format(name))

        # space in name but not space as keyword
        ignored = ('burgerspace.md', 'crystal_space_3d_sdk.md',
                   'our_personal_space.md', 'space_harrier_clone.md')
        for entry in self.entries:
            name = entry['File']
            if name in ignored:
                continue
            title = entry['Title']
            if 'space' in title.lower() and 'space' not in entry['Keyword']:
                print(
                    'Entry "{}" has space in name but not as keyword.'.format(
                        name))

        # starts with j + capital letter but not java as language
        for entry in self.entries:
            name = entry['File']
            title = entry['Title']
            # titles shorter than two characters cannot match and must not
            # raise IndexError; note the second-character test is also true
            # for characters without case (digits, punctuation)
            if (len(title) > 1 and title[0] == 'j'
                    and title[1] == title[1].upper()
                    and 'Java' not in entry['Code language']):
                print(
                    'Entry "{}" title starts with j? but Java is not a code language.'
                    .format(name))

        # search for duplicate keywords (each occurrence of a repeated keyword is listed)
        for entry in self.entries:
            keywords = entry['Keyword']
            duplicates = [
                keyword for keyword in keywords if keywords.count(keyword) > 1
            ]
            if duplicates:
                print('"{}" has duplicate keywords: {}'.format(
                    entry['File'], duplicates))

示例#7

0

显示文件

文件： osgameclones_synchronization.py 项目： q4a/opensourcegames

    print('osgc-framework: {}'.format(unique_field_contents(osgc_entries, 'framework')))
    print('osgc-content: {}'.format(unique_field_contents(osgc_entries, 'content')))

    # just the names
    osgc_names = set([x['name'] for x in osgc_entries])
    our_names = set([x['Title'] for x in our_entries])
    common_names = osgc_names & our_names
    osgc_names -= common_names
    our_names -= common_names
    print('{} both, {} only osgameclones, {} only us'.format(len(common_names), len(osgc_names), len(our_names)))
    # find similar names among the rest
    if check_similar_names:
        print('look for similar names (theirs - ours)')
        for osgc_name in osgc_names:
           for our_name in our_names:
               if osg.name_similarity(osgc_name, our_name) > similarity_threshold:
                   print(' {} - {}'.format(osgc_name, our_name))

    newly_created_entries = 0
    # iterate over their entries
    for osgc_entry in osgc_entries:
        osgc_name = osgc_entry['name']

        is_included = False
        for our_entry in our_entries:
            our_name = our_entry['Title']

            # find those entries in osgameclones that are also in our database and compare them
            if osgc_name == our_name:
                is_included = True
                # a match, check the fields