예제 #1
0
    def get_all_licenses_and_keywords(self, conf_path, text_path):
        '''Load every known license and pick the keywords used to search them.

        Licenses described by config files found via ``conf_path`` are loaded
        first. Every file found via ``text_path`` that no configured license
        lists in its ``files`` is then loaded as an additional license.

        :param conf_path: Location handed to ``helper.get_files`` to find
            the license config files.
        :param text_path: Location handed to ``helper.get_files`` to find
            the license text files.
        :returns: Tuple ``(licenses, keywords)``; ``keywords`` is a dict in
            the form ``{keyword: set of cached files it was found in}``.
        '''
        words_occurrences = defaultdict(set)
        words_frequencies = defaultdict(int)

        licenses = self._load_licenses_from_configs(helper.get_files(conf_path))

        # Text files not claimed by any configured license still have to be
        # loaded, so subtract the configured files from everything on disk.
        text_files = set(helper.get_files(text_path))
        configured_files = set(
            itertools.chain.from_iterable(lic.files for lic in licenses))
        unconfigured_licenses = list(text_files.difference(configured_files))
        licenses += self._load_licenses_from_texts(unconfigured_licenses)

        for lic in licenses:
            for cached_file in lic.cachedfiles:
                words = SingleLicenseLoader.get_words_from_license(cached_file)
                # Accumulate global frequencies and remember where each word
                # was seen; both feed the keyword selection below.
                self._merge_frequency_dicts(words_frequencies, words)
                for word in words:
                    words_occurrences[word].add(cached_file)

        selected = self._select_keywords(words_frequencies, words_occurrences)
        keywords = {word: words_occurrences[word] for word in selected}

        return licenses, keywords
예제 #2
0
 def load_directory(cls, path):
     '''Load everything found under a directory, descending into subdirectories.

     :param path: Path to the directory
     :returns: Set holding the results of ``load_file`` / ``load_archive``
         for the entries found
     :raises IOError: If ``path`` is not a directory
     '''
     if not os.path.isdir(path):
         raise IOError("{} is not a directory!".format(path))

     loaded = set()
     for entry in helper.get_files(path):
         logger.debug('Loading {}'.format(entry))

         # Nested directory: recurse and merge what it yields.
         if os.path.isdir(entry):
             loaded |= cls.load_directory(entry)
             continue

         # Archives are checked before plain files, since an archive is
         # itself a regular file on disk.
         if cls.is_archive(entry):
             loaded |= cls.load_archive(entry)
             continue

         if os.path.isfile(entry):
             loaded.add(cls.load_file(entry))
             continue

         logger.error('File does not exist or format not supported: {}'.format(entry))

     return loaded