Пример #1
0
    def scan(self, filePath):
        '''
        Classify a file by its Damerau-Levenshtein distance to known licenses.

        The file is loaded through the parent class's ``loadFile`` and the
        result is first handed to ``exactMatcher``. Only when that reports no
        match (``-1``) is the word-level distance between the loaded text and
        the ``processed_text`` of every entry in ``self.licenseList`` computed.

        :param filePath: Path of the file to scan
        :return: The first entry of the exact matcher's result when it finds a
            match; otherwise the short name of the license with the least
            Damerau-Levenshtein distance (the earliest license wins a tie)
        '''
        processedData = super().loadFile(filePath)

        exactMatch = exactMatcher(processedData, self.licenseList)
        if exactMatch != -1:
            return exactMatch[0]

        # The scanned text is the same for every comparison, so tokenize it
        # once instead of once per license.
        scannedWords = processedData.split(" ")

        # Classify the license with minimum distance with scanned file
        globalDistance = sys.maxsize
        result = 0
        for idx in range(len(self.licenseList)):
            # Fetch the row a single time; it is needed for both the text and
            # (when verbose) the short name.
            licenseRow = self.licenseList.iloc[idx]
            distance = damerau_levenshtein_distance(
                scannedWords,
                licenseRow['processed_text'].split(" "))
            if self.verbose > 0:
                print(
                    str(idx) + "  " +
                    licenseRow['shortname'] + "  " +
                    str(distance))
            # Strict comparison keeps the first license seen on equal distance.
            if distance < globalDistance:
                globalDistance = distance
                result = idx

        return str(self.licenseList.iloc[result]['shortname'])
Пример #2
0
    def scan(self, filePath):
        '''
        Classify a file's license with the histogram similarity algorithm.

        The file is loaded through the parent class's ``loadFile`` and the
        result is first handed to ``exactMatcher``. When that reports no match
        (``-1``), word-frequency tables are built for the loaded text and for
        the ``processed_text`` of every license; only runs of 3 to 15
        lowercase letters count as words. Each license is scored by summing,
        over the words of the scanned text, the smaller of the two
        frequencies.

        :param filePath: Input file path that needs to be scanned
        :return: The exact matcher's result as-is when it finds a match;
            otherwise the short name of the license with the highest score
            (the earliest license wins a tie, and the license at index 0 is
            reported when no license shares any word with the input)
        '''
        processedData = super().loadFile(filePath)
        isVerbose = self.verbose > 0
        if isVerbose:
            print("PROCESSED DATA IS ", processedData)
            print("LICENSES[0]", str(self.licenseList.iloc[0]))

        exactMatch = exactMatcher(processedData, self.licenseList)
        if exactMatch != -1:
            return exactMatch

        wordPattern = r'\b[a-z]{3,15}\b'

        # One frequency table per license, in license order.
        licenseHistograms = [
            wordFrequency(
                re.findall(wordPattern,
                           self.licenseList.at[pos, 'processed_text']))
            for pos in range(len(self.licenseList))
        ]
        inputHistogram = wordFrequency(re.findall(wordPattern, processedData))

        if isVerbose:
            print("Frequency array of licenses", licenseHistograms[0])
            print("Frequency table of input data", inputHistogram)

        # Histogram intersection: keep the license with the largest overlap.
        bestOverlap = 0
        bestPos = 0
        for pos, histogram in enumerate(licenseHistograms):
            overlap = 0
            for word, inputCount in inputHistogram.items():
                shared = min(histogram.get(word, 0), inputCount)
                if shared > 0:
                    overlap = overlap + shared
            if isVerbose:
                print(pos, self.licenseList.at[pos, 'shortname'], overlap)
            # Only a strictly larger overlap replaces the current best.
            if overlap > bestOverlap:
                bestPos = pos
                bestOverlap = overlap

        if isVerbose:
            print("Result is license with ID", bestPos)
        return str(self.licenseList.at[bestPos, 'shortname'])