Пример #1
0
    def __init__(self,
                 config: Optional[Config] = None,
                 progress_callback: ProgressCallback = lambda stage, progress: None):
        """Initialise the instance, optionally overriding configuration entries.

        Args:
            config: Entries layered over the existing ``self.config`` mapping;
                keys that are not supplied keep their current value. ``None``
                leaves the configuration untouched.
            progress_callback: Callable accepting ``(stage, progress)``. The
                default ignores both arguments.
        """
        if config is not None:
            # Build a new mapping instead of mutating the existing one.
            overridden = dict(self.config)
            overridden.update(config)
            self.config = overridden

        self.progress_callback = progress_callback
        self.__normalized_levenshtein = NormalizedLevenshtein()
Пример #2
0
def universal_search(title: str, artists: str) -> List[Song]:  # pragma: no cover
    """Query every supported provider and rank the combined results.

    Results from ``qq_music_search`` are followed by those from
    ``netease_music_search``; the merged list is then ordered from best to
    worst match.

    Args:
        title: Song title passed to both providers.
        artists: Artist string passed to both providers.

    Returns:
        All songs found, sorted by descending relevance. A song's relevance is
        the sum of the normalized Levenshtein similarity between its ``title``
        and ``target_title`` and between its ``artists`` and
        ``target_artists``.
    """
    songs = [
        *qq_music_search(title, artists),
        *netease_music_search(title, artists),
    ]

    # One scorer for the whole sort rather than two new objects per song.
    scorer = NormalizedLevenshtein()

    def relevance(song):
        return (scorer.similarity(song.title, song.target_title)
                + scorer.similarity(song.artists, song.target_artists))

    songs.sort(key=relevance, reverse=True)
    return songs
Пример #3
0
class BasicFrameProcessor(FrameProcessor):
    """Frame processor that emits an index entry whenever the title changes.

    A frame is reported when it is the first one carrying a title, or when its
    title's normalized Levenshtein similarity to the previously seen title is
    below ``similarity_treshold``.
    """
    # Title of the most recent frame that had one; None until then.
    prev_title: str = None
    # Scorer shared by every instance of the class.
    normalized_levenshtein = NormalizedLevenshtein()
    similarity_treshold: int

    def __init__(self, similarity_treshold):
        """Store the similarity below which two titles count as different."""
        self.similarity_treshold = similarity_treshold

    def process_frame(self, frame: int, title: str,
                      text: str) -> Optional[VideoIndexEntry]:
        """Return an index entry for ``frame`` if it starts a new title.

        Args:
            frame: Value stored under the entry's ``'second'`` key.
            title: Title detected on the frame; a falsy title is ignored.
            text: Remaining text, stored under the entry's ``'text'`` key.

        Returns:
            The new entry, or ``None`` when the frame has no title or its
            title is too similar to the previous one.
        """
        if not title:
            # Untitled frames neither produce an entry nor reset prev_title.
            return None

        title_changed = True
        if self.prev_title:
            score = self.normalized_levenshtein.similarity(
                self.prev_title, title)
            title_changed = score < self.similarity_treshold

        self.prev_title = title

        if not title_changed:
            return None
        return {'second': frame, 'title': title, 'text': text}
Пример #4
0
def similarity_function(similarity_measure):
    """Return a two-argument similarity callable for the named measure.

    Args:
        similarity_measure: One of ``'exact'``, ``'mlcs'``, ``'nlevs'`` or
            ``'jaro'``.

    Returns:
        A callable ``f(s1, s2)``. For ``'mlcs'`` and ``'nlevs'`` it returns
        one minus the corresponding distance.

    Raises:
        ValueError: If ``similarity_measure`` is not a supported name.
    """
    if similarity_measure == 'exact':
        return exact_similarity
    if similarity_measure == 'mlcs':
        # Build the metric once per returned callable, not once per comparison.
        metric_lcs = MetricLCS()
        return lambda s1, s2: 1 - metric_lcs.distance(s1, s2)
    if similarity_measure == 'nlevs':
        normalized_levenshtein = NormalizedLevenshtein()
        return lambda s1, s2: 1 - normalized_levenshtein.distance(s1, s2)
    if similarity_measure == 'jaro':
        return JaroWinkler().similarity
    raise ValueError('Invalid similarity measure.')
Пример #5
0
    def requestMatches(self, data, request):
        """Decide whether ``request`` is close enough to ``data`` to match.

        Args:
            data: Reference payload (anything with ``len``), or ``None``.
            request: Candidate payload, or ``None``.

        Returns:
            ``True`` when both are ``None``; ``False`` when only one is.
            Otherwise ``True`` if the normalized Levenshtein similarity of the
            two payloads is above 0.8. When either payload is longer than 25
            items, a length difference of more than 10% is rejected without
            computing the similarity.
        """
        # Identity checks: `== None` would call a custom __eq__ and could
        # treat a real payload as missing.
        if data is None or request is None:
            return data is None and request is None

        requestLength = len(request)
        dataLength = len(data)

        # The length pre-filter is skipped for short payloads, where a small
        # absolute difference is already a large relative one.
        if requestLength > 25 or dataLength > 25:
            if requestLength < 0.9 * dataLength:  # request much shorter than data
                return False
            if dataLength < 0.9 * requestLength:  # data much shorter than request
                return False

        matcher = NormalizedLevenshtein()
        score = matcher.similarity(data, request)
        # Diagnostic output, kept so existing console traces do not change.
        print(request)
        print(data)
        print(score)

        return score > 0.8
Пример #6
0
class TOCProcessor(FrameProcessor):
    """Frame processor that aligns frame titles with a table of contents.

    The processor walks the table of contents in order. A frame produces an
    entry when its title is similar enough to the expected slide title, or to
    the title of the slide right after it (so one missed slide does not stall
    the walk).
    """
    toc: TableOfContents
    # Index of the next table-of-contents item expected to appear.
    current_slide: int = 0

    # Scorer shared by every instance of the class.
    normalized_levenshtein = NormalizedLevenshtein()
    similarity_treshold: int

    def __init__(self, toc: TableOfContents, similarity_treshold: int):
        """Store the table of contents and the minimum accepted similarity."""
        self.toc = toc
        self.similarity_treshold = similarity_treshold

    def process_frame(self, frame: int, title: str,
                      text: str) -> Optional[VideoIndexEntry]:
        """Return an index entry if ``title`` matches an upcoming slide.

        Args:
            frame: Value stored under the entry's ``'second'`` key.
            title: Title detected on the frame; a falsy title is ignored.
            text: Remaining text, stored under the entry's ``'text'`` key.

        Returns:
            An entry carrying the table-of-contents title of the matched
            slide, or ``None`` when nothing matches. On a match
            ``current_slide`` moves just past the matched slide.
        """
        if not title:
            return None

        # Offset 0 is the expected slide; offset 1 is the one after it, tried
        # in case the expected slide was missed. Each candidate is probed only
        # while it is inside the table of contents.
        for offset in (0, 1):
            slide_index = self.current_slide + offset
            if slide_index >= len(self.toc):
                break

            similarity, expected_title = self.__similarity(title, slide_index)
            if similarity >= self.similarity_treshold:
                self.current_slide = slide_index + 1
                return {
                    'second': frame,
                    'title': expected_title,
                    'text': text
                }

        return None

    def __similarity(self, title: str, slide_index: int) -> Tuple[float, str]:
        """Score ``title`` against the table-of-contents title at ``slide_index``.

        Returns:
            ``(similarity, expected_title)``.
        """
        expected_title: str = self.toc[slide_index]['title']
        similarity = self.normalized_levenshtein.similarity(
            expected_title, title)

        return similarity, expected_title
Пример #7
0
from strsimpy.normalized_levenshtein import NormalizedLevenshtein
# Module-level NormalizedLevenshtein instance, created once at import time.
# NOTE(review): nothing in the visible part of this module references it.
normalized_levenshtein = NormalizedLevenshtein()


class TrieNode():
    """One node of the trie: its outgoing edges plus an ``isLast`` flag."""

    def __init__(self):
        # Maps a letter to the child node reached through that letter.
        self.children = dict()
        # Presumably marks the end of a stored word; starts out False.
        self.isLast = False


class Trie():
    """Prefix tree of words, built letter by letter from ``TrieNode`` objects."""

    def __init__(self):
        self.root = TrieNode()
        self.matching_word = {}
        self.suggested_words = []

    def formTrie(self, words):
        """Insert every word of ``words`` into the trie."""
        for entry in words:
            self.insert(entry)

    @staticmethod
    def childAndParent(letter, current_node):
        """Return the child of ``current_node`` for ``letter``, or ``None``."""
        edges = current_node.children
        return edges.get(letter)

    def insert(self, word):
        """Add ``word`` to the trie, creating any missing nodes on its path.

        NOTE(review): ``isLast`` is never set on the final node in the code
        visible here -- confirm whether end-of-word marking is done elsewhere.
        """
        node = self.root

        for letter in word:
            child = self.childAndParent(letter, node)
            if not child:
                child = TrieNode()
                node.children[letter] = child
            node = child
Пример #8
0
class BaseDocumentService:
    """
    Shared base for the supported document readers.

    Every document type follows the same pipeline -- `cleaner`, then
    `valid_text` or `process_text` -- and customises it by overriding the
    class attributes and the underscore-prefixed hooks below.
    """

    # similarity method used by `_valid_similarity`
    # subclasses may replace it when they need a different measure
    SIM_METHOD = NormalizedLevenshtein()

    # patterns used for the final clean-up
    PATTERNS = {}

    # keys to find and values to skip to get the corresponding association;
    # each concrete subclass fills this in with its own assumptions
    TO_FIND = {}

    # pylint: disable=unused-argument,no-self-use
    # `self` and several arguments are unused here because subclasses use them
    def cleaner(self, text: str) -> list:
        """
        Normalise raw text and split it into non-empty, stripped lines

        Args:
            text (str): text to clean

        Returns:
            list: upper-cased lines restricted to ASCII letters, digits,
                whitespace, `.` and `-`
        """
        # runs of two or more whitespace characters, and any character that is
        # not a word character, backslash, whitespace, `.` or `-`, become a
        # line break; the result is upper-cased
        broken_up = re.sub(r'\s{2,}|[^\w\\\s.-]', '\n', text).upper()

        # drop everything that is still outside the allowed ASCII set
        ascii_only = re.sub(r'[^a-zA-Z0-9\s.-]', '', broken_up)

        # one entry per line, surrounding whitespace removed, empties dropped
        candidates = (piece.strip() for piece in ascii_only.split('\n'))
        return [line for line in candidates if line]

    def _valid_similarity(self,
                          find: str,
                          compare: str,
                          threshold=0.75) -> bool:
        """True when `find` and `compare` are at least `threshold` similar"""
        score = self.SIM_METHOD.similarity(find, compare)
        return score >= threshold

    def _associate(self, text_list: list, threshold=0.75) -> dict:
        """Hook: implemented by each document reader"""
        return {}

    def _valid_association(self, associations: dict) -> bool:
        """Hook: implemented by each document reader"""
        return False

    def _clean_processed_text(self, associations: dict) -> dict:
        """Hook: implemented by each document reader"""
        return {}

    def _standarize_return(self, associations: dict) -> dict:
        """Lower-cases the returned keys and replaces their spaces with `_`"""
        standardized = {}
        for key, value in associations.items():
            standardized[key.lower().replace(' ', '_')] = value
        return standardized

    def valid_text(self, text: str, threshold=0.75) -> bool:
        """
        Checks whether a text has the format, and all the associations,
        required by the document type (class) in use

        Args:
            text (str): text to validate
            threshold (optional)(int/float): similarity threshold forwarded to the key-word search

        Returns
            bool: True if the text is valid for this document type, False otherwise
        """
        found = self._associate(self.cleaner(text), threshold=threshold)
        return self._valid_association(found)

    def process_text(self, text: str, threshold=0.75) -> dict:
        """
        Turns a text into a dictionary shaped by the document type (class)
        in use

        Args:
            text (str): text to process
            threshold (optional)(int/float): similarity threshold forwarded to the key-word search

        Returns:
            dict: the document's associations with standardised keys if the
                text is valid, None otherwise
        """
        found = self._associate(self.cleaner(text), threshold=threshold)
        if not self._valid_association(found):
            return None
        cleaned = self._clean_processed_text(found)
        return self._standarize_return(cleaned)
Пример #9
0
def lexical_similarity(w1, w2):
    """Return the normalized Levenshtein similarity of ``w1`` and ``w2``."""
    return NormalizedLevenshtein().similarity(w1, w2)
Пример #10
0
class kb2():
    """Collects terminal ("port") records from Excel sheets into DataFrames.

    Three tables are kept in ``dfDict``, one per entry of ``sheetNameList``.
    Records read from the '已配置' sheet add a column labelled with the
    record's key; records from any other sheet add a row whose cells hold the
    Levenshtein distance between the row label and each column label.
    """
    # Table names; presumably "outputs", "inputs" and "matches".
    sheetNameList = ['开出', '开入', '匹配']
    levenshtein = Levenshtein()
    normalized_levenshtein = NormalizedLevenshtein()
    funcList = [levenshtein, normalized_levenshtein]

    # Folder holding the sample workbooks. Backslashes are spelled "\\" so the
    # value is the same as before without relying on the invalid "\e" / "\l"
    # escape sequences.
    _LEARN_DIR = '..\\excel\\learn/220-母线&线路-第一套合并单元&第一套合并单元'

    def __init__(self,
                 matchList: [Match] = None,
                 outPortList: [] = None,
                 inPortList: [] = None):
        """Create an empty knowledge base.

        Args:
            matchList: Existing list of ``Match`` objects to extend. When
                omitted, every instance gets its own new empty list.
            outPortList: Accepted for compatibility; not used.
            inPortList: Accepted for compatibility; not used.
        """
        # A fresh list per instance: learn_excel() appends to it, so a shared
        # default would leak matches between instances.
        self.matchList = [] if matchList is None else matchList
        # workbook path + sheet name + direction -> list of Port
        self.portListDict = {}
        # description + direction + reference -> Port
        self.portDict = {}
        self.dfDict = {sheetName: DataFrame() for sheetName in self.sheetNameList}

    def learn_folder(self, path2folder=_LEARN_DIR):
        """Run :meth:`learn_excel` on every workbook matched under a folder.

        NOTE(review): the glob pattern has no path separator before ``**``,
        so unless ``path2folder`` ends with one the search is not recursive
        into the folder -- confirm the intended pattern.
        """
        for filename in glob.iglob(path2folder + '**/*.xls', recursive=True):
            if filename.endswith((".xls", ".csv")):
                self.learn_excel(filename)

    def learn_excel(self, path2excel):
        """Learn the ports and the matches listed on the '已配置' sheet.

        Both directions of the sheet are loaded through :meth:`load_excel`;
        then every row is appended to ``matchList`` as a ``Match`` and its key
        is added as a column of the '匹配' table.
        """
        # NOTE(review): `df` is kept as a module-level global as before; it
        # looks unnecessary -- confirm that nothing else reads it.
        global df
        self.load_excel(path2excel, sheetName='已配置', inOut='开出')
        self.load_excel(path2excel, sheetName='已配置', inOut='开入')
        # Close the workbook handle as soon as the sheet has been parsed.
        with pd.ExcelFile(path2excel) as workbook:
            sheet = workbook.parse('已配置')
        try:
            for row in sheet.iterrows():
                record = row[1]
                outPort = Port(record['开出端子描述'], record['开出端子引用'])
                inPort = Port(record['开入端子描述'], record['开入端子引用'])
                self.matchList.append(Match(outPort, inPort))
                df = self.dfDict.get('匹配', DataFrame())
                key2 = (record['开出端子描述'] + record['开出端子引用'] + '匹配'
                        + record['开入端子描述'] + record['开入端子引用'])
                df[key2] = df.get(key2)
                self.dfDict['匹配'] = df
        except RuntimeError:
            print(row[1])

    def load_excel(self,
                   path2excel=_LEARN_DIR + '/赤厝.xls',
                   sheetName='所有发送',
                   inOut='开出'):
        """Read one sheet and register every port it lists.

        Args:
            path2excel: Workbook to read.
            sheetName: Sheet to parse. Records of '已配置' become columns of
                the ``inOut`` table; records of any other sheet become rows
                via :meth:`distance`.
            inOut: Direction prefix of the description/reference columns and
                name of the table in ``dfDict`` to update.
        """
        global df
        with pd.ExcelFile(path2excel) as workbook:
            sheet = workbook.parse(sheetName)
        key: str = path2excel + sheetName + inOut
        portList = self.portListDict.get(key, [])
        try:
            for row in sheet.iterrows():
                record = row[1]
                description = record[inOut + '端子描述']
                reference = record[inOut + '端子引用']
                port = Port(description, reference)
                portList.append(port)
                key2 = description + inOut + reference
                self.portDict[key2] = port
                df = self.dfDict.get(inOut, DataFrame())
                if sheetName == '已配置':
                    df[key2] = df.get(key2)
                    self.dfDict[inOut] = df
                else:
                    # New record: add it as a row (also stored in dfDict).
                    df = self.distance(key2, df, inOut)
            self.portListDict[key] = portList
        except RuntimeError:
            print(row[1])

    def load_test(self, path2excel=_LEARN_DIR + '/赤厝.xls'):
        """Add a '匹配' row for every output/input port pair of a workbook."""
        global df
        self.load_excel(path2excel, sheetName='所有发送', inOut='开出')
        self.load_excel(path2excel, sheetName='所有接收', inOut='开入')
        df = self.dfDict.get('匹配', DataFrame())
        for outPort in self.portListDict[path2excel + '所有发送' + '开出']:
            for inPort in self.portListDict[path2excel + '所有接收' + '开入']:
                key2 = (outPort.description + outPort.reference + '匹配'
                        + inPort.description + inPort.reference)
                # Carry the grown frame into the next iteration; reindex()
                # returns a new object, so dropping the result would keep only
                # the last pair.
                df = self.distance(key2, df, '匹配')

    def distance(self, key2, df, inOut):
        """Append row ``key2`` to ``df`` and fill it with Levenshtein distances.

        Nothing is added when ``key2`` is already a row label. The resulting
        frame is stored under ``dfDict[inOut]`` and returned.
        """
        if key2 not in df.index:
            df = df.reindex(df.index.tolist() + [key2])
            for done in df:
                # A single .loc write: chained `df[done][key2] = ...` may
                # assign to a temporary copy instead of the frame.
                df.loc[key2, done] = self.levenshtein.distance(done, key2)
        self.dfDict[inOut] = df
        return df

    def main(self, path2excel=_LEARN_DIR + '/kb2.xlsx'):
        """Learn the sample workbook, score its port pairs and save the tables.

        Each table of ``dfDict`` is written to its own sheet of
        ``path2excel``; the elapsed time in minutes is printed at the end.
        """
        start_time = time.time()

        self.learn_excel(self._LEARN_DIR + '/赤厝.xls')
        self.load_test()
        with pd.ExcelWriter(path2excel) as writer:
            for key, df in self.dfDict.items():
                print(key, df)
                df.to_excel(writer, sheet_name=key)
        print("--- %s m ---" % ((time.time() - start_time) / 60))

    def transform(self, multilevelDict):
        """Return a copy of a nested dict with newlines removed from its keys."""
        return {
            str(key).replace("\n", ""):
            (self.transform(value) if isinstance(value, dict) else value)
            for key, value in multilevelDict.items()
        }
Пример #11
0
class LectureVideoIndexer:
    """Builds an index of a lecture video from frames stored in FRAMES_DIR.

    The pipeline converts the video to frame images, drops frames that look
    like their predecessor, runs OCR on the remaining ones and lets a frame
    processor decide which of them become index entries.
    """
    # Defaults; entries passed to __init__ override them per instance.
    config: Config = {
        'frame_step': 2,
        'image_similarity_threshold': 0.9,
        'text_similarity_threshold': 0.85,
        'hash_size': 16,
    }
    progress_callback: ProgressCallback

    __normalized_levenshtein = None

    def __init__(self,
                 config: Optional[Config] = None,
                 progress_callback: ProgressCallback = lambda stage, progress: None):
        """Create an indexer.

        Args:
            config: Entries layered over the class defaults; ``None`` keeps
                the defaults.
            progress_callback: Called with ``(stage, progress)`` as the work
                advances; the default ignores both.
        """
        if config is not None:
            self.config = {**self.config, **config}

        self.progress_callback = progress_callback
        self.__normalized_levenshtein = NormalizedLevenshtein()

    def index(
        self,
        video_path: os.PathLike,
        skip_converting: bool = False,
        crop_region: CropRegion = None,
        toc: TableOfContents = None,
    ) -> VideoIndex:
        """Index a video and return the resulting entries.

        Args:
            video_path: Video to convert into frames.
            skip_converting: When true, reuse the frames already present in
                FRAMES_DIR instead of cleaning and converting again.
            crop_region: Forwarded to the video converter.
            toc: Table of contents to align titles with. Without it, a
                ``BasicFrameProcessor`` is used; with it, a ``TOCProcessor``.
        """
        if not skip_converting:
            self.__clean()
            self.__convert_to_frames(video_path, crop_region=crop_region)

        _, _, frames = next(os.walk(FRAMES_DIR))
        filtered_frames = self.__filter_similar_frames(frames_count=len(frames))

        threshold = self.config['text_similarity_threshold']
        if toc is None:
            processor = BasicFrameProcessor(threshold)
        else:
            processor = TOCProcessor(toc, threshold)

        return self.__process_frames(filtered_frames, processor)

    def __clean(self):
        """Recreate FRAMES_DIR as an empty directory."""
        dirpath = Path(FRAMES_DIR)

        if dirpath.exists() and dirpath.is_dir():
            shutil.rmtree(dirpath)
        dirpath.mkdir(parents=True, exist_ok=True)

    def __convert_to_frames(self, video_path: os.PathLike, crop_region: CropRegion = None):
        """Extract frames from the video, reporting progress as CONVERTING."""
        converter = VideoConverter(
            self.config['frame_step'],
            progress_callback=lambda progress: self.progress_callback(Stage.CONVERTING, progress))

        converter.convert_to_frames(video_path=video_path, crop_region=crop_region)

    def __filter_similar_frames(self, frames_count: int) -> [int]:
        """Return the frame numbers whose image differs from the previous frame.

        Frame 0 is always kept when at least one frame exists. Every later
        frame is compared with its direct predecessor and kept when their
        similarity is below ``image_similarity_threshold``.
        """
        if frames_count <= 0:
            # No frame files: frame 0 does not exist either, so do not pass a
            # missing image on to the OCR stage.
            return []

        step = self.config['frame_step']
        threshold = self.config['image_similarity_threshold']
        max_frame = frames_count * step

        filtered_frames: [int] = [0]
        prev_frame = 0
        prev_progress = 0

        for frame in range(step, max_frame, step):
            similarity = self.__compare_images(
                self.__create_frame_path(prev_frame), self.__create_frame_path(frame))

            if similarity < threshold:
                filtered_frames.append(frame)
            prev_frame = frame

            # Report only when the rounded percentage actually increases.
            progress = round((frame + 1) / max_frame * 100)
            if progress > prev_progress:
                self.progress_callback(Stage.FILTERING, progress)
                prev_progress = progress

        return filtered_frames

    def __compare_images(self, img_path_a: os.PathLike, img_path_b: os.PathLike) -> float:
        """Return the similarity of two images, based on their perceptual hashes.

        The hashes are compared as strings with normalized Levenshtein
        similarity.
        """
        hash_size = self.config['hash_size']

        # Context managers release the image files as soon as they are hashed.
        with Image.open(img_path_a) as image_a:
            hash_a = imagehash.phash(image_a, hash_size=hash_size)
        with Image.open(img_path_b) as image_b:
            hash_b = imagehash.phash(image_b, hash_size=hash_size)

        return self.__normalized_levenshtein.similarity(str(hash_a), str(hash_b))

    def __process_frames(self, frames: [int], processor: FrameProcessor) -> VideoIndex:
        """OCR each frame and collect the entries the processor returns."""
        index: VideoIndex = []
        prev_progress = 0
        total = len(frames)

        for position, frame in enumerate(frames, start=1):
            image = self.__preprocess_image(self.__create_frame_path(frame))

            title, text = self.__extract_title(pytesseract.image_to_string(image))

            entry = processor.process_frame(frame, title, text)
            if entry:
                index.append(entry)

            # Report only when the rounded percentage actually increases.
            progress = round((position * 100) / total)
            if progress > prev_progress:
                self.progress_callback(Stage.PROCESSING, progress)
                prev_progress = progress

        return index

    def __preprocess_image(self, path: os.PathLike):
        """Load an image and prepare it for OCR.

        The image is converted to grayscale, binarised with a Gaussian
        adaptive threshold and smoothed with a 3-pixel median blur.
        """
        img = cv.imread(path)

        img = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
        img = cv.adaptiveThreshold(img, 255, cv.ADAPTIVE_THRESH_GAUSSIAN_C, cv.THRESH_BINARY, 11, 2)
        img = cv.medianBlur(img, 3)

        return img

    def __extract_title(self, text) -> Tuple[str, str]:
        """Split OCR output into ``(title, remaining_text)``.

        Whitespace-only lines and lines of at most one character are dropped.
        The first remaining line is the title (``None`` if there is none);
        the other lines are joined with newlines.
        """
        lines = [line for line in text.strip().split('\n') if not line.isspace() and len(line) > 1]
        title = lines[0] if lines else None
        text = '\n'.join(lines[1:])

        return title, text

    def __create_frame_path(self, frame) -> str:
        """Return the path of the PNG file holding frame number ``frame``."""
        return os.path.join(FRAMES_DIR, f"{FRAME_PREFIX}{frame}.png")
Пример #12
0
# Demo of several string metrics; each result is printed on its own line.
# NOTE(review): s1 and s2 are not defined in the visible part of this script.
fourgram = NGram(4)
print(fourgram.distance(s1, s2))

# Jaro-Winkler similarity against two scrambled variants of the same string.
jarowinkler = JaroWinkler()
for variant in ('My tsring', 'My ntrisg'):
    print(jarowinkler.similarity('My string', variant))

optimal_string_alignment = OptimalStringAlignment()
print(optimal_string_alignment.distance('CA', 'ABC'))

# Damerau distance from 'ABCDEF' to a series of edited variants.
damerau = Damerau()
for variant in ('ABDCEF', 'BACDFE', 'ABCDE', 'BCDEF', 'ABCGDEF', 'POIU'):
    print(damerau.distance('ABCDEF', variant))

# Each normalized Levenshtein value is printed three times.
normalized_levenshtein = NormalizedLevenshtein()
for _ in range(3):
    print(normalized_levenshtein.distance('My string', 'My $string'))

for _ in range(3):
    print(normalized_levenshtein.similarity('My string', 'My $string'))

# Plain Levenshtein distance, also printed three times.
levenshtein = Levenshtein()
for _ in range(3):
    print(levenshtein.distance('My string', 'My $string'))