def __init__(self,
             config: Optional[Config] = None,
             progress_callback: ProgressCallback = lambda stage, progress: None):
    """Initialise configuration, progress hook and string-similarity metric.

    Args:
        config: Optional overrides. When given, a new mapping is built from
            the current ``self.config`` with these entries layered on top
            (overrides win) and stored on the instance.
        progress_callback: Callable taking ``(stage, progress)``. The default
            is a no-op.
    """
    if config is not None:
        # Copy first so the mapping previously reachable as self.config is
        # never mutated in place.
        merged = dict(self.config)
        merged.update(config)
        self.config = merged

    self.progress_callback = progress_callback
    self.__normalized_levenshtein = NormalizedLevenshtein()
def universal_search(title: str, artists: str) -> List[Song]:  # pragma: no cover
    """Query every provider and return the pooled hits, best match first.

    Results of ``qq_music_search`` and ``netease_music_search`` are combined
    and ordered by the sum of two normalized Levenshtein similarities:
    ``title`` vs ``target_title`` and ``artists`` vs ``target_artists`` of
    each song. Songs with equal scores keep their provider order (the sort
    is stable).

    Args:
        title: Title to search for.
        artists: Artists string to search for.

    Returns:
        The combined list of songs sorted by descending score.
    """
    songs = [
        *qq_music_search(title, artists),
        *netease_music_search(title, artists),
    ]

    # One metric object serves all comparisons; previously two fresh
    # instances were constructed for every song being ranked.
    metric = NormalizedLevenshtein()

    def score(song):
        return (metric.similarity(song.title, song.target_title)
                + metric.similarity(song.artists, song.target_artists))

    songs.sort(key=score, reverse=True)
    return songs
class BasicFrameProcessor(FrameProcessor):
    """Emit an index entry whenever a frame's title differs from the last one.

    The last non-empty title seen is remembered in ``prev_title``. A frame
    produces an entry when it has a title and either no title was seen
    before, or the normalized Levenshtein similarity to the remembered title
    is below ``similarity_treshold``.
    """
    prev_title: str = None
    normalized_levenshtein = NormalizedLevenshtein()
    similarity_treshold: int

    def __init__(self, similarity_treshold):
        self.similarity_treshold = similarity_treshold

    def process_frame(self, frame: int, title: str,
                      text: str) -> Optional[VideoIndexEntry]:
        """Return an entry for ``frame`` if its title is new, else ``None``.

        Frames without a title are ignored and do not change ``prev_title``.
        """
        if not title:
            return None

        previous = self.prev_title
        # Short-circuit: the metric is only consulted when there is a
        # remembered title to compare against.
        title_changed = (
            not previous
            or self.normalized_levenshtein.similarity(previous, title)
            < self.similarity_treshold
        )
        self.prev_title = title

        if title_changed:
            return {'second': frame, 'title': title, 'text': text}
        return None
def similarity_function(similarity_measure):
    """Return a two-argument similarity callable for the named measure.

    Args:
        similarity_measure: One of ``'exact'``, ``'mlcs'``, ``'nlevs'`` or
            ``'jaro'``.

    Returns:
        A callable ``f(s1, s2)``. For ``'mlcs'`` and ``'nlevs'`` the result
        is ``1 - distance``; for ``'jaro'`` it is the bound ``similarity``
        method of a ``JaroWinkler`` instance; for ``'exact'`` it is
        ``exact_similarity``.

    Raises:
        ValueError: If ``similarity_measure`` is not a supported name.
    """
    if similarity_measure == 'exact':
        return exact_similarity

    if similarity_measure == 'mlcs':
        # Build the metric once here instead of on every comparison.
        mlcs = MetricLCS()
        return lambda s1, s2: 1 - mlcs.distance(s1, s2)

    if similarity_measure == 'nlevs':
        # Same hoisting as above: one instance per returned callable.
        nlev = NormalizedLevenshtein()
        return lambda s1, s2: 1 - nlev.distance(s1, s2)

    if similarity_measure == 'jaro':
        return JaroWinkler().similarity

    raise ValueError('Invalid similarity measure.')
def requestMatches(self, data, request):
    """Decide whether ``data`` and ``request`` are similar enough to match.

    Args:
        data: Payload to compare, or ``None``.
        request: Payload to compare against, or ``None``.

    Returns:
        ``True`` when both are ``None``; ``False`` when exactly one is
        ``None``. Otherwise ``True`` if the normalized Levenshtein
        similarity of the two exceeds 0.8, subject to the length pre-filter
        below.
    """
    # Identity checks: `== None` can be hijacked by a custom __eq__.
    if data is None or request is None:
        return data is None and request is None

    requestLength = len(request)
    dataLength = len(data)

    # Length pre-filter, applied only when at least one side is longer than
    # 25: short payloads may still match despite a sizeable relative length
    # difference, so they always go through the similarity check.
    if requestLength > 25 or dataLength > 25:
        if requestLength < 0.9 * dataLength:  # request much shorter than data
            return False
        if dataLength < 0.9 * requestLength:  # data much shorter than request
            return False

    matcher = NormalizedLevenshtein()
    score = matcher.similarity(data, request)
    # NOTE(review): these prints look like leftover diagnostics; kept because
    # they are observable output. Consider routing them through logging.
    print(request)
    print(data)
    print(score)
    return score > 0.8
class TOCProcessor(FrameProcessor):
    """Match frame titles against an ordered table of contents.

    ``current_slide`` is the index of the next TOC item expected. Each
    frame title is compared (normalized Levenshtein similarity) with the
    expected item and, failing that, with the item after it, so a single
    missed slide does not stall the index.
    """
    toc: TableOfContents
    current_slide: int = 0
    normalized_levenshtein = NormalizedLevenshtein()
    similarity_treshold: int

    def __init__(self, toc: TableOfContents, similarity_treshold: int):
        self.toc = toc
        self.similarity_treshold = similarity_treshold

    def process_frame(self, frame: int, title: str,
                      text: str) -> Optional[VideoIndexEntry]:
        """Return an entry if ``title`` matches the current or next TOC item.

        On a match, ``current_slide`` advances past the matched item and the
        entry carries the TOC title (not the frame's own title).

        Returns:
            The index entry, or ``None`` when the frame has no title, the
            TOC is exhausted, or neither candidate is similar enough.
        """
        if not title:
            return None

        # Offset 0 is the expected slide; offset 1 is the look-ahead used
        # when the expected slide was missed.
        for offset in (0, 1):
            slide_index = self.current_slide + offset
            # Only probe indices that exist. The look-ahead guard used to be
            # inverted (`>=`), so it ran solely on the last slide and
            # indexed one past the end of the TOC.
            if slide_index >= len(self.toc):
                break

            similarity, expected_title = self.__similarity(title, slide_index)
            if similarity >= self.similarity_treshold:
                self.current_slide = slide_index + 1
                return {
                    'second': frame,
                    'title': expected_title,
                    'text': text
                }

        return None

    def __similarity(self, title: str, slide_index: int) -> Tuple[float, str]:
        """Return ``(similarity, toc_title)`` for the TOC item at ``slide_index``."""
        expected_title: str = self.toc[slide_index]['title']
        similarity = self.normalized_levenshtein.similarity(
            expected_title, title)
        return similarity, expected_title
from strsimpy.normalized_levenshtein import NormalizedLevenshtein

normalized_levenshtein = NormalizedLevenshtein()


class TrieNode():
    """One node of the trie: a letter -> child mapping plus an ``isLast`` flag."""

    def __init__(self):
        self.children = {}
        self.isLast = False


class Trie():
    """Character trie that words are inserted into one letter at a time."""

    def __init__(self):
        self.root = TrieNode()
        self.matching_word = {}
        self.suggested_words = []

    def formTrie(self, words):
        """Insert every word of ``words`` into the trie."""
        for entry in words:
            self.insert(entry)

    @staticmethod
    def childAndParent(letter, current_node):
        """Return the child of ``current_node`` for ``letter``, or ``None``."""
        return current_node.children.get(letter)

    def insert(self, word):
        """Walk ``word`` from the root, creating missing nodes along the way.

        NOTE(review): nothing in the visible code ever sets ``isLast`` on the
        final node - confirm whether the end of this method is missing.
        """
        node = self.root
        for letter in word:
            child = self.childAndParent(letter, node)
            if not child:
                child = node.children[letter] = TrieNode()
            node = child
class BaseDocumentService:
    """
    A base class for common usage among the different supported documents
    with a common pipeline, which is `cleaner`, `valid_text` and `process_text`
    """
    # similarity method
    # can be changed in subclassing for specific differences
    SIM_METHOD = NormalizedLevenshtein()
    # patterns to make final clean
    PATTERNS = {}
    # keys to find and values to skip to get the corresponding
    # in the non-abstract class implementation fill with the
    # corresponding assumptions
    TO_FIND = {}

    # pylint: disable=unused-argument,no-self-use
    # disable linting of unused argument of self and other args as they are later used by subclasses
    def cleaner(self, text: str) -> list:
        """
        Cleans the received text from undesired characters

        Args:
            text (str): desired text to be cleaned

        Returns:
            list: the non-empty, stripped, upper-cased lines obtained after
            cleaning
        """
        # Turn runs of 2+ whitespace characters, and every character that is
        # not a word character, backslash, whitespace, '.' or '-', into a
        # line break; then upper-case everything.
        text = re.sub(r'\s{2,}|[^\w\\\s.-]', '\n', text).upper()
        # Drop whatever is left that is not ASCII alphanumeric, whitespace,
        # '.' or '-' (e.g. backslashes, underscores); '\n' is whitespace and
        # therefore survives.
        text = re.sub(r'[^a-zA-Z0-9\s.-]', '', text)
        # Split on line breaks and keep the non-empty stripped lines. The old
        # filter also tested `not re.match(r'\s*', txt)`, but that pattern
        # matches every string, so the test could never be true.
        stripped_lines = (line.strip() for line in text.split('\n'))
        return [line for line in stripped_lines if line != '']

    def _valid_similarity(self, find: str, compare: str, threshold=0.75) -> bool:
        """True when ``SIM_METHOD`` rates ``find`` vs ``compare`` at or above ``threshold``"""
        return self.SIM_METHOD.similarity(find, compare) >= threshold

    def _associate(self, text_list: list, threshold=0.75) -> dict:
        """Specific for each document reading implementation"""
        return dict()

    def _valid_association(self, associations: dict) -> bool:
        """Specific for each document reading implementation"""
        return False

    def _clean_processed_text(self, associations: dict) -> dict:
        """Specific for each document reading implementation"""
        return dict()

    def _standarize_return(self, associations: dict) -> dict:
        """Standardizes the returned keys: lowercase, spaces replaced by underscores"""
        return {
            key.lower().replace(' ', '_'): value
            for key, value in associations.items()
        }

    def valid_text(self, text: str, threshold=0.75) -> bool:
        """
        Validates that a text has the specific document format and all the
        associations specified according to the specified document type
        (class) in use

        Args:
            text (str): text to validate
            threshold (optional)(int/float): threshold to use when validating
                the similarity of the search for key words

        Returns:
            bool: True if text is valid according to specified document type,
            False otherwise
        """
        text_lines = self.cleaner(text)
        associations = self._associate(text_lines, threshold=threshold)
        return self._valid_association(associations)

    def process_text(self, text: str, threshold=0.75) -> dict:
        """
        Processes the desired text and formats it into a dictionary according
        to the specified document type (class) in use

        Args:
            text (str): text to validate
            threshold (optional)(int/float): threshold to use when validating
                the similarity of the search for key words

        Returns:
            dict: dictionary of document specified associations if valid text,
            None otherwise
        """
        text_lines = self.cleaner(text)
        associations = self._associate(text_lines, threshold=threshold)
        if not self._valid_association(associations):
            return None
        associations = self._clean_processed_text(associations)
        return self._standarize_return(associations)
def lexical_similarity(w1, w2):
    """Return the normalized Levenshtein similarity between ``w1`` and ``w2``."""
    return NormalizedLevenshtein().similarity(w1, w2)
class kb2():
    """Builds string-distance tables between port descriptions read from Excel.

    Ports are read from workbook sheets, keyed by concatenated
    description/reference strings, and Levenshtein distances between keys
    are stored in one DataFrame per entry of ``sheetNameList``
    (``self.dfDict``).

    NOTE(review): several methods declare ``global df``, so they share and
    overwrite a module-level ``df`` - confirm this is intended.
    """
    # Names of the DataFrames kept in self.dfDict (also used as output sheet names).
    sheetNameList = ['开出', '开入', '匹配']
    levenshtein = Levenshtein()
    normalized_levenshtein = NormalizedLevenshtein()
    # Not referenced anywhere else in the visible code.
    funcList = [levenshtein, normalized_levenshtein]

    # NOTE(review): the defaults are mutable and shared between instances;
    # `matchList` defaults to a list containing the `Match` class itself and
    # is appended to in learn_excel. `outPortList` / `inPortList` are not
    # used in the body.
    def __init__(self,
                 matchList: [Match] = [Match],
                 outPortList: [] = [],
                 inPortList: [] = []):
        """Store the match list and create one empty DataFrame per sheet name."""
        self.matchList = matchList
        # NOTE(review): these start with the *types* `str` / `Port` as
        # key/value, presumably meant as a type hint - verify.
        self.portListDict = {str: []}
        self.portDict = {str: Port}
        self.dfDict = {}
        for sheetName in self.sheetNameList:
            self.dfDict[sheetName] = DataFrame()  # in,out,match

    # NOTE(review): the default paths in this class are non-raw strings
    # containing backslash-e and backslash-l, which are not valid escape
    # sequences; Python keeps the backslash but warns about it.
    def learn_folder(self, path2folder='..\excel\learn/220-母线&线路-第一套合并单元&第一套合并单元'):
        """Call ``learn_excel`` on every file found by a recursive ``*.xls`` glob.

        The glob pattern only yields names ending in ``.xls``, so the
        ``.csv`` alternative below cannot trigger through this pattern.
        """
        for filename in glob.iglob(path2folder + '**/*.xls', recursive=True):
            if filename.endswith(".xls") or filename.endswith(".csv"):
                self.learn_excel(filename)
            else:
                continue

    def learn_excel(self, path2excel):
        """Load the '已配置' sheet of a workbook and record its port matches.

        The sheet is first passed to ``load_excel`` once per direction, then
        each row is turned into an out-port/in-port ``Match`` appended to
        ``self.matchList`` and a column keyed by the concatenated
        descriptions/references is assigned in the '匹配' DataFrame.
        """
        self.load_excel(path2excel, sheetName='已配置', inOut='开出')
        self.load_excel(path2excel, sheetName='已配置', inOut='开入')
        sheet = pd.ExcelFile(path2excel).parse('已配置')
        try:
            # iterrows() yields (index, row); row[1] is the row data.
            for row in sheet.iterrows():
                outPort = Port(row[1]['开出端子描述'], row[1]['开出端子引用'])
                inPort = Port(row[1]['开入端子描述'], row[1]['开入端子引用'])
                match = Match(outPort, inPort)
                self.matchList.append(match)
                global df
                df = self.dfDict.get('匹配', DataFrame())
                key2 = row[1]['开出端子描述'] + row[1]['开出端子引用'] + '匹配' + row[1][
                    '开入端子描述'] + row[1]['开入端子引用']
                # df.get returns None for a missing column, so this
                # presumably just ensures a column named key2 exists.
                df[key2] = df.get(key2)
                self.dfDict['匹配'] = df
        except RuntimeError:
            # Only RuntimeError is handled; the offending row is printed.
            print(row[1])

    def load_excel(
            self,
            path2excel='..\excel\learn/220-母线&线路-第一套合并单元&第一套合并单元/赤厝.xls',
            sheetName='所有发送',
            inOut='开出'):
        """Read ports of one direction from a sheet and update the tables.

        Each row becomes a ``Port`` stored in ``self.portDict`` and in the
        list kept under ``path2excel + sheetName + inOut``. For the '已配置'
        sheet a column keyed by the port is assigned in the ``inOut``
        DataFrame; for any other sheet the port is added as a row and its
        Levenshtein distance to every existing column name is filled in.
        """
        sheet = pd.ExcelFile(path2excel).parse(sheetName)
        key: str = path2excel + sheetName + inOut
        portList = self.portListDict.get(key, [])
        try:
            for row in sheet.iterrows():
                port = Port(row[1][inOut + '端子描述'], row[1][inOut + '端子引用'])
                # print(vars(port))
                portList.append(port)
                key2 = row[1][inOut + '端子描述'] + inOut + row[1][inOut + '端子引用']
                self.portDict[key2] = port
                global df
                df = self.dfDict.get(inOut, DataFrame())
                if sheetName == '已配置':
                    df[key2] = df.get(key2)
                else:  # new
                    if key2 not in df.index:
                        df = df.reindex(df.index.tolist() + [key2])
                    # Iterating a DataFrame yields its column labels.
                    for done in df:
                        # for function in strsimpy.functions:
                        df[done][key2] = self.levenshtein.distance(
                            done, key2)
                self.dfDict[inOut] = df
            self.portListDict[key] = portList
        except RuntimeError:
            print(row[1])

    def load_test(
            self,
            path2excel='..\excel\learn/220-母线&线路-第一套合并单元&第一套合并单元/赤厝.xls'):
        """Score every out-port/in-port pair of a workbook against '匹配'.

        NOTE(review): ``df`` is fetched once and never rebound inside the
        loops, so each ``distance`` call starts from the same frame and
        ``self.dfDict['匹配']`` ends up holding only the last result -
        confirm whether accumulation was intended.
        """
        self.load_excel(path2excel, sheetName='所有发送', inOut='开出')
        self.load_excel(path2excel, sheetName='所有接收', inOut='开入')
        global df
        df = self.dfDict.get('匹配', DataFrame())
        for outPort in self.portListDict[path2excel + '所有发送' + '开出']:
            for inPort in self.portListDict[path2excel + '所有接收' + '开入']:
                # print(vars(inPort))
                key2 = outPort.description + outPort.reference + '匹配' + inPort.description + inPort.reference
                self.dfDict['匹配'] = self.distance(key2, df, '匹配')

    def distance(self, key2, df, inOut):
        """Add ``key2`` as a row of ``df`` and fill in Levenshtein distances.

        The distance from every column label of ``df`` to ``key2`` is
        written into that column's ``key2`` row. The frame is stored under
        ``self.dfDict[inOut]`` and returned.
        """
        if key2 not in df.index:
            df = df.reindex(df.index.tolist() + [key2])
        for done in df:
            # Despite the name, this holds a Levenshtein *distance*.
            similarity = self.levenshtein.distance(done, key2)
            df[done][key2] = similarity
            # if similarity<0.03:
            # print(done+"like"+key2)
        self.dfDict[inOut] = df
        return df

    def main(self, path2excel='..\excel\learn/220-母线&线路-第一套合并单元&第一套合并单元/kb2.xlsx'):
        """Learn from the hard-coded workbook, run ``load_test``, write results.

        Every DataFrame in ``self.dfDict`` is printed and written to its own
        sheet of ``path2excel``; the elapsed time in minutes is printed last.
        """
        # for sheetName in self.sheetNameList:
        # self.dfDict[sheetName] = pd.ExcelFile(path2excel).parse(sheetName)
        # load history
        start_time = time.time()
        self.learn_excel('..\excel\learn/220-母线&线路-第一套合并单元&第一套合并单元/赤厝.xls')
        self.load_test()
        with pd.ExcelWriter(path2excel) as writer:
            for key, df in self.dfDict.items():
                print(key, df)
                df.to_excel(writer, sheet_name=key)
        print("--- %s m ---" % ((time.time() - start_time) / 60))

    def transform(self, multilevelDict):
        """Return a copy of a nested dict with newlines stripped from its keys.

        Keys are converted with ``str`` first; dict values are transformed
        recursively, other values are kept as they are.
        """
        return {
            str(key).replace("\n", ""):
            (self.transform(value) if isinstance(value, dict) else value)
            for key, value in multilevelDict.items()
        }
class LectureVideoIndexer:
    """Build an index of a lecture video from its extracted frames.

    Pipeline (see ``index``): convert the video to frame images, drop
    frames that look like the previously sampled one, OCR the remaining
    frames and hand their title/text to a frame processor that decides
    which frames become index entries.
    """
    # Defaults; per-instance overrides are merged on top in __init__.
    config: Config = {
        'frame_step': 2,
        'image_similarity_threshold': 0.9,
        'text_similarity_threshold': 0.85,
        'hash_size': 16,
    }
    progress_callback: ProgressCallback
    __normalized_levenshtein = None

    def __init__(self,
                 config: Optional[Config] = None,
                 progress_callback: ProgressCallback = lambda stage, progress: None):
        """Merge ``config`` over the defaults and store the progress hook.

        Args:
            config: Optional overrides for the class-level ``config``.
            progress_callback: Called as ``(stage, progress)``; no-op by
                default.
        """
        if config is not None:
            # New dict on the instance; the class-level defaults stay intact.
            self.config = {**self.config, **config}
        self.progress_callback = progress_callback
        self.__normalized_levenshtein = NormalizedLevenshtein()

    def index(
        self,
        video_path: os.PathLike,
        skip_converting: bool = False,
        crop_region: CropRegion = None,
        toc: TableOfContents = None,
    ) -> VideoIndex:
        """Index ``video_path`` and return the resulting entries.

        Args:
            video_path: Video to index.
            skip_converting: When true, reuse the frames already present in
                ``FRAMES_DIR`` instead of wiping and regenerating them.
            crop_region: Forwarded to the frame converter.
            toc: When given, titles are matched against it with a
                ``TOCProcessor``; otherwise a ``BasicFrameProcessor`` is used.
        """
        if not skip_converting:
            self.__clean()
            self.__convert_to_frames(video_path, crop_region=crop_region)

        # Only the file list of the top-level frames directory is needed.
        _, _, frames = next(os.walk(FRAMES_DIR))
        filtered_frames = self.__filter_similar_frames(frames_count=len(frames))

        text_threshold = self.config['text_similarity_threshold']
        if toc is None:
            processor = BasicFrameProcessor(text_threshold)
        else:
            processor = TOCProcessor(toc, text_threshold)

        return self.__process_frames(filtered_frames, processor)

    def __clean(self):
        """Remove ``FRAMES_DIR`` if it is a directory, then (re)create it."""
        dirpath = Path(FRAMES_DIR)
        if dirpath.exists() and dirpath.is_dir():
            shutil.rmtree(dirpath)
        dirpath.mkdir(parents=True, exist_ok=True)

    def __convert_to_frames(self,
                            video_path: os.PathLike,
                            crop_region: CropRegion = None):
        """Run the video converter, reporting progress as ``Stage.CONVERTING``."""
        converter = VideoConverter(
            self.config['frame_step'],
            progress_callback=lambda progress: self.progress_callback(
                Stage.CONVERTING, progress))
        converter.convert_to_frames(video_path=video_path,
                                    crop_region=crop_region)

    def __filter_similar_frames(self, frames_count: int) -> [int]:
        """Return the frame numbers that differ enough from the previous one.

        Frame 0 is always kept. Frame numbers advance by ``frame_step``;
        a frame is kept when its image similarity to the previously sampled
        frame is below ``image_similarity_threshold``. Progress is reported
        as ``Stage.FILTERING`` whenever the rounded percentage increases.
        """
        step = self.config['frame_step']
        threshold = self.config['image_similarity_threshold']
        max_frame = frames_count * step

        filtered_frames: [int] = [0]
        prev_frame = 0
        prev_progress = 0
        for frame in range(step, max_frame, step):
            similarity = self.__compare_images(
                self.__create_frame_path(prev_frame),
                self.__create_frame_path(frame))
            if similarity < threshold:
                filtered_frames.append(frame)
            prev_frame = frame

            progress = round((frame + 1) / max_frame * 100)
            if progress > prev_progress:
                self.progress_callback(Stage.FILTERING, progress)
                prev_progress = progress
        return filtered_frames

    def __compare_images(self, img_path_a: os.PathLike,
                         img_path_b: os.PathLike) -> float:
        """Return the similarity of two images based on their perceptual hashes.

        The hashes are compared as strings with normalized Levenshtein
        similarity.
        """
        hash_size = self.config['hash_size']
        # Context managers close the image files; they used to stay open
        # until garbage collection.
        with Image.open(img_path_a) as image_a:
            hash_a = imagehash.phash(image_a, hash_size=hash_size)
        with Image.open(img_path_b) as image_b:
            hash_b = imagehash.phash(image_b, hash_size=hash_size)
        return self.__normalized_levenshtein.similarity(str(hash_a), str(hash_b))

    def __process_frames(self, frames: [int],
                         processor: FrameProcessor) -> VideoIndex:
        """OCR each frame and collect the entries the processor produces.

        Progress is reported as ``Stage.PROCESSING`` whenever the rounded
        percentage increases.
        """
        index: VideoIndex = []
        total = len(frames)
        prev_progress = 0
        for done, frame in enumerate(frames, start=1):
            image = self.__preprocess_image(self.__create_frame_path(frame))
            title, text = self.__extract_title(pytesseract.image_to_string(image))

            entry = processor.process_frame(frame, title, text)
            if entry:
                index.append(entry)

            progress = round((done * 100) / total)
            if progress > prev_progress:
                self.progress_callback(Stage.PROCESSING, progress)
                prev_progress = progress
        return index

    def __preprocess_image(self, path: os.PathLike):
        """Load an image and prepare it for OCR.

        Steps: grayscale, adaptive Gaussian threshold, 3x3 median blur.
        """
        img = cv.imread(path)
        img = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
        img = cv.adaptiveThreshold(img, 255, cv.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv.THRESH_BINARY, 11, 2)
        img = cv.medianBlur(img, 3)
        return img

    def __extract_title(self, text) -> Tuple[str, str]:
        """Split OCR output into ``(title, remaining_text)``.

        Whitespace-only lines and lines of at most one character are
        dropped. The first surviving line is the title (``None`` when there
        is none); the rest is joined back with newlines.
        """
        lines = [
            line for line in text.strip().split('\n')
            if not line.isspace() and len(line) > 1
        ]
        title = lines[0] if lines else None
        text = '\n'.join(lines[1:])
        return title, text

    def __create_frame_path(self, frame) -> str:
        """Return the path of the PNG file for frame number ``frame``."""
        return os.path.join(FRAMES_DIR, f"{FRAME_PREFIX}{frame}.png")
# 4-gram distance between the two sample strings defined earlier (s1, s2).
fourgram = NGram(4)
print(fourgram.distance(s1, s2))

# Jaro-Winkler similarity of 'My string' against two scrambled variants.
jarowinkler = JaroWinkler()
print(*(jarowinkler.similarity('My string', variant)
        for variant in ('My tsring', 'My ntrisg')),
      sep='\n')

# Optimal string alignment distance.
optimal_string_alignment = OptimalStringAlignment()
print(optimal_string_alignment.distance('CA', 'ABC'))

# Damerau distance of 'ABCDEF' to several edits of it.
damerau = Damerau()
print(*(damerau.distance('ABCDEF', variant)
        for variant in ('ABDCEF', 'BACDFE', 'ABCDE', 'BCDEF', 'ABCGDEF', 'POIU')),
      sep='\n')

# Normalized Levenshtein: the same distance, then the same similarity,
# each printed three times.
normalized_levenshtein = NormalizedLevenshtein()
print(*(normalized_levenshtein.distance('My string', 'My $string')
        for _ in range(3)),
      sep='\n')
print(*(normalized_levenshtein.similarity('My string', 'My $string')
        for _ in range(3)),
      sep='\n')

# Plain Levenshtein distance, printed three times.
levenshtein = Levenshtein()
print(*(levenshtein.distance('My string', 'My $string') for _ in range(3)),
      sep='\n')