def test_file_area():
    def open_file(err_msg, files_to_open):
        assert not err_msg

        wl_file_area.Wl_Worker_Open_Files(
            main,
            dialog_progress=wl_dialogs_misc.Wl_Dialog_Progress(main, text=''),
            update_gui=update_gui,
            files_to_open=files_to_open
        ).run()

    def update_gui(err_msg, new_files):
        assert not err_msg

        main.settings_custom['file_area']['files_open'].extend(new_files)

    # Reset custom settings
    main.settings_custom = copy.deepcopy(main.settings_default)

    # Clean cached files
    for file in glob.glob('imports/*.*'):
        os.remove(file)

    for file_path in glob.glob('wl_tests_files/wl_file_area/work_area/*.txt'):
        time_start = time.time()

        print(f'Loading file "{os.path.split(file_path)[1]}"... ', end='')

        table = QObject()
        table.files_to_open = []

        wl_file_area.Wl_Worker_Add_Files(
            main,
            dialog_progress=wl_dialogs_misc.Wl_Dialog_Progress(main, text=''),
            update_gui=open_file,
            file_paths=[file_path],
            table=table
        ).run()

        new_file = main.settings_custom['file_area']['files_open'][-1]

        assert new_file['selected']
        assert new_file['name'] == new_file['name_old'] == os.path.splitext(os.path.split(file_path)[-1])[0]
        assert new_file['path'] == wl_misc.get_normalized_path(file_path).replace(os.path.join('wl_tests_files', 'wl_file_area', 'work_area'), 'imports')
        assert new_file['path_original'] == wl_misc.get_normalized_path(file_path)
        assert new_file['encoding'] == 'utf_8'
        assert new_file['lang'] == re.search(r'(?<=\[)[a-z_]+(?=\])', file_path).group()
        assert new_file['tokenized'] == 'No'
        assert new_file['tagged'] == 'No'

        print(f'done! (In {round(time.time() - time_start, 2)} seconds)')

    # Save Settings
    with open('wl_tests/wl_settings.pickle', 'wb') as f:
        pickle.dump(main.settings_custom, f)
def check_files_unsupported(main, files):
    files_unsupported = []
    files_ok = []

    file_exts = [
        ext
        for file_type in main.settings_global['file_types']['files']
        for ext in re.findall(r'(?<=\*)\.[a-z]+', file_type)
    ]

    if files:
        # Wordless files
        if type(files[0]) == dict:
            for file in files:
                if os.path.splitext(file['path'])[1].lower() not in file_exts:
                    files_unsupported.append(file)
                else:
                    files_ok.append(file)
        # File paths
        elif type(files[0]) == str:
            for file_path in files:
                file_path = wl_misc.get_normalized_path(file_path)

                if os.path.splitext(file_path)[1].lower() not in file_exts:
                    files_unsupported.append(file_path)
                else:
                    files_ok.append(file_path)

    return files_ok, files_unsupported
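# Hedged usage sketch (not part of the original module): shows how the
# r'(?<=\*)\.[a-z]+' pattern above pulls extensions out of Qt-style file type
# filter strings; the filter strings here are made-up examples.
def _demo_extract_file_exts():
    import re

    file_types = ['Text Files (*.txt)', 'Excel Workbooks (*.xlsx;*.xls)']

    return [
        ext
        for file_type in file_types
        for ext in re.findall(r'(?<=\*)\.[a-z]+', file_type)
    ]

# _demo_extract_file_exts() == ['.txt', '.xlsx', '.xls']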
def check_file_paths_parsing_error(main, file_paths):
    file_paths_parsing_error = []
    file_paths_pass = []

    if file_paths:
        for file_path in file_paths:
            file_path = wl_misc.get_normalized_path(file_path)

            if os.path.splitext(file_path)[1] in ['.txt', '.csv', '.htm', '.html', '.xml', '.tmx']:
                if main.settings_custom['files']['auto_detection_settings']['detect_encodings']:
                    encoding = wl_detection.detect_encoding(main, file_path)
                else:
                    encoding = main.settings_custom['auto_detection']['default_settings']['default_encoding']

                try:
                    text = ''

                    with open(file_path, 'r', encoding=encoding) as f:
                        for line in f:
                            text += line
                except:
                    file_paths_parsing_error.append(file_path)
                else:
                    file_paths_pass.append(file_path)
            else:
                file_paths_pass.append(file_path)

    return file_paths_pass, file_paths_parsing_error
def browse_stop_words(self):
    path_file = QFileDialog.getExistingDirectory(
        self,
        self.tr('Select Folder'),
        self.settings_custom['stop_words']['default_path']
    )

    if path_file:
        self.line_edit_exp_stop_words_default_path.setText(wl_misc.get_normalized_path(path_file))
def browse_temp_files(self):
    path_file = QFileDialog.getExistingDirectory(
        self.main,
        self.tr('Select Folder'),
        self.settings_custom['temp_files']['default_path']
    )

    if path_file:
        self.line_edit_imp_temp_files_default_path.setText(wl_misc.get_normalized_path(path_file))
def __init__(self):
    super().__init__(QPixmap(wl_misc.get_normalized_path('imgs/wl_loading.png')))

    msg_font = QFont('Times New Roman')
    msg_font.setPixelSize(14)

    self.setFont(msg_font)
    self.show_message(self.tr('Initializing Wordless ...'))
def check_files_parsing_error(main, files):
    files_parsing_error = []
    files_ok = []

    if files:
        # Wordless files
        if type(files[0]) == dict:
            for file in files:
                file_path = file['path']

                if os.path.splitext(file_path)[1] in ['.csv', '.htm', '.html', '.xml', '.tmx', '.lrc']:
                    try:
                        with open(file_path, 'r', encoding=file['encoding']) as f:
                            for line in f:
                                pass
                    except:
                        files_parsing_error.append(file)
                    else:
                        files_ok.append(file)
                else:
                    files_ok.append(file)
        # File paths
        elif type(files[0]) == str:
            for file_path in files:
                file_path = wl_misc.get_normalized_path(file_path)

                if os.path.splitext(file_path)[1] in ['.csv', '.htm', '.html', '.xml', '.tmx', '.lrc']:
                    if main.settings_custom['files']['auto_detection_settings']['detect_encodings']:
                        encoding, _ = wl_detection.detect_encoding(main, file_path)
                    else:
                        encoding = main.settings_custom['auto_detection']['default_settings']['default_encoding']

                    try:
                        with open(file_path, 'r', encoding=encoding) as f:
                            for line in f:
                                pass
                    except:
                        files_parsing_error.append(file_path)
                    else:
                        files_ok.append(file_path)
                else:
                    files_ok.append(file_path)

    return files_ok, files_parsing_error
def __init__(self, main, title, width = 0, height = 0):
    super().__init__(main)

    self.main = main

    if width:
        self.setFixedWidth(width)
    if height:
        self.setFixedHeight(height)

    self.setWindowTitle(title)
    self.setWindowIcon(QIcon(wl_misc.get_normalized_path('imgs/wl_icon.ico')))

    self.setWindowFlag(Qt.MSWindowsFixedSizeDialogHint, True)
    self.setWindowFlag(Qt.WindowContextHelpButtonHint, False)
def check_file_paths_missing(main, file_paths):
    file_paths_missing = []
    file_paths_pass = []

    if file_paths:
        for file_path in file_paths:
            file_path = wl_misc.get_normalized_path(file_path)

            if os.path.exists(file_path):
                file_paths_pass.append(file_path)
            else:
                file_paths_missing.append(file_path)

    return file_paths_pass, file_paths_missing
def check_file_paths_empty(main, file_paths):
    file_paths_ok = []
    file_paths_empty = []

    if file_paths:
        for i, file_path in enumerate(file_paths):
            file_path = wl_misc.get_normalized_path(file_path)

            if os.stat(file_path).st_size:
                file_paths_ok.append(file_path)
            else:
                file_paths_empty.append(file_path)

    return file_paths_ok, file_paths_empty
def __init__(self, main, title, width=0, height=0):
    super().__init__(main)

    self.main = main

    if width:
        self.setFixedWidth(width)
    if height:
        self.setFixedHeight(height)

    self.setWindowTitle(title)
    self.setWindowIcon(QIcon(wl_misc.get_normalized_path('imgs/wl_icon.ico')))

    # Do not use setWindowFlag, which was added in Qt 5.9 (PyQt 5.8 is used on macOS for compatibility with old macOSes)
    self.setWindowFlags(self.windowFlags() | Qt.MSWindowsFixedSizeDialogHint)
    self.setWindowFlags(self.windowFlags() & ~Qt.WindowContextHelpButtonHint)
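# Hedged sketch (not tied to Qt): the two setWindowFlags calls above set one flag
# bit and clear another using bitwise OR and AND-NOT, the same pattern shown here
# with plain integer flags.
FLAG_FIXED_SIZE = 0b01
FLAG_CONTEXT_HELP = 0b10

flags = FLAG_CONTEXT_HELP           # 0b10
flags = flags | FLAG_FIXED_SIZE     # set a flag   -> 0b11
flags = flags & ~FLAG_CONTEXT_HELP  # clear a flag -> 0b01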
def check_file_paths_unsupported(main, file_paths):
    file_paths_unsupported = []
    file_paths_pass = []

    file_exts = [
        ext
        for file_type in main.settings_global['file_types']['files']
        for ext in re.findall(r'(?<=\*)\.[a-z]+', file_type)
    ]

    if file_paths:
        for file_path in file_paths:
            file_path = wl_misc.get_normalized_path(file_path)

            if os.path.splitext(file_path)[1].lower() not in file_exts:
                file_paths_unsupported.append(file_path)
            else:
                file_paths_pass.append(file_path)

    return file_paths_pass, file_paths_unsupported
def check_file_paths_empty(main, file_paths):
    file_paths_empty = []
    file_paths_pass = []

    if file_paths:
        for file_path in file_paths:
            file_path = wl_misc.get_normalized_path(file_path)

            # Text files
            if os.path.splitext(file_path)[1] in ['.txt', '.csv', '.htm', '.html', '.xml', '.tmx']:
                if main.settings_custom['files']['auto_detection_settings']['detect_encodings']:
                    encoding = wl_detection.detect_encoding(main, file_path)
                else:
                    encoding = main.settings_custom['auto_detection']['default_settings']['default_encoding']

                try:
                    with open(file_path, 'r', encoding=encoding) as f:
                        empty_file = True

                        for line in f:
                            if line.strip():
                                empty_file = False

                                break

                        if empty_file:
                            file_paths_empty.append(file_path)
                        else:
                            file_paths_pass.append(file_path)
                except:
                    file_paths_pass.append(file_path)
            # Other file types
            else:
                if os.stat(file_path).st_size:
                    file_paths_pass.append(file_path)
                else:
                    file_paths_empty.append(file_path)

    return file_paths_pass, file_paths_empty
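# Hedged sketch (not part of the original module): the emptiness check above treats
# a text file as empty when no line contains non-whitespace characters. io.StringIO
# stands in for a real file here.
import io

def _is_empty_text(f):
    return not any(line.strip() for line in f)

# _is_empty_text(io.StringIO('\n  \n\t\n')) == True
# _is_empty_text(io.StringIO('\n  text\n')) == False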
def check_files_missing(main, files):
    files_missing = []
    files_ok = []

    if files:
        # Wordless files
        if type(files[0]) == dict:
            for file in files:
                if os.path.exists(file['path']):
                    files_ok.append(file)
                else:
                    files_missing.append(file)
        # File paths
        elif type(files[0]) == str:
            for file_path in files:
                file_path = wl_misc.get_normalized_path(file_path)

                if os.path.exists(file_path):
                    files_ok.append(file_path)
                else:
                    files_missing.append(file_path)

    return files_ok, files_missing
def check_files_duplicate(main, files):
    files_duplicate = []
    files_ok = []

    if files:
        # Wordless files
        if type(files[0]) == dict:
            for file in files:
                if main.wl_files.find_file_by_path(file['path']):
                    files_duplicate.append(file)
                else:
                    files_ok.append(file)
        # File paths
        elif type(files[0]) == str:
            for file_path in files:
                file_path = wl_misc.get_normalized_path(file_path)

                if main.wl_files.find_file_by_path(file_path):
                    files_duplicate.append(file_path)
                else:
                    files_ok.append(file_path)

    return files_ok, files_duplicate
def restart(self):
    if getattr(sys, '_MEIPASS', False):
        if platform.system() == 'Windows':
            subprocess.Popen([wl_misc.get_normalized_path('Wordless.exe')])
        elif platform.system() == 'Darwin':
            subprocess.Popen([wl_misc.get_normalized_path('Wordless')])
        elif platform.system() == 'Linux':
            subprocess.Popen([wl_misc.get_normalized_path('Wordless')])
    else:
        if platform.system() == 'Windows':
            subprocess.Popen(['python', wl_misc.get_normalized_path(__file__)])
        elif platform.system() == 'Darwin':
            subprocess.Popen(['python3', wl_misc.get_normalized_path(__file__)])
        elif platform.system() == 'Linux':
            subprocess.Popen(['python3.8', wl_misc.get_normalized_path(__file__)])

    self.save_settings()

    sys.exit(0)
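# Hedged sketch (not from the original codebase): PyInstaller-frozen builds expose
# the bundle's unpack directory via sys._MEIPASS, so its presence is what restart()
# above uses to tell a frozen build from a plain source checkout.
import sys

def is_frozen_build():
    # getattr returns False when the attribute is absent, i.e. when running from source
    return bool(getattr(sys, '_MEIPASS', False))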
def run(self):
    new_files = []

    if self.file_paths:
        len_file_paths = len(self.file_paths)

        for i, file_path in enumerate(self.file_paths):
            self.progress_updated.emit(self.tr(f'Opening files ... ({i + 1}/{len_file_paths})'))

            default_dir = wl_checking_misc.check_dir(self.main.settings_custom['import']['temp_files']['default_path'])
            default_encoding = self.main.settings_custom['import']['temp_files']['default_encoding']

            file_path = wl_misc.get_normalized_path(file_path)
            file_name, file_ext = os.path.splitext(os.path.basename(file_path))
            file_ext = file_ext.lower()

            # Text files
            if file_ext == '.txt':
                new_files.append(self.main.wl_files._new_file(file_path))
            else:
                if file_ext in ['.docx', '.xlsx', '.xls']:
                    new_path = wl_checking_misc.check_new_path(os.path.join(default_dir, f'{file_name}.txt'))

                    # Word documents
                    if file_ext == '.docx':
                        lines = []

                        with open(new_path, 'w', encoding=default_encoding) as f:
                            doc = docx.Document(file_path)

                            for block in self.iter_block_items(doc):
                                if type(block) == docx.text.paragraph.Paragraph:
                                    f.write(f'{block.text}\n')
                                elif type(block) == docx.table.Table:
                                    for row in self.iter_visual_cells(block):
                                        cells = []

                                        for cell in row:
                                            cells.append(' '.join([item.text for item in self.iter_cell_items(cell)]))

                                        f.write('\t'.join(cells) + '\n')
                    # Excel workbooks
                    elif file_ext == '.xlsx':
                        with open(new_path, 'w', encoding=default_encoding) as f:
                            workbook = openpyxl.load_workbook(file_path, data_only=True)

                            for worksheet_name in workbook.sheetnames:
                                worksheet = workbook[worksheet_name]

                                for row in worksheet.rows:
                                    f.write('\t'.join([(cell.value if cell.value != None else '') for cell in row]) + '\n')

                    new_paths = [new_path]
                else:
                    # Detect encoding
                    if self.main.settings_custom['files']['auto_detection_settings']['detect_encodings']:
                        encoding_code, _ = wl_detection.detect_encoding(self.main, file_path)
                    else:
                        encoding_code = self.main.settings_custom['auto_detection']['default_settings']['default_encoding']

                    # CSV files
                    if file_ext == '.csv':
                        new_path = wl_checking_misc.check_new_path(os.path.join(default_dir, f'{file_name}.txt'))

                        with open(new_path, 'w', encoding=default_encoding) as f:
                            with open(file_path, 'r', newline='', encoding=encoding_code) as f_csv:
                                csv_reader = csv.reader(f_csv)

                                for row in csv_reader:
                                    f.write('\t'.join(row) + '\n')

                        new_paths = [new_path]
                    # HTML files
                    elif file_ext in ['.htm', '.html']:
                        with open(file_path, 'r', encoding=encoding_code) as f:
                            soup = bs4.BeautifulSoup(f.read(), 'lxml')

                        new_path = wl_checking_misc.check_new_path(os.path.join(default_dir, f'{file_name}.txt'))

                        with open(new_path, 'w', encoding=default_encoding) as f:
                            f.write(soup.get_text())

                        new_paths = [new_path]
                    # XML files
                    elif file_ext == '.xml':
                        with open(file_path, 'r', encoding=encoding_code) as f:
                            xml_text = f.read()

                        new_path = wl_checking_misc.check_new_path(os.path.join(default_dir, f'{file_name}.xml'))

                        with open(new_path, 'w', encoding=default_encoding) as f:
                            f.write(xml_text)

                        new_paths = [new_path]
                    # Translation memory files
                    elif file_ext == '.tmx':
                        lines_src = []
                        lines_target = []

                        with open(file_path, 'r', encoding=encoding_code) as f:
                            soup = bs4.BeautifulSoup(f.read(), 'lxml-xml')

                            for tu in soup.find_all('tu'):
                                seg_src, seg_target = tu.find_all('seg')

                                lines_src.append(seg_src.get_text())
                                lines_target.append(seg_target.get_text())

                        path_src = wl_checking_misc.check_new_path(os.path.join(default_dir, f'{file_name}_source.txt'))
                        path_target = wl_checking_misc.check_new_path(os.path.join(default_dir, f'{file_name}_target.txt'))

                        with open(path_src, 'w', encoding=default_encoding) as f:
                            f.write('\n'.join(lines_src))
                            f.write('\n')

                        with open(path_target, 'w', encoding=default_encoding) as f:
                            f.write('\n'.join(lines_target))
                            f.write('\n')

                        new_paths = [path_src, path_target]

                for new_path in new_paths:
                    new_files.append(self.main.wl_files._new_file(new_path, txt=False))

        self.main.settings_custom['import']['files']['default_path'] = wl_misc.get_normalized_dir(self.file_paths[0])

    self.progress_updated.emit(self.tr('Updating table ...'))

    time.sleep(0.1)

    self.worker_done.emit(new_files)
def check_files_empty(main, files):
    files_empty = []
    files_ok = []

    if files:
        # Wordless files
        if type(files[0]) == dict:
            for file in files:
                file_path = file['path']

                # Text files
                if os.path.splitext(file_path)[1] in ['.txt', '.csv', '.htm', '.html', '.xml', '.tmx', '.lrc']:
                    try:
                        with open(file_path, 'r', encoding=file['encoding']) as f:
                            empty_file = True

                            for line in f:
                                if line.strip():
                                    empty_file = False

                                    break

                            if empty_file:
                                files_empty.append(file)
                            else:
                                files_ok.append(file)
                    except:
                        files_ok.append(file)
                # Other file types
                else:
                    if os.stat(file_path).st_size:
                        files_ok.append(file)
                    else:
                        files_empty.append(file)
        # File paths
        elif type(files[0]) == str:
            for file_path in files:
                file_path = wl_misc.get_normalized_path(file_path)

                # Text files
                if os.path.splitext(file_path)[1] in ['.txt', '.csv', '.htm', '.html', '.xml', '.tmx', '.lrc']:
                    if main.settings_custom['files']['auto_detection_settings']['detect_encodings']:
                        encoding, _ = wl_detection.detect_encoding(main, file_path)
                    else:
                        encoding = main.settings_custom['auto_detection']['default_settings']['default_encoding']

                    try:
                        with open(file_path, 'r', encoding=encoding) as f:
                            empty_file = True

                            for line in f:
                                if line.strip():
                                    empty_file = False

                                    break

                            if empty_file:
                                files_empty.append(file_path)
                            else:
                                files_ok.append(file_path)
                    except:
                        files_ok.append(file_path)
                # Other file types
                else:
                    if os.stat(file_path).st_size:
                        files_ok.append(file_path)
                    else:
                        files_empty.append(file_path)

    return files_ok, files_empty
def __init__(self, loading_window):
    super().__init__()

    self.loading_window = loading_window
    self.threads_check_updates = []

    # Version numbers
    self.ver = wl_misc.get_wl_ver()
    self.ver_major, self.ver_minor, self.ver_patch = wl_misc.split_wl_ver(self.ver)

    # Title
    self.setWindowTitle(self.tr('Wordless'))

    # Icon
    self.setWindowIcon(QIcon(wl_misc.get_normalized_path('imgs/wl_icon.ico')))

    self.loading_window.show_message(self.tr('Loading settings ...'))

    # Default settings
    wl_settings_default.init_settings_default(self)

    # Custom settings
    path_settings = wl_misc.get_normalized_path('wl_settings.pickle')

    if os.path.exists(path_settings):
        with open(path_settings, 'rb') as f:
            settings_custom = pickle.load(f)

        if wl_checking_misc.check_custom_settings(settings_custom, self.settings_default):
            self.settings_custom = settings_custom
        else:
            self.settings_custom = copy.deepcopy(self.settings_default)
    else:
        self.settings_custom = copy.deepcopy(self.settings_default)

    # Global settings
    wl_settings_global.init_settings_global(self)

    # Settings
    self.wl_settings = wl_settings.Wl_Settings(self)

    self.loading_window.show_message(self.tr('Initializing main window ...'))

    # Menu
    self.init_menu()

    # Work Area & File Area
    self.init_central_widget()

    # Status Bar
    self.statusBar().showMessage(self.tr('Ready!'))

    self.statusBar().setFixedHeight(22)
    self.statusBar().setStyleSheet('''
        QStatusBar {
            background-color: #D0D0D0;
        }
    ''')

    # Check for updates on startup
    self.loading_window.show_message(self.tr('Check for updates ...'))

    if self.settings_custom['general']['update_settings']['check_updates_on_startup']:
        self.dialog_check_updates = self.help_check_updates(on_startup=True)

    self.loading_window.show_message(self.tr('Starting Wordless ...'))

    self.load_settings()

    # Fix layout on macOS
    if platform.system() == 'Darwin':
        self.fix_macos_layout(self)
def import_list(self, settings):
    files = []

    if os.path.exists(self.main.settings_custom['import'][settings]['default_path']):
        default_dir = self.main.settings_custom['import'][settings]['default_path']
    else:
        default_dir = self.main.settings_default['import'][settings]['default_path']

    file_paths = QFileDialog.getOpenFileNames(
        self.main,
        self.tr('Import from File(s)'),
        default_dir,
        self.tr('Text File (*.txt)')
    )[0]

    if file_paths:
        self.main.settings_custom['import'][settings]['default_path'] = os.path.normpath(os.path.dirname(file_paths[0]))

        # Detect encodings
        if self.main.settings_custom['import'][settings]['detect_encodings']:
            for file_path in file_paths:
                files.append({
                    'path': wl_misc.get_normalized_path(file_path),
                    'encoding': wl_detection.detect_encoding(self.main, file_path)[0]
                })
        else:
            for file_path in file_paths:
                files.append({
                    'path': wl_misc.get_normalized_path(file_path),
                    'encoding': self.main.settings_custom['auto_detection']['default_settings']['default_encoding']
                })

        files_ok, files_empty = wl_checking_file.check_files_empty(self.main, files)
        files_ok, files_decoding_error = wl_checking_file.check_files_decoding_error(self.main, files_ok)

        # Extract file paths
        files_empty = [file['path'] for file in files_empty]
        files_decoding_error = [file['path'] for file in files_decoding_error]

        if files_empty or files_decoding_error:
            wl_dialog_error.wl_dialog_error_import(
                self.main,
                files_empty=files_empty,
                files_decoding_error=files_decoding_error
            )

            wl_msg.wl_msg_import_list_error(self.main)
        else:
            # Check duplicate items
            items_to_import = []
            items_cur = self.get_items()

            num_prev = len(items_cur)

            for file in files_ok:
                with open(file['path'], 'r', encoding=file['encoding']) as f:
                    for line in f:
                        line = line.strip()

                        if line not in items_cur:
                            items_to_import.append(line)

            self.load_items(collections.OrderedDict.fromkeys(items_to_import))
            self.itemChanged.emit(self.item(0))

            wl_msg.wl_msg_import_list_success(self.main, num_prev, len(self.get_items()))
def wl_get_stop_word_list(main, lang, stop_word_list = 'default'):
    if stop_word_list == 'default':
        stop_word_list = main.settings_custom['stop_word_lists']['stop_word_lists'][lang]

    if stop_word_list == main.tr('Custom List'):
        stop_word_list = main.settings_custom['stop_word_lists']['custom_lists'][lang]
    else:
        lang_639_1 = wl_conversion.to_iso_639_1(main, lang)

        # Chinese (Simplified)
        if lang_639_1 == 'zh_cn':
            lang_639_1 = 'zh'

        # Chinese (Traditional)
        if lang_639_1 == 'zh_tw':
            cc = opencc.OpenCC('s2tw')

            stop_word_list_zho_cn = wl_get_stop_word_list(
                main,
                lang = 'zho_cn',
                stop_word_list = stop_word_list.replace('Chinese (Traditional)', 'Chinese (Simplified)')
            )

            stop_word_list = [cc.convert(stop_word) for stop_word in stop_word_list_zho_cn]
        # extra-stopwords
        elif 'extra-stopwords' in stop_word_list:
            LANG_TEXTS = {
                'sqi': 'albanian', 'ara': 'arabic', 'hye': 'armenian', 'eus': 'basque',
                'bel': 'belarusian', 'ben': 'bengali', 'bul': 'bulgarian', 'cat': 'catalan',
                'zho_cn': 'chinese', 'hrv': 'croatian', 'ces': 'czech', 'dan': 'danish',
                'nld': 'dutch', 'eng': 'english', 'est': 'estonian', 'fin': 'finnish',
                'fra': 'french', 'glg': 'galician', 'deu': 'german', 'ell': 'greek',
                'hau': 'hausa', 'heb': 'hebrew', 'hin': 'hindi', 'hun': 'hungarian',
                'isl': 'icelandic', 'ind': 'indonesian', 'gle': 'irish', 'ita': 'italian',
                'jpn': 'japanese', 'kor': 'korean', 'kur': 'kurdish', 'lav': 'latvian',
                'lit': 'lithuanian', 'msa': 'malay', 'mar': 'marathi', 'mon': 'mongolian',
                'nep': 'nepali',
                # Norwegian Bokmål & Norwegian Nynorsk
                'nob': 'norwegian', 'nno': 'norwegian',
                'fas': 'persian', 'pol': 'polish', 'por': 'portuguese', 'ron': 'romanian',
                'rus': 'russian', 'srp_cyrl': 'serbian-cyrillic', 'srp_latn': 'serbian',
                'slk': 'slovak', 'slv': 'slovenian', 'spa': 'spanish', 'swa': 'swahili',
                'swe': 'swedish', 'tgl': 'tagalog', 'tel': 'telugu', 'tha': 'thai',
                'tur': 'turkish', 'ukr': 'ukranian', 'urd': 'urdu', 'vie': 'vietnamese',
                'yor': 'yoruba'
            }

            with open(wl_misc.get_normalized_path(f'stop_word_lists/extra-stopwords/{LANG_TEXTS[lang]}'), 'r', encoding = 'utf_8') as f:
                stop_word_list = [line.rstrip() for line in f if not line.startswith('#')]
        # NLTK
        elif 'NLTK' in stop_word_list:
            LANG_TEXTS = {
                'ara': 'arabic', 'aze': 'azerbaijani', 'dan': 'danish', 'nld': 'dutch',
                'eng': 'english', 'fin': 'finnish', 'fra': 'french', 'deu': 'german',
                'ell': 'greek', 'hun': 'hungarian', 'ind': 'indonesian', 'ita': 'italian',
                'kaz': 'kazakh', 'nep': 'nepali',
                # Norwegian Bokmål & Norwegian Nynorsk
                'nob': 'norwegian', 'nno': 'norwegian',
                'por': 'portuguese', 'ron': 'romanian', 'rus': 'russian', 'slv': 'slovene',
                'spa': 'spanish', 'swe': 'swedish', 'tgk': 'tajik', 'tur': 'turkish'
            }

            stop_word_list = nltk.corpus.stopwords.words(LANG_TEXTS[lang])
        # spaCy
        elif 'spaCy' in stop_word_list:
            # Serbian (Cyrillic) & Serbian (Latin)
            if lang_639_1 == 'sr_cyrl':
                spacy_lang = importlib.import_module('spacy.lang.sr')

                stop_word_list = spacy_lang.STOP_WORDS
            elif lang_639_1 == 'sr_latn':
                spacy_lang = importlib.import_module('spacy.lang.sr')

                stop_word_list = spacy_lang.STOP_WORDS
                stop_word_list = wl_text_utils.to_srp_latn(stop_word_list)
            else:
                spacy_lang = importlib.import_module(f'spacy.lang.{lang_639_1}')

                stop_word_list = spacy_lang.STOP_WORDS
        # Stopwords ISO
        elif 'Stopwords ISO' in stop_word_list:
            # Greek (Ancient)
            if lang_639_1 == 'grc':
                lang_639_1 = 'el'

            # Norwegian Bokmål & Norwegian Nynorsk
            if lang_639_1 in ['nb', 'nn']:
                lang_639_1 = 'no'

            with open(wl_misc.get_normalized_path('stop_word_lists/Stopwords ISO/stopwords_iso.json'), 'r', encoding = 'utf_8') as f:
                stop_word_list = json.load(f)[lang_639_1]
        # Thai
        elif stop_word_list == main.tr('PyThaiNLP - Thai Stop Word List'):
            stop_word_list = pythainlp.corpus.common.thai_stopwords()

    # Remove empty tokens
    stop_word_list = [stop_word for stop_word in stop_word_list if stop_word]

    return sorted(set(stop_word_list))
def wl_lemmatize(main, tokens, lang, text_type = ('untokenized', 'untagged'), lemmatizer = 'default'):
    empty_offsets = []
    mapping_lemmas = {}
    lemmas = []

    tokens = [str(token) for token in tokens]

    re_tags_all = wl_matching.get_re_tags(main, tags = 'all')
    re_tags_pos = wl_matching.get_re_tags(main, tags = 'pos')
    re_tags_non_pos = wl_matching.get_re_tags(main, tags = 'non_pos')

    if text_type[1] == 'tagged_both':
        tags = [''.join(re.findall(re_tags_all, token)) for token in tokens]
        tokens = [re.sub(re_tags_all, '', token) for token in tokens]
    elif text_type[1] == 'tagged_pos':
        tags = [''.join(re.findall(re_tags_pos, token)) for token in tokens]
        tokens = [re.sub(re_tags_pos, '', token) for token in tokens]
    elif text_type[1] == 'tagged_non_pos':
        tags = [''.join(re.findall(re_tags_non_pos, token)) for token in tokens]
        tokens = [re.sub(re_tags_non_pos, '', token) for token in tokens]
    else:
        tags = [''] * len(tokens)

    # Record empty tokens
    for i, token in reversed(list(enumerate(tokens))):
        if not token.strip():
            empty_offsets.append(i)

            tokens.remove(token)

    wl_text_utils.check_lemmatizers(main, lang)

    if tokens and lang in main.settings_global['lemmatizers']:
        if lemmatizer == 'default':
            lemmatizer = main.settings_custom['lemmatization']['lemmatizers'][lang]

        # Dutch, English, French, German, Greek (Modern), Italian, Portuguese, Spanish
        if 'spaCy' in lemmatizer:
            nlp = main.__dict__[f'spacy_nlp_{lang}']
            doc = spacy.tokens.Doc(nlp.vocab, words = tokens)
            nlp.tagger(doc)

            lemmas = [token.lemma_ for token in doc]
        # English
        elif lemmatizer == main.tr('NLTK - WordNet Lemmatizer'):
            word_net_lemmatizer = nltk.WordNetLemmatizer()

            for token, pos in wl_pos_tagging.wl_pos_tag(
                main, tokens,
                lang = 'eng',
                pos_tagger = 'NLTK - Perceptron POS Tagger',
                tagset = 'universal'
            ):
                if pos == 'ADJ':
                    lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADJ))
                elif pos in ['NOUN', 'PROPN']:
                    lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.NOUN))
                elif pos == 'ADV':
                    lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADV))
                elif pos in ['VERB', 'AUX']:
                    lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.VERB))
                else:
                    lemmas.append(word_net_lemmatizer.lemmatize(token))
        # Greek (Ancient)
        elif lemmatizer == main.tr('lemmalist-greek - Greek (Ancient) Lemma List'):
            with open(wl_misc.get_normalized_path('lemmatization/lemmalist-greek/lemmalist-greek.txt'), 'r', encoding = 'utf_8') as f:
                for line in f.readlines():
                    line = line.rstrip()

                    if line:
                        lemma, *words = line.split()

                        for word in words:
                            mapping_lemmas[word] = lemma
        # Russian & Ukrainian
        elif lemmatizer == main.tr('pymorphy2 - Morphological Analyzer'):
            if lang == 'rus':
                morphological_analyzer = pymorphy2.MorphAnalyzer(lang = 'ru')
            else:
                morphological_analyzer = pymorphy2.MorphAnalyzer(lang = 'uk')

            for token in tokens:
                lemmas.append(morphological_analyzer.parse(token)[0].normal_form)
        # Tibetan
        elif lemmatizer == main.tr('botok - Tibetan Lemmatizer'):
            wl_text_utils.check_word_tokenizers(main, lang = 'bod')

            tokens = main.botok_word_tokenizer.tokenize(' '.join(tokens))

            for token in tokens:
                if token.lemma:
                    lemmas.append(token.lemma)
                else:
                    lemmas.append(token.text)
        # Other Languages
        elif 'Lemmatization Lists' in lemmatizer:
            lang = wl_conversion.to_iso_639_1(main, lang)

            with open(wl_misc.get_normalized_path(f'lemmatization/Lemmatization Lists/lemmatization-{lang}.txt'), 'r', encoding = 'utf_8_sig') as f:
                for line in f:
                    try:
                        lemma, word = line.rstrip().split('\t')

                        mapping_lemmas[word] = lemma
                    except:
                        pass
    else:
        lemmas = tokens

    if mapping_lemmas:
        lemmas = [mapping_lemmas.get(token, token) for token in tokens]

    # Insert empty lemmas
    for empty_offset in sorted(empty_offsets):
        lemmas.insert(empty_offset, '')

    return [lemma + tag for lemma, tag in zip(lemmas, tags)]
def wl_lemmatize_tokens(main, tokens, lang, tokenized, tagged, lemmatizer):
    empty_offsets = []
    lemmas = []

    tokens = [str(token) for token in tokens]

    re_tags = wl_matching.get_re_tags(main, tag_type = 'body')

    if tagged == _tr('wl_lemmatize_tokens', 'Yes'):
        tags = [''.join(re.findall(re_tags, token)) for token in tokens]
        tokens = [re.sub(re_tags, '', token) for token in tokens]
    else:
        tags = [''] * len(tokens)

    # Record empty tokens with their tags
    for i, token in reversed(list(enumerate(tokens))):
        if not token.strip():
            empty_offsets.append(i)

            del tokens[i]
            del tags[i]

    # spaCy
    if 'spacy' in lemmatizer:
        if not lang.startswith('srp_'):
            lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        nlp = main.__dict__[f'spacy_nlp_{lang}']

        if lang != 'jpn':
            doc = spacy.tokens.Doc(nlp.vocab, words = tokens, spaces = [False] * len(tokens))

            for pipe_name in nlp.pipe_names:
                nlp.get_pipe(pipe_name)(doc)
        # The Japanese model does not have a lemmatizer component and Japanese lemmas are taken directly from SudachiPy
        # See: https://github.com/explosion/spaCy/discussions/9983#discussioncomment-1923647
        else:
            doc = nlp(''.join(tokens))

        lemma_tokens = [token.text for token in doc]
        lemmas = [token.lemma_ for token in doc]
    # English
    elif lemmatizer == 'nltk_wordnet':
        word_net_lemmatizer = nltk.WordNetLemmatizer()

        for token, pos in wl_pos_tagging.wl_pos_tag(
            main, tokens,
            lang = 'eng_us',
            pos_tagger = 'nltk_perceptron',
            tagset = 'universal'
        ):
            if pos == 'ADJ':
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADJ))
            elif pos in ['NOUN', 'PROPN']:
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.NOUN))
            elif pos == 'ADV':
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADV))
            elif pos in ['VERB', 'AUX']:
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.VERB))
            else:
                lemmas.append(word_net_lemmatizer.lemmatize(token))

        lemma_tokens = tokens.copy()
    # Japanese
    elif lemmatizer == 'sudachipy_jpn':
        tokens_retokenized = main.sudachipy_word_tokenizer.tokenize(''.join(tokens))

        lemma_tokens = [token.surface() for token in tokens_retokenized]
        lemmas = [token.dictionary_form() for token in tokens_retokenized]
    # Russian & Ukrainian
    elif lemmatizer == 'pymorphy2_morphological_analyzer':
        if lang == 'rus':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_rus
        elif lang == 'ukr':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_ukr

        for token in tokens:
            lemmas.append(morphological_analyzer.parse(token)[0].normal_form)

        lemma_tokens = tokens.copy()
    # Tibetan
    elif lemmatizer == 'botok_bod':
        lemma_tokens = []
        tokens_retokenized = main.botok_word_tokenizer.tokenize(''.join(tokens))

        for token in tokens_retokenized:
            if token.lemma:
                lemmas.append(token.lemma)
            else:
                lemmas.append(token.text)

            lemma_tokens.append(token.text)
    # Lemmatization Lists
    elif 'lemmatization_lists' in lemmatizer:
        mapping_lemmas = {}

        lang = wl_conversion.to_iso_639_1(main, lang)
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        with open(wl_misc.get_normalized_path(f'lemmatization/Lemmatization Lists/lemmatization-{lang}.txt'), 'r', encoding = 'utf_8_sig') as f:
            for line in f:
                try:
                    lemma, word = line.rstrip().split('\t')

                    mapping_lemmas[word] = lemma
                except ValueError:
                    pass

        lemma_tokens = tokens.copy()
        lemmas = [mapping_lemmas.get(token, token) for token in tokens]

    # Remove empty lemmas and strip whitespace in tokens
    for i, lemma in reversed(list(enumerate(lemmas))):
        lemma_tokens[i] = lemma_tokens[i].strip()
        lemmas[i] = lemma.strip()

        if not lemmas[i]:
            del lemmas[i]
            del lemma_tokens[i]

    # Make sure that tokenization is not modified during lemmatization
    i_tokens = 0
    i_lemmas = 0

    len_tokens = len(tokens)
    len_lemmas = len(lemmas)

    if len_tokens != len_lemmas:
        tags_modified = []
        lemmas_modified = []

        while i_tokens < len_tokens and i_lemmas < len_lemmas:
            # Different token
            if len(tokens[i_tokens]) != len(lemma_tokens[i_lemmas]):
                tokens_temp = [tokens[i_tokens]]
                tags_temp = [tags[i_tokens]]
                lemma_tokens_temp = [lemma_tokens[i_lemmas]]
                lemmas_temp = [lemmas[i_lemmas]]

                # Align tokens
                while i_tokens < len_tokens - 1 or i_lemmas < len_lemmas - 1:
                    len_tokens_temp = sum([len(token) for token in tokens_temp])
                    len_lemma_tokens_temp = sum([len(token) for token in lemma_tokens_temp])

                    if len_tokens_temp > len_lemma_tokens_temp:
                        lemma_tokens_temp.append(lemma_tokens[i_lemmas + 1])
                        lemmas_temp.append(lemmas[i_lemmas + 1])

                        i_lemmas += 1
                    elif len_tokens_temp < len_lemma_tokens_temp:
                        tokens_temp.append(tokens[i_tokens + 1])
                        tags_temp.append(tags[i_tokens + 1])

                        i_tokens += 1
                    else:
                        # Use lemmas in one-to-one
                        if len(tokens_temp) == len(lemma_tokens_temp):
                            tags_modified.extend(tags_temp)
                            lemmas_modified.extend(lemmas_temp)
                        # Use original tokens in many-to-one or one-to-many
                        else:
                            tags_modified.extend(tags)
                            lemmas_modified.extend(tokens_temp)

                        tokens_temp = []
                        tags_temp = []
                        lemma_tokens_temp = []
                        lemmas_temp = []

                        break

                if tokens_temp:
                    # Use lemmas in one-to-one
                    if len(tokens_temp) == len(lemma_tokens_temp):
                        tags_modified.extend(tags_temp)
                        lemmas_modified.extend(lemmas_temp)
                    # Use original tokens in many-to-one or one-to-many
                    else:
                        tags_modified.extend(tags)
                        lemmas_modified.extend(tokens_temp)
            else:
                tags_modified.extend(tags[i_tokens])
                lemmas_modified.append(lemmas[i_lemmas])

            i_tokens += 1
            i_lemmas += 1

        len_lemmas_modified = len(lemmas_modified)

        if len_tokens < len_lemmas_modified:
            tags = tags_modified[:len_tokens]
            lemmas = lemmas_modified[:len_tokens]
        elif len_tokens > len_lemmas_modified:
            tags = tags_modified + [tags_modified[-1]] * (len_tokens - len_lemmas_modified)
            lemmas = lemmas_modified + [lemmas_modified[-1]] * (len_tokens - len_lemmas_modified)
        else:
            tags = tags_modified.copy()
            lemmas = lemmas_modified.copy()

    # Insert empty lemmas and their tags after alignment of input and output
    for empty_offset in sorted(empty_offsets):
        lemmas.insert(empty_offset, '')
        tags.insert(empty_offset, '')

    return [lemma + tag for lemma, tag in zip(lemmas, tags)]
def run(self):
    new_files = []
    files_detection_error_encoding = []
    files_detection_error_text_type = []
    files_detection_error_lang = []

    if self.file_paths:
        len_file_paths = len(self.file_paths)

        for i, file_path in enumerate(self.file_paths):
            self.progress_updated.emit(self.tr(f'Opening files ... ({i + 1}/{len_file_paths})'))

            default_dir = wl_checking_misc.check_dir(self.main.settings_custom['import']['temp_files']['default_path'])
            default_encoding = self.main.settings_custom['import']['temp_files']['default_encoding']

            file_path = wl_misc.get_normalized_path(file_path)
            file_name, file_ext = os.path.splitext(os.path.basename(file_path))
            file_ext = file_ext.lower()

            # Text files
            if file_ext == '.txt':
                (new_file, detection_success_encoding,
                 detection_success_text_type, detection_success_lang) = self.main.wl_files._new_file(file_path)

                new_files.append(new_file)

                if not detection_success_encoding:
                    files_detection_error_encoding.append(new_file['path'])
                if not detection_success_text_type:
                    files_detection_error_text_type.append(new_file['path'])
                if not detection_success_lang:
                    files_detection_error_lang.append(new_file['path'])
            else:
                if file_ext in ['.docx', '.xlsx', '.xls']:
                    new_path = wl_checking_misc.check_new_path(os.path.join(default_dir, f'{file_name}.txt'))

                    # Word documents
                    if file_ext == '.docx':
                        lines = []

                        with open(new_path, 'w', encoding=default_encoding) as f:
                            doc = docx.Document(file_path)

                            for block in self.iter_block_items(doc):
                                if type(block) == docx.text.paragraph.Paragraph:
                                    f.write(f'{block.text}\n')
                                elif type(block) == docx.table.Table:
                                    for row in self.iter_visual_cells(block):
                                        cells = []

                                        for cell in row:
                                            cells.append(' '.join([item.text for item in self.iter_cell_items(cell)]))

                                        f.write('\t'.join(cells) + '\n')
                    # Excel workbooks
                    elif file_ext == '.xlsx':
                        with open(new_path, 'w', encoding=default_encoding) as f:
                            workbook = openpyxl.load_workbook(file_path, data_only=True)

                            for worksheet_name in workbook.sheetnames:
                                worksheet = workbook[worksheet_name]

                                for row in worksheet.rows:
                                    f.write('\t'.join([(cell.value if cell.value != None else '') for cell in row]) + '\n')
                    elif file_ext == '.xls':
                        with open(new_path, 'w', encoding=default_encoding) as f:
                            workbook = xlrd.open_workbook(file_path)

                            for i_sheet in range(workbook.nsheets):
                                worksheet = workbook.sheet_by_index(i_sheet)

                                for row in range(worksheet.nrows):
                                    f.write('\t'.join([worksheet.cell_value(row, col) for col in range(worksheet.ncols)]) + '\n')

                    new_paths = [new_path]
                else:
                    # Detect encoding
                    if self.main.settings_custom['files']['auto_detection_settings']['detect_encodings']:
                        encoding_code, _ = wl_detection.detect_encoding(self.main, file_path)
                    else:
                        encoding_code = self.main.settings_custom['auto_detection']['default_settings']['default_encoding']

                    # CSV files
                    if file_ext == '.csv':
                        new_path = wl_checking_misc.check_new_path(os.path.join(default_dir, f'{file_name}.txt'))

                        with open(new_path, 'w', encoding=default_encoding) as f:
                            with open(file_path, 'r', newline='', encoding=encoding_code) as f_csv:
                                csv_reader = csv.reader(f_csv)

                                for row in csv_reader:
                                    f.write('\t'.join(row) + '\n')

                        new_paths = [new_path]
                    # HTML files
                    elif file_ext in ['.htm', '.html']:
                        with open(file_path, 'r', encoding=encoding_code) as f:
                            soup = bs4.BeautifulSoup(f.read(), 'lxml')

                        new_path = wl_checking_misc.check_new_path(os.path.join(default_dir, f'{file_name}.txt'))

                        with open(new_path, 'w', encoding=default_encoding) as f:
                            f.write(soup.get_text())

                        new_paths = [new_path]
                    # XML files
                    elif file_ext == '.xml':
                        with open(file_path, 'r', encoding=encoding_code) as f:
                            xml_text = f.read()

                        new_path = wl_checking_misc.check_new_path(os.path.join(default_dir, f'{file_name}.txt'))

                        with open(new_path, 'w', encoding=default_encoding) as f:
                            f.write(xml_text)

                        new_paths = [new_path]
                    # Translation memory files
                    elif file_ext == '.tmx':
                        lines_src = []
                        lines_target = []

                        with open(file_path, 'r', encoding=encoding_code) as f:
                            soup = bs4.BeautifulSoup(f.read(), 'lxml-xml')

                            for tu in soup.find_all('tu'):
                                seg_src, seg_target = tu.find_all('seg')

                                lines_src.append(seg_src.get_text())
                                lines_target.append(seg_target.get_text())

                        path_src = wl_checking_misc.check_new_path(os.path.join(default_dir, f'{file_name}_source.txt'))
                        path_target = wl_checking_misc.check_new_path(os.path.join(default_dir, f'{file_name}_target.txt'))

                        with open(path_src, 'w', encoding=default_encoding) as f:
                            f.write('\n'.join(lines_src))
                            f.write('\n')

                        with open(path_target, 'w', encoding=default_encoding) as f:
                            f.write('\n'.join(lines_target))
                            f.write('\n')

                        new_paths = [path_src, path_target]
                    # Lyrics files
                    elif file_ext == '.lrc':
                        lyrics = {}

                        with open(file_path, 'r', encoding=encoding_code) as f:
                            for line in f:
                                time_tags = []

                                line = line.strip()

                                # Strip time tags
                                while re.search(r'^\[[^\]]+?\]', line):
                                    time_tags.append(re.search(r'^\[[^\]]+?\]', line).group())

                                    line = line[len(time_tags[-1]):].strip()

                                # Strip word time tags
                                line = re.sub(r'<[^>]+?>', r'', line)
                                line = re.sub(r'\s{2,}', r' ', line).strip()

                                for time_tag in time_tags:
                                    if re.search(r'^\[[0-9]{2}:[0-5][0-9]\.[0-9]{2}\]$', time_tag):
                                        lyrics[time_tag] = line

                        new_path = wl_checking_misc.check_new_path(os.path.join(default_dir, f'{file_name}.txt'))

                        with open(new_path, 'w', encoding=default_encoding) as f:
                            for _, lyrics in sorted(lyrics.items()):
                                f.write(f'{lyrics}\n')

                        new_paths = [new_path]

                for new_path in new_paths:
                    (new_file, detection_success_encoding,
                     detection_success_text_type, detection_success_lang) = self.main.wl_files._new_file(new_path)

                    new_files.append(new_file)

                    if not detection_success_encoding:
                        files_detection_error_encoding.append(new_file['path'])
                    if not detection_success_text_type:
                        files_detection_error_text_type.append(new_file['path'])
                    if not detection_success_lang:
                        files_detection_error_lang.append(new_file['path'])

        self.main.settings_custom['import']['files']['default_path'] = wl_misc.get_normalized_dir(self.file_paths[0])

    self.progress_updated.emit(self.tr('Updating table ...'))

    time.sleep(0.1)

    self.worker_done.emit(
        new_files,
        files_detection_error_encoding,
        files_detection_error_text_type,
        files_detection_error_lang
    )
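# Hedged sketch (not from the original worker): shows how the regexes above split an
# LRC line into its leading time tags and the lyric text, and how only standard
# [mm:ss.xx] tags are kept. The sample line is made up.
import re

def _parse_lrc_line(line):
    time_tags = []
    line = line.strip()

    # Strip leading time tags such as [01:23.45]
    while re.search(r'^\[[^\]]+?\]', line):
        time_tags.append(re.search(r'^\[[^\]]+?\]', line).group())
        line = line[len(time_tags[-1]):].strip()

    # Keep only standard [mm:ss.xx] tags
    time_tags = [tag for tag in time_tags if re.search(r'^\[[0-9]{2}:[0-5][0-9]\.[0-9]{2}\]$', tag)]

    return time_tags, line

# _parse_lrc_line('[00:12.00][00:42.50]Hello world') == (['[00:12.00]', '[00:42.50]'], 'Hello world')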
def wl_lemmatize_text(main, text, lang, tokenized, tagged, lemmatizer):
    lemmas = []

    # spaCy
    if lemmatizer.startswith('spacy_'):
        if not lang.startswith('srp_'):
            lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)

        lemmas = [token.lemma_ for token in doc]
    # English
    elif lemmatizer == 'nltk_wordnet':
        word_net_lemmatizer = nltk.WordNetLemmatizer()

        for token, pos in wl_pos_tagging.wl_pos_tag(
            main, text,
            lang = 'eng_us',
            pos_tagger = 'nltk_perceptron',
            tagset = 'universal'
        ):
            if pos == 'ADJ':
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADJ))
            elif pos in ['NOUN', 'PROPN']:
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.NOUN))
            elif pos == 'ADV':
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADV))
            elif pos in ['VERB', 'AUX']:
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.VERB))
            else:
                lemmas.append(word_net_lemmatizer.lemmatize(token))
    # Japanese
    elif lemmatizer == 'sudachipy_jpn':
        lemmas = [token.dictionary_form() for token in main.sudachipy_word_tokenizer.tokenize(text)]
    # Russian & Ukrainian
    elif lemmatizer == 'pymorphy2_morphological_analyzer':
        if lang == 'rus':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_rus
        elif lang == 'ukr':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_ukr

        tokens = wl_word_tokenization.wl_word_tokenize_flat(main, text, lang = lang)

        for token in tokens:
            lemmas.append(morphological_analyzer.parse(token)[0].normal_form)
    # Tibetan
    elif lemmatizer == 'botok_bod':
        tokens = main.botok_word_tokenizer.tokenize(text)

        for token in tokens:
            if token.lemma:
                lemmas.append(token.lemma)
            else:
                lemmas.append(token.text)
    # Lemmatization Lists
    elif lemmatizer.startswith('lemmatization_lists_'):
        mapping_lemmas = {}

        lang = wl_conversion.to_iso_639_1(main, lang)
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        with open(wl_misc.get_normalized_path(f'lemmatization/Lemmatization Lists/lemmatization-{lang}.txt'), 'r', encoding = 'utf_8_sig') as f:
            for line in f:
                try:
                    lemma, word = line.rstrip().split('\t')

                    mapping_lemmas[word] = lemma
                except ValueError:
                    pass

        tokens = wl_word_tokenization.wl_word_tokenize_flat(main, text, lang = lang)

        lemmas = [mapping_lemmas.get(token, token) for token in tokens]

    # Remove empty lemmas and strip whitespace in tokens
    lemmas = [str(lemma).strip() for lemma in lemmas if str(lemma).strip()]

    return lemmas
def get_path(file_name):
    return wl_misc.get_normalized_path(f'wl_tests/files/wl_checking/wl_checking_file/{file_name}')