def test_file_area():
    def open_file(err_msg, files_to_open):
        assert not err_msg

        wl_file_area.Wl_Worker_Open_Files(
            main,
            dialog_progress=wl_dialogs_misc.Wl_Dialog_Progress(main, text=''),
            update_gui=update_gui,
            files_to_open=files_to_open).run()

    def update_gui(err_msg, new_files):
        assert not err_msg

        main.settings_custom['file_area']['files_open'].extend(new_files)

    # Reset custom settings
    main.settings_custom = copy.deepcopy(main.settings_default)

    # Clean cached files
    for file in glob.glob('imports/*.*'):
        os.remove(file)

    for file_path in glob.glob('wl_tests_files/wl_file_area/work_area/*.txt'):
        time_start = time.time()

        print(f'Loading file "{os.path.split(file_path)[1]}"... ', end='')

        table = QObject()
        table.files_to_open = []

        wl_file_area.Wl_Worker_Add_Files(
            main,
            dialog_progress=wl_dialogs_misc.Wl_Dialog_Progress(main, text=''),
            update_gui=open_file,
            file_paths=[file_path],
            table=table).run()

        new_file = main.settings_custom['file_area']['files_open'][-1]

        assert new_file['selected']
        assert new_file['name'] == new_file['name_old'] == os.path.splitext(
            os.path.split(file_path)[-1])[0]
        assert new_file['path'] == wl_misc.get_normalized_path(
            file_path).replace(
                os.path.join('wl_tests_files', 'wl_file_area', 'work_area'),
                'imports')
        assert new_file['path_original'] == wl_misc.get_normalized_path(
            file_path)
        assert new_file['encoding'] == 'utf_8'
        assert new_file['lang'] == re.search(r'(?<=\[)[a-z_]+(?=\])',
                                             file_path).group()
        assert new_file['tokenized'] == 'No'
        assert new_file['tagged'] == 'No'

        print(f'done! (In {round(time.time() - time_start, 2)} seconds)')

    # Save Settings
    with open('wl_tests/wl_settings.pickle', 'wb') as f:
        pickle.dump(main.settings_custom, f)
Example #2
def check_files_unsupported(main, files):
    files_unsupported = []
    files_ok = []

    file_exts = [
        ext for file_type in main.settings_global['file_types']['files']
        for ext in re.findall(r'(?<=\*)\.[a-z]+', file_type)
    ]

    if files:
        # Wordless files
        if type(files[0]) == dict:
            for file in files:
                if os.path.splitext(file['path'])[1].lower() not in file_exts:
                    files_unsupported.append(file)
                else:
                    files_ok.append(file)
        # File paths
        elif type(files[0]) == str:
            for file_path in files:
                file_path = wl_misc.get_normalized_path(file_path)

                if os.path.splitext(file_path)[1].lower() not in file_exts:
                    files_unsupported.append(file_path)
                else:
                    files_ok.append(file_path)

    return files_ok, files_unsupported
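
The extension whitelist is built by pulling the literal extensions out of Qt-style file type filters with a look-behind regex. A self-contained sketch of that step, using made-up filter strings rather than Wordless's actual settings:

import re

# Hypothetical filter strings in the same 'Name (*.ext)' form used by Qt file dialogs
file_types = [
    'Text Files (*.txt)',
    'Word Documents (*.docx)',
    'Excel Workbooks (*.xlsx;*.xls)',
]

# The look-behind keeps only the '.ext' part that follows each '*'
file_exts = [
    ext
    for file_type in file_types
    for ext in re.findall(r'(?<=\*)\.[a-z]+', file_type)
]

print(file_exts)  # ['.txt', '.docx', '.xlsx', '.xls']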
Example #3
def check_file_paths_parsing_error(main, file_paths):
    file_paths_parsing_error = []
    file_paths_pass = []

    if file_paths:
        for file_path in file_paths:
            file_path = wl_misc.get_normalized_path(file_path)

            if os.path.splitext(file_path)[1] in [
                    '.txt', '.csv', '.htm', '.html', '.xml', '.tmx'
            ]:

                if main.settings_custom['files']['auto_detection_settings'][
                        'detect_encodings']:
                    encoding = wl_detection.detect_encoding(main, file_path)
                else:
                    encoding = main.settings_custom['auto_detection'][
                        'default_settings']['default_encoding']

                try:
                    text = ''

                    with open(file_path, 'r', encoding=encoding) as f:
                        for line in f:
                            text += line
                except:
                    file_paths_parsing_error.append(file_path)
                else:
                    file_paths_pass.append(file_path)
            else:
                file_paths_pass.append(file_path)

    return file_paths_pass, file_paths_parsing_error
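
The decisive check above is the try/except around the line-by-line read: if the chosen encoding cannot decode the file, the path goes into the error list. A stripped-down sketch of the same idea, assuming a fixed encoding instead of Wordless's detection settings:

def check_decodable(file_path, encoding='utf_8'):
    # True if the whole file can be read line by line with the given encoding
    try:
        with open(file_path, 'r', encoding=encoding) as f:
            for _ in f:
                pass
    except (UnicodeDecodeError, LookupError):
        return False

    return True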
Example #4
    def browse_stop_words(self):
        path_file = QFileDialog.getExistingDirectory(
            self,
            self.tr('Select Folder'),
            self.settings_custom['stop_words']['default_path']
        )

        if path_file:
            self.line_edit_exp_stop_words_default_path.setText(wl_misc.get_normalized_path(path_file))
Example #5
    def browse_temp_files(self):
        path_file = QFileDialog.getExistingDirectory(
            self.main,
            self.tr('Select Folder'),
            self.settings_custom['temp_files']['default_path']
        )

        if path_file:
            self.line_edit_imp_temp_files_default_path.setText(wl_misc.get_normalized_path(path_file))
Example #6
    def __init__(self):
        super().__init__(
            QPixmap(wl_misc.get_normalized_path('imgs/wl_loading.png')))

        msg_font = QFont('Times New Roman')
        msg_font.setPixelSize(14)

        self.setFont(msg_font)
        self.show_message(self.tr('Initializing Wordless ...'))
Example #7
def check_files_parsing_error(main, files):
    files_parsing_error = []
    files_ok = []

    if files:
        # Wordless files
        if type(files[0]) == dict:
            for file in files:
                file_path = file['path']

                if os.path.splitext(file_path)[1] in [
                        '.csv', '.htm', '.html', '.xml', '.tmx', '.lrc'
                ]:
                    try:
                        with open(file_path, 'r',
                                  encoding=file['encoding']) as f:
                            for line in f:
                                pass
                    except:
                        files_parsing_error.append(file)
                    else:
                        files_ok.append(file)
                else:
                    files_ok.append(file)
        # File paths
        elif type(files[0]) == str:
            for file_path in files:
                file_path = wl_misc.get_normalized_path(file_path)

                if os.path.splitext(file_path)[1] in [
                        '.csv', '.htm', '.html', '.xml', '.tmx', '.lrc'
                ]:
                    if main.settings_custom['files'][
                            'auto_detection_settings']['detect_encodings']:
                        encoding, _ = wl_detection.detect_encoding(
                            main, file_path)
                    else:
                        encoding = main.settings_custom['auto_detection'][
                            'default_settings']['default_encoding']

                    try:
                        with open(file_path, 'r', encoding=encoding) as f:
                            for line in f:
                                pass
                    except:
                        files_parsing_error.append(file_path)
                    else:
                        files_ok.append(file_path)
                else:
                    files_ok.append(file_path)

    return files_ok, files_parsing_error
Example #8
    def __init__(self, main, title, width = 0, height = 0):
        super().__init__(main)

        self.main = main

        if width:
            self.setFixedWidth(width)
        if height:
            self.setFixedHeight(height)

        self.setWindowTitle(title)
        self.setWindowIcon(QIcon(wl_misc.get_normalized_path('imgs/wl_icon.ico')))
        self.setWindowFlag(Qt.MSWindowsFixedSizeDialogHint, True)
        self.setWindowFlag(Qt.WindowContextHelpButtonHint, False)
Example #9
def check_file_paths_missing(main, file_paths):
    file_paths_missing = []
    file_paths_pass = []

    if file_paths:
        for file_path in file_paths:
            file_path = wl_misc.get_normalized_path(file_path)

            if os.path.exists(file_path):
                file_paths_pass.append(file_path)
            else:
                file_paths_missing.append(file_path)

    return file_paths_pass, file_paths_missing
Example #10
def check_file_paths_empty(main, file_paths):
    file_paths_ok = []
    file_paths_empty = []

    if file_paths:
        for file_path in file_paths:
            file_path = wl_misc.get_normalized_path(file_path)

            if os.stat(file_path).st_size:
                file_paths_ok.append(file_path)
            else:
                file_paths_empty.append(file_path)

    return file_paths_ok, file_paths_empty
Example #11
    def __init__(self, main, title, width=0, height=0):
        super().__init__(main)

        self.main = main

        if width:
            self.setFixedWidth(width)
        if height:
            self.setFixedHeight(height)

        self.setWindowTitle(title)
        self.setWindowIcon(
            QIcon(wl_misc.get_normalized_path('imgs/wl_icon.ico')))
        # Do not use setWindowFlag, which was added in Qt 5.9 (PyQt 5.8 is used on macOS for compatibility with old macOSes)
        self.setWindowFlags(self.windowFlags()
                            | Qt.MSWindowsFixedSizeDialogHint)
        self.setWindowFlags(self.windowFlags()
                            & ~Qt.WindowContextHelpButtonHint)
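
Since setWindowFlag() only exists from Qt 5.9 onwards, the flags are adjusted by reading back the full flag set and applying bitwise operators. The flag arithmetic in isolation, assuming PyQt5 is available:

from PyQt5.QtCore import Qt

flags = Qt.Window

# OR adds a flag, AND with the inverted mask removes one
flags |= Qt.MSWindowsFixedSizeDialogHint
flags &= ~Qt.WindowContextHelpButtonHint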
Example #12
def check_file_paths_unsupported(main, file_paths):
    file_paths_unsupported = []
    file_paths_pass = []

    file_exts = [
        ext for file_type in main.settings_global['file_types']['files']
        for ext in re.findall(r'(?<=\*)\.[a-z]+', file_type)
    ]

    if file_paths:
        for file_path in file_paths:
            file_path = wl_misc.get_normalized_path(file_path)

            if os.path.splitext(file_path)[1].lower() not in file_exts:
                file_paths_unsupported.append(file_path)
            else:
                file_paths_pass.append(file_path)

    return file_paths_pass, file_paths_unsupported
Example #13
def check_file_paths_empty(main, file_paths):
    file_paths_empty = []
    file_paths_pass = []

    if file_paths:
        for file_path in file_paths:
            file_path = wl_misc.get_normalized_path(file_path)

            # Text files
            if os.path.splitext(file_path)[1] in [
                    '.txt', '.csv', '.htm', '.html', '.xml', '.tmx'
            ]:
                if main.settings_custom['files']['auto_detection_settings'][
                        'detect_encodings']:
                    encoding = wl_detection.detect_encoding(main, file_path)
                else:
                    encoding = main.settings_custom['auto_detection'][
                        'default_settings']['default_encoding']

                try:
                    with open(file_path, 'r', encoding=encoding) as f:
                        empty_file = True

                        for line in f:
                            if line.strip():
                                empty_file = False

                                break

                        if empty_file:
                            file_paths_empty.append(file_path)
                        else:
                            file_paths_pass.append(file_path)
                except:
                    file_paths_pass.append(file_path)
            # Other file types
            else:
                if os.stat(file_path).st_size:
                    file_paths_pass.append(file_path)
                else:
                    file_paths_empty.append(file_path)

    return file_paths_pass, file_paths_empty
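
For text-based formats, emptiness is judged line by line rather than by file size, so a file containing only whitespace still counts as empty. The core of that check, as a standalone sketch:

def is_effectively_empty(file_path, encoding='utf_8'):
    # A text file counts as empty if no line contains non-whitespace characters
    with open(file_path, 'r', encoding=encoding) as f:
        return not any(line.strip() for line in f)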
Example #14
def check_files_missing(main, files):
    files_missing = []
    files_ok = []

    if files:
        # Wordless files
        if type(files[0]) == dict:
            for file in files:
                if os.path.exists(file['path']):
                    files_ok.append(file)
                else:
                    files_missing.append(file)
        # File paths
        elif type(files[0]) == str:
            for file_path in files:
                file_path = wl_misc.get_normalized_path(file_path)

                if os.path.exists(file_path):
                    files_ok.append(file_path)
                else:
                    files_missing.append(file_path)

    return files_ok, files_missing
Example #15
def check_files_duplicate(main, files):
    files_duplicate = []
    files_ok = []

    if files:
        # Wordless files
        if type(files[0]) == dict:
            for file in files:
                if main.wl_files.find_file_by_path(file['path']):
                    files_duplicate.append(file)
                else:
                    files_ok.append(file)
        # File paths
        elif type(files[0]) == str:
            for file_path in files:
                file_path = wl_misc.get_normalized_path(file_path)

                if main.wl_files.find_file_by_path(file_path):
                    files_duplicate.append(file_path)
                else:
                    files_ok.append(file_path)

    return files_ok, files_duplicate
Example #16
    def restart(self):
        if getattr(sys, '_MEIPASS', False):
            if platform.system() == 'Windows':
                subprocess.Popen([wl_misc.get_normalized_path('Wordless.exe')])
            elif platform.system() == 'Darwin':
                subprocess.Popen([wl_misc.get_normalized_path('Wordless')])
            elif platform.system() == 'Linux':
                subprocess.Popen([wl_misc.get_normalized_path('Wordless')])
        else:
            if platform.system() == 'Windows':
                subprocess.Popen(
                    ['python', wl_misc.get_normalized_path(__file__)])
            elif platform.system() == 'Darwin':
                subprocess.Popen(
                    ['python3',
                     wl_misc.get_normalized_path(__file__)])
            elif platform.system() == 'Linux':
                subprocess.Popen(
                    ['python3.8',
                     wl_misc.get_normalized_path(__file__)])

        self.save_settings()
        sys.exit(0)
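
The restart logic keys on sys._MEIPASS, which PyInstaller sets inside a frozen bundle, to decide whether to relaunch the executable or the script. A rough, hypothetical sketch of the same branching (the executable name is illustrative):

import platform
import subprocess
import sys

def relaunch(exe_name='Wordless'):
    # sys._MEIPASS is only set when running from a PyInstaller bundle
    if getattr(sys, '_MEIPASS', False):
        if platform.system() == 'Windows':
            args = [f'{exe_name}.exe']
        else:
            args = [exe_name]
    else:
        python = 'python' if platform.system() == 'Windows' else 'python3'
        args = [python, __file__]

    subprocess.Popen(args)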
Example #17
    def run(self):
        new_files = []

        if self.file_paths:
            len_file_paths = len(self.file_paths)

            for i, file_path in enumerate(self.file_paths):
                self.progress_updated.emit(
                    self.tr(f'Opening files ... ({i + 1}/{len_file_paths})'))

                default_dir = wl_checking_misc.check_dir(
                    self.main.settings_custom['import']['temp_files']
                    ['default_path'])
                default_encoding = self.main.settings_custom['import'][
                    'temp_files']['default_encoding']

                file_path = wl_misc.get_normalized_path(file_path)
                file_name, file_ext = os.path.splitext(
                    os.path.basename(file_path))
                file_ext = file_ext.lower()

                # Text files
                if file_ext == '.txt':
                    new_files.append(self.main.wl_files._new_file(file_path))
                else:
                    if file_ext in ['.docx', '.xlsx', '.xls']:
                        new_path = wl_checking_misc.check_new_path(
                            os.path.join(default_dir, f'{file_name}.txt'))

                        # Word documents
                        if file_ext == '.docx':
                            lines = []

                            with open(new_path, 'w',
                                      encoding=default_encoding) as f:
                                doc = docx.Document(file_path)

                                for block in self.iter_block_items(doc):
                                    if type(block
                                            ) == docx.text.paragraph.Paragraph:
                                        f.write(f'{block.text}\n')
                                    elif type(block) == docx.table.Table:
                                        for row in self.iter_visual_cells(
                                                block):
                                            cells = []

                                            for cell in row:
                                                cells.append(' '.join([
                                                    item.text for item in
                                                    self.iter_cell_items(cell)
                                                ]))

                                            f.write('\t'.join(cells) + '\n')

                        # Excel workbooks
                        elif file_ext == '.xlsx':
                            with open(new_path, 'w',
                                      encoding=default_encoding) as f:
                                workbook = openpyxl.load_workbook(
                                    file_path, data_only=True)

                                for worksheet_name in workbook.sheetnames:
                                    worksheet = workbook[worksheet_name]

                                    for row in worksheet.rows:
                                        f.write('\t'.join([(
                                            str(cell.value) if cell.value is not None
                                            else '') for cell in row]) + '\n')

                        new_paths = [new_path]
                    else:
                        # Detect encoding
                        if self.main.settings_custom['files'][
                                'auto_detection_settings']['detect_encodings']:
                            encoding_code, _ = wl_detection.detect_encoding(
                                self.main, file_path)
                        else:
                            encoding_code = self.main.settings_custom[
                                'auto_detection']['default_settings'][
                                    'default_encoding']

                        # CSV files
                        if file_ext == '.csv':
                            new_path = wl_checking_misc.check_new_path(
                                os.path.join(default_dir, f'{file_name}.txt'))

                            with open(new_path, 'w',
                                      encoding=default_encoding) as f:
                                with open(file_path,
                                          'r',
                                          newline='',
                                          encoding=encoding_code) as f_csv:
                                    csv_reader = csv.reader(f_csv)

                                    for row in csv_reader:
                                        f.write('\t'.join(row) + '\n')

                            new_paths = [new_path]

                        # HTML files
                        elif file_ext in ['.htm', '.html']:
                            with open(file_path, 'r',
                                      encoding=encoding_code) as f:
                                soup = bs4.BeautifulSoup(f.read(), 'lxml')

                            new_path = wl_checking_misc.check_new_path(
                                os.path.join(default_dir, f'{file_name}.txt'))

                            with open(new_path, 'w',
                                      encoding=default_encoding) as f:
                                f.write(soup.get_text())

                            new_paths = [new_path]

                        # XML files
                        elif file_ext == '.xml':
                            with open(file_path, 'r',
                                      encoding=encoding_code) as f:
                                xml_text = f.read()

                            new_path = wl_checking_misc.check_new_path(
                                os.path.join(default_dir, f'{file_name}.xml'))

                            with open(new_path, 'w',
                                      encoding=default_encoding) as f:
                                f.write(xml_text)

                            new_paths = [new_path]

                        # Translation memory files
                        elif file_ext == '.tmx':
                            lines_src = []
                            lines_target = []

                            with open(file_path, 'r',
                                      encoding=encoding_code) as f:
                                soup = bs4.BeautifulSoup(f.read(), 'lxml-xml')

                                for tu in soup.find_all('tu'):
                                    seg_src, seg_target = tu.find_all('seg')

                                    lines_src.append(seg_src.get_text())
                                    lines_target.append(seg_target.get_text())

                            path_src = wl_checking_misc.check_new_path(
                                os.path.join(default_dir,
                                             f'{file_name}_source.txt'))
                            path_target = wl_checking_misc.check_new_path(
                                os.path.join(default_dir,
                                             f'{file_name}_target.txt'))

                            with open(path_src, 'w',
                                      encoding=default_encoding) as f:
                                f.write('\n'.join(lines_src))
                                f.write('\n')

                            with open(path_target,
                                      'w',
                                      encoding=default_encoding) as f:
                                f.write('\n'.join(lines_target))
                                f.write('\n')

                            new_paths = [path_src, path_target]

                    for new_path in new_paths:
                        new_files.append(
                            self.main.wl_files._new_file(new_path, txt=False))

            self.main.settings_custom['import']['files'][
                'default_path'] = wl_misc.get_normalized_dir(
                    self.file_paths[0])

        self.progress_updated.emit(self.tr('Updating table ...'))

        time.sleep(0.1)

        self.worker_done.emit(new_files)
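
Every non-.txt format above is converted into a temporary text file before being opened. As one isolated instance, a minimal sketch of the .xlsx branch, with placeholder paths and cell values coerced to strings:

import openpyxl

def xlsx_to_tsv(xlsx_path, txt_path, encoding='utf_8'):
    # Dump every worksheet as tab-separated lines, one spreadsheet row per line
    workbook = openpyxl.load_workbook(xlsx_path, data_only=True)

    with open(txt_path, 'w', encoding=encoding) as f:
        for worksheet_name in workbook.sheetnames:
            worksheet = workbook[worksheet_name]

            for row in worksheet.rows:
                f.write('\t'.join(
                    str(cell.value) if cell.value is not None else ''
                    for cell in row
                ) + '\n')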
Example #18
def check_files_empty(main, files):
    files_empty = []
    files_ok = []

    if files:
        # Wordless files
        if type(files[0]) == dict:
            for file in files:
                file_path = file['path']

                # Text files
                if os.path.splitext(file_path)[1] in [
                        '.txt', '.csv', '.htm', '.html', '.xml', '.tmx', '.lrc'
                ]:
                    try:
                        with open(file_path, 'r',
                                  encoding=file['encoding']) as f:
                            empty_file = True

                            for line in f:
                                if line.strip():
                                    empty_file = False

                                    break

                            if empty_file:
                                files_empty.append(file)
                            else:
                                files_ok.append(file)
                    except:
                        files_ok.append(file)
                # Other file types
                else:
                    if os.stat(file_path).st_size:
                        files_ok.append(file)
                    else:
                        files_empty.append(file)
        # File paths
        elif type(files[0]) == str:
            for file_path in files:
                file_path = wl_misc.get_normalized_path(file_path)

                # Text files
                if os.path.splitext(file_path)[1] in [
                        '.txt', '.csv', '.htm', '.html', '.xml', '.tmx', '.lrc'
                ]:
                    if main.settings_custom['files'][
                            'auto_detection_settings']['detect_encodings']:
                        encoding, _ = wl_detection.detect_encoding(
                            main, file_path)
                    else:
                        encoding = main.settings_custom['auto_detection'][
                            'default_settings']['default_encoding']

                    try:
                        with open(file_path, 'r', encoding=encoding) as f:
                            empty_file = True

                            for line in f:
                                if line.strip():
                                    empty_file = False

                                    break

                            if empty_file:
                                files_empty.append(file_path)
                            else:
                                files_ok.append(file_path)
                    except:
                        files_ok.append(file_path)
                # Other file types
                else:
                    if os.stat(file_path).st_size:
                        files_ok.append(file_path)
                    else:
                        files_empty.append(file_path)

    return files_ok, files_empty
Example #19
    def __init__(self, loading_window):
        super().__init__()

        self.loading_window = loading_window
        self.threads_check_updates = []

        # Version numbers
        self.ver = wl_misc.get_wl_ver()
        self.ver_major, self.ver_minor, self.ver_patch = wl_misc.split_wl_ver(
            self.ver)

        # Title
        self.setWindowTitle(self.tr('Wordless'))

        # Icon
        self.setWindowIcon(
            QIcon(wl_misc.get_normalized_path('imgs/wl_icon.ico')))

        self.loading_window.show_message(self.tr('Loading settings ...'))

        # Default settings
        wl_settings_default.init_settings_default(self)

        # Custom settings
        path_settings = wl_misc.get_normalized_path('wl_settings.pickle')

        if os.path.exists(path_settings):
            with open(path_settings, 'rb') as f:
                settings_custom = pickle.load(f)

            if wl_checking_misc.check_custom_settings(settings_custom,
                                                      self.settings_default):
                self.settings_custom = settings_custom
            else:
                self.settings_custom = copy.deepcopy(self.settings_default)
        else:
            self.settings_custom = copy.deepcopy(self.settings_default)

        # Global settings
        wl_settings_global.init_settings_global(self)

        # Settings
        self.wl_settings = wl_settings.Wl_Settings(self)

        self.loading_window.show_message(
            self.tr('Initializing main window ...'))

        # Menu
        self.init_menu()

        # Work Area & File Area
        self.init_central_widget()

        # Status Bar
        self.statusBar().showMessage(self.tr('Ready!'))

        self.statusBar().setFixedHeight(22)
        self.statusBar().setStyleSheet('''
            QStatusBar {
                background-color: #D0D0D0;
            }
        ''')

        # Check for updates on startup
        self.loading_window.show_message(self.tr('Check for updates ...'))

        if self.settings_custom['general']['update_settings'][
                'check_updates_on_startup']:
            self.dialog_check_updates = self.help_check_updates(
                on_startup=True)

        self.loading_window.show_message(self.tr('Starting Wordless ...'))

        self.load_settings()

        # Fix layout on macOS
        if platform.system() == 'Darwin':
            self.fix_macos_layout(self)
Example #20
    def import_list(self, settings):
        files = []

        if os.path.exists(
                self.main.settings_custom['import'][settings]['default_path']):
            default_dir = self.main.settings_custom['import'][settings][
                'default_path']
        else:
            default_dir = self.main.settings_default['import'][settings][
                'default_path']

        file_paths = QFileDialog.getOpenFileNames(
            self.main, self.tr('Import from File(s)'), default_dir,
            self.tr('Text File (*.txt)'))[0]

        if file_paths:
            self.main.settings_custom['import'][settings][
                'default_path'] = os.path.normpath(
                    os.path.dirname(file_paths[0]))

            # Detect encodings
            if self.main.settings_custom['import'][settings][
                    'detect_encodings']:
                for file_path in file_paths:
                    files.append({
                        'path':
                        wl_misc.get_normalized_path(file_path),
                        'encoding':
                        wl_detection.detect_encoding(self.main, file_path)[0]
                    })
            else:
                for file_path in file_paths:
                    files.append({
                        'path':
                        wl_misc.get_normalized_path(file_path),
                        'encoding':
                        self.main.settings_custom['auto_detection']
                        ['default_settings']['default_encoding']
                    })

            files_ok, files_empty = wl_checking_file.check_files_empty(
                self.main, files)
            files_ok, files_decoding_error = wl_checking_file.check_files_decoding_error(
                self.main, files_ok)

            # Extract file paths
            files_empty = [file['path'] for file in files_empty]
            files_decoding_error = [
                file['path'] for file in files_decoding_error
            ]

            if files_empty or files_decoding_error:
                wl_dialog_error.wl_dialog_error_import(
                    self.main,
                    files_empty=files_empty,
                    files_decoding_error=files_decoding_error)

                wl_msg.wl_msg_import_list_error(self.main)
            else:
                # Check duplicate items
                items_to_import = []
                items_cur = self.get_items()

                num_prev = len(items_cur)

                for file in files_ok:
                    with open(file['path'], 'r',
                              encoding=file['encoding']) as f:
                        for line in f:
                            line = line.strip()

                            if line not in items_cur:
                                items_to_import.append(line)

                self.load_items(
                    collections.OrderedDict.fromkeys(items_to_import))
                self.itemChanged.emit(self.item(0))

                wl_msg.wl_msg_import_list_success(self.main, num_prev,
                                                  len(self.get_items()))
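
Imported items are deduplicated while keeping their original order by using them as keys of an OrderedDict. The same trick on its own:

import collections

lines = ['the', 'a', 'the', 'an', 'a']

# Dict keys are unique and keep insertion order, so this deduplicates without sorting
unique_lines = list(collections.OrderedDict.fromkeys(lines))

print(unique_lines)  # ['the', 'a', 'an']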
Example #21
def wl_get_stop_word_list(main, lang, stop_word_list = 'default'):
    if stop_word_list == 'default':
        stop_word_list = main.settings_custom['stop_word_lists']['stop_word_lists'][lang]

    if stop_word_list == main.tr('Custom List'):
        stop_word_list = main.settings_custom['stop_word_lists']['custom_lists'][lang]
    else:
        lang_639_1 = wl_conversion.to_iso_639_1(main, lang)

        # Chinese (Simplified)
        if lang_639_1 == 'zh_cn':
            lang_639_1 = 'zh'

        # Chinese (Traditional)
        if lang_639_1 == 'zh_tw':
            cc = opencc.OpenCC('s2tw')

            stop_word_list_zho_cn = wl_get_stop_word_list(
                main,
                lang = 'zho_cn',
                stop_word_list = stop_word_list.replace('Chinese (Traditional)', 'Chinese (Simplified)'))
            stop_word_list = [cc.convert(stop_word) for stop_word in stop_word_list_zho_cn]
        # extra-stopwords
        elif 'extra-stopwords' in stop_word_list:
            LANG_TEXTS = {
                'sqi': 'albanian',
                'ara': 'arabic',
                'hye': 'armenian',
                'eus': 'basque',
                'bel': 'belarusian',
                'ben': 'bengali',
                'bul': 'bulgarian',
                'cat': 'catalan',
                'zho_cn': 'chinese',
                'hrv': 'croatian',
                'ces': 'czech',
                'dan': 'danish',
                'nld': 'dutch',
                'eng': 'english',
                'est': 'estonian',
                'fin': 'finnish',
                'fra': 'french',
                'glg': 'galician',
                'deu': 'german',
                'ell': 'greek',
                'hau': 'hausa',
                'heb': 'hebrew',
                'hin': 'hindi',
                'hun': 'hungarian',
                'isl': 'icelandic',
                'ind': 'indonesian',
                'gle': 'irish',
                'ita': 'italian',
                'jpn': 'japanese',
                'kor': 'korean',
                'kur': 'kurdish',
                'lav': 'latvian',
                'lit': 'lithuanian',
                'msa': 'malay',
                'mar': 'marathi',
                'mon': 'mongolian',
                'nep': 'nepali',
                # Norwegian Bokmål & Norwegian Nynorsk
                'nob': 'norwegian',
                'nno': 'norwegian',
                'fas': 'persian',
                'pol': 'polish',
                'por': 'portuguese',
                'ron': 'romanian',
                'rus': 'russian',
                'srp_cyrl': 'serbian-cyrillic',
                'srp_latn': 'serbian',
                'slk': 'slovak',
                'slv': 'slovenian',
                'spa': 'spanish',
                'swa': 'swahili',
                'swe': 'swedish',
                'tgl': 'tagalog',
                'tel': 'telugu',
                'tha': 'thai',
                'tur': 'turkish',
                'ukr': 'ukranian',
                'urd': 'urdu',
                'vie': 'vietnamese',
                'yor': 'yoruba'
            }

            with open(wl_misc.get_normalized_path(f'stop_word_lists/extra-stopwords/{LANG_TEXTS[lang]}'), 'r', encoding = 'utf_8') as f:
                stop_word_list = [line.rstrip() for line in f if not line.startswith('#')]
        # NLTK
        elif 'NLTK' in stop_word_list:
            LANG_TEXTS = {
                'ara': 'arabic',
                'aze': 'azerbaijani',
                'dan': 'danish',
                'nld': 'dutch',
                'eng': 'english',
                'fin': 'finnish',
                'fra': 'french',
                'deu': 'german',
                'ell': 'greek',
                'hun': 'hungarian',
                'ind': 'indonesian',
                'ita': 'italian',
                'kaz': 'kazakh',
                'nep': 'nepali',
                # Norwegian Bokmål & Norwegian Nynorsk
                'nob': 'norwegian',
                'nno': 'norwegian',
                'por': 'portuguese',
                'ron': 'romanian',
                'rus': 'russian',
                'slv': 'slovene',
                'spa': 'spanish',
                'swe': 'swedish',
                'tgk': 'tajik',
                'tur': 'turkish'
            }

            stop_word_list = nltk.corpus.stopwords.words(LANG_TEXTS[lang])
        # spaCy
        elif 'spaCy' in stop_word_list:
            # Serbian (Cyrillic) & Serbian (Latin)
            if lang_639_1 == 'sr_cyrl':
                spacy_lang = importlib.import_module('spacy.lang.sr')

                stop_word_list = spacy_lang.STOP_WORDS
            elif lang_639_1 == 'sr_latn':
                spacy_lang = importlib.import_module('spacy.lang.sr')

                stop_word_list = spacy_lang.STOP_WORDS
                stop_word_list = wl_text_utils.to_srp_latn(stop_word_list)
            else:
                spacy_lang = importlib.import_module(f'spacy.lang.{lang_639_1}')

                stop_word_list = spacy_lang.STOP_WORDS
        # Stopwords ISO
        elif 'Stopwords ISO' in stop_word_list:
            # Greek (Ancient)
            if lang_639_1 == 'grc':
                lang_639_1 = 'el'

            # Norwegian Bokmål & Norwegian Nynorsk
            if lang_639_1 in ['nb', 'nn']:
                lang_639_1 = 'no'

            with open(wl_misc.get_normalized_path('stop_word_lists/Stopwords ISO/stopwords_iso.json'), 'r', encoding = 'utf_8') as f:
                stop_word_list = json.load(f)[lang_639_1]
        # Thai
        elif stop_word_list == main.tr('PyThaiNLP - Thai Stop Word List'):
            stop_word_list = pythainlp.corpus.common.thai_stopwords()

    # Remove empty tokens
    stop_word_list = [stop_word for stop_word in stop_word_list if stop_word]

    return sorted(set(stop_word_list))
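
Most branches above simply delegate to an external resource keyed by language code. For the NLTK branch, for example, a standalone equivalent might look like this (assuming the stopwords corpus can be downloaded):

import nltk

# One-time download of the stopwords corpus
nltk.download('stopwords', quiet=True)

stop_word_list = nltk.corpus.stopwords.words('english')

# Mirror the function above: drop empty entries, deduplicate, and sort
stop_word_list = sorted({stop_word for stop_word in stop_word_list if stop_word})

print(len(stop_word_list), stop_word_list[:5])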
Example #22
def wl_lemmatize(main, tokens, lang, text_type = ('untokenized', 'untagged'), lemmatizer = 'default'):
    empty_offsets = []
    mapping_lemmas = {}
    lemmas = []

    tokens = [str(token) for token in tokens]

    re_tags_all = wl_matching.get_re_tags(main, tags = 'all')
    re_tags_pos = wl_matching.get_re_tags(main, tags = 'pos')
    re_tags_non_pos = wl_matching.get_re_tags(main, tags = 'non_pos')

    if text_type[1] == 'tagged_both':
        tags = [''.join(re.findall(re_tags_all, token)) for token in tokens]
        tokens = [re.sub(re_tags_all, '', token) for token in tokens]
    elif text_type[1] == 'tagged_pos':
        tags = [''.join(re.findall(re_tags_pos, token)) for token in tokens]
        tokens = [re.sub(re_tags_pos, '', token) for token in tokens]
    elif text_type[1] == 'tagged_non_pos':
        tags = [''.join(re.findall(re_tags_non_pos, token)) for token in tokens]
        tokens = [re.sub(re_tags_non_pos, '', token) for token in tokens]
    else:
        tags = [''] * len(tokens)

    # Record empty tokens
    for i, token in reversed(list(enumerate(tokens))):
        if not token.strip():
            empty_offsets.append(i)

            del tokens[i]

    wl_text_utils.check_lemmatizers(main, lang)

    if tokens and lang in main.settings_global['lemmatizers']:
        if lemmatizer == 'default':
            lemmatizer = main.settings_custom['lemmatization']['lemmatizers'][lang]

        # Dutch, English, French, German, Greek (Modern), Italian, Portuguese, Spanish
        if 'spaCy' in lemmatizer:
            nlp = main.__dict__[f'spacy_nlp_{lang}']

            doc = spacy.tokens.Doc(nlp.vocab, words = tokens)
            nlp.tagger(doc)

            lemmas = [token.lemma_ for token in doc]
        # English
        elif lemmatizer == main.tr('NLTK - WordNet Lemmatizer'):
            word_net_lemmatizer = nltk.WordNetLemmatizer()

            for token, pos in wl_pos_tagging.wl_pos_tag(
                main, tokens,
                lang = 'eng',
                pos_tagger = 'NLTK - Perceptron POS Tagger',
                tagset = 'universal'
            ):
                if pos == 'ADJ':
                    lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADJ))
                elif pos in ['NOUN', 'PROPN']:
                    lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.NOUN))
                elif pos == 'ADV':
                    lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADV))
                elif pos in ['VERB', 'AUX']:
                    lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.VERB))
                else:
                    lemmas.append(word_net_lemmatizer.lemmatize(token))
        # Greek (Ancient)
        elif lemmatizer == main.tr('lemmalist-greek - Greek (Ancient) Lemma List'):
            with open(wl_misc.get_normalized_path('lemmatization/lemmalist-greek/lemmalist-greek.txt'), 'r', encoding = 'utf_8') as f:
                for line in f.readlines():
                    line = line.rstrip()

                    if line:
                        lemma, *words = line.split()

                        for word in words:
                            mapping_lemmas[word] = lemma
        # Russian & Ukrainian
        elif lemmatizer == main.tr('pymorphy2 - Morphological Analyzer'):
            if lang == 'rus':
                morphological_analyzer = pymorphy2.MorphAnalyzer(lang = 'ru')
            else:
                morphological_analyzer = pymorphy2.MorphAnalyzer(lang = 'uk')

            for token in tokens:
                lemmas.append(morphological_analyzer.parse(token)[0].normal_form)
        # Tibetan
        elif lemmatizer == main.tr('botok - Tibetan Lemmatizer'):
            wl_text_utils.check_word_tokenizers(main, lang = 'bod')
            tokens = main.botok_word_tokenizer.tokenize(' '.join(tokens))

            for token in tokens:
                if token.lemma:
                    lemmas.append(token.lemma)
                else:
                    lemmas.append(token.text)
        # Other Languages
        elif 'Lemmatization Lists' in lemmatizer:
            lang = wl_conversion.to_iso_639_1(main, lang)

            with open(wl_misc.get_normalized_path(f'lemmatization/Lemmatization Lists/lemmatization-{lang}.txt'), 'r', encoding = 'utf_8_sig') as f:
                for line in f:
                    try:
                        lemma, word = line.rstrip().split('\t')

                        mapping_lemmas[word] = lemma
                    except:
                        pass
    else:
        lemmas = tokens

    if mapping_lemmas:
        lemmas = [mapping_lemmas.get(token, token) for token in tokens]

    # Insert empty lemmas
    for empty_offset in sorted(empty_offsets):
        lemmas.insert(empty_offset, '')

    return [lemma + tag for lemma, tag in zip(lemmas, tags)]
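
The NLTK branch maps universal POS tags onto WordNet's four POS categories before lemmatizing. A compact sketch of that mapping, using NLTK's own tagger instead of Wordless's wrapper and a hard-coded token list:

import nltk
from nltk.corpus import wordnet

for resource in ['averaged_perceptron_tagger', 'universal_tagset', 'wordnet']:
    nltk.download(resource, quiet=True)

# WordNet only distinguishes adjectives, nouns, adverbs and verbs;
# everything else falls back to the lemmatizer's default (noun)
POS_MAP = {'ADJ': wordnet.ADJ, 'NOUN': wordnet.NOUN, 'ADV': wordnet.ADV, 'VERB': wordnet.VERB}

lemmatizer = nltk.WordNetLemmatizer()
tokens = ['The', 'cats', 'were', 'running', 'faster']

lemmas = [
    lemmatizer.lemmatize(token, pos=POS_MAP.get(pos, wordnet.NOUN))
    for token, pos in nltk.pos_tag(tokens, tagset='universal')
]

print(lemmas)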
Example #23
def wl_lemmatize_tokens(main, tokens, lang, tokenized, tagged, lemmatizer):
    empty_offsets = []
    lemmas = []

    tokens = [str(token) for token in tokens]

    re_tags = wl_matching.get_re_tags(main, tag_type = 'body')

    if tagged == _tr('wl_lemmatize_tokens', 'Yes'):
        tags = [''.join(re.findall(re_tags, token)) for token in tokens]
        tokens = [re.sub(re_tags, '', token) for token in tokens]
    else:
        tags = [''] * len(tokens)

    # Record empty tokens with their tags
    for i, token in reversed(list(enumerate(tokens))):
        if not token.strip():
            empty_offsets.append(i)

            del tokens[i]
            del tags[i]

    # spaCy
    if 'spacy' in lemmatizer:
        if not lang.startswith('srp_'):
            lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        nlp = main.__dict__[f'spacy_nlp_{lang}']

        if lang != 'jpn':
            doc = spacy.tokens.Doc(nlp.vocab, words = tokens, spaces = [False] * len(tokens))

            for pipe_name in nlp.pipe_names:
                nlp.get_pipe(pipe_name)(doc)
        # The Japanese model does not have a lemmatizer component and Japanese lemmas are taken directly from SudachiPy
        # See: https://github.com/explosion/spaCy/discussions/9983#discussioncomment-1923647
        else:
            doc = nlp(''.join(tokens))

        lemma_tokens = [token.text for token in doc]
        lemmas = [token.lemma_ for token in doc]
    # English
    elif lemmatizer == 'nltk_wordnet':
        word_net_lemmatizer = nltk.WordNetLemmatizer()

        for token, pos in wl_pos_tagging.wl_pos_tag(
            main, tokens,
            lang = 'eng_us',
            pos_tagger = 'nltk_perceptron',
            tagset = 'universal'
        ):
            if pos == 'ADJ':
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADJ))
            elif pos in ['NOUN', 'PROPN']:
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.NOUN))
            elif pos == 'ADV':
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADV))
            elif pos in ['VERB', 'AUX']:
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.VERB))
            else:
                lemmas.append(word_net_lemmatizer.lemmatize(token))

        lemma_tokens = tokens.copy()
    # Japanese
    elif lemmatizer == 'sudachipy_jpn':
        tokens_retokenized = main.sudachipy_word_tokenizer.tokenize(''.join(tokens))

        lemma_tokens = [token.surface() for token in tokens_retokenized]
        lemmas = [token.dictionary_form() for token in tokens_retokenized]
    # Russian & Ukrainian
    elif lemmatizer == 'pymorphy2_morphological_analyzer':
        if lang == 'rus':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_rus
        elif lang == 'ukr':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_ukr

        for token in tokens:
            lemmas.append(morphological_analyzer.parse(token)[0].normal_form)

        lemma_tokens = tokens.copy()
    # Tibetan
    elif lemmatizer == 'botok_bod':
        lemma_tokens = []
        tokens_retokenized = main.botok_word_tokenizer.tokenize(''.join(tokens))

        for token in tokens_retokenized:
            if token.lemma:
                lemmas.append(token.lemma)
            else:
                lemmas.append(token.text)

            lemma_tokens.append(token.text)
    # Lemmatization Lists
    elif 'lemmatization_lists' in lemmatizer:
        mapping_lemmas = {}

        lang = wl_conversion.to_iso_639_1(main, lang)
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        with open(wl_misc.get_normalized_path(f'lemmatization/Lemmatization Lists/lemmatization-{lang}.txt'), 'r', encoding = 'utf_8_sig') as f:
            for line in f:
                try:
                    lemma, word = line.rstrip().split('\t')

                    mapping_lemmas[word] = lemma
                except ValueError:
                    pass

        lemma_tokens = tokens.copy()
        lemmas = [mapping_lemmas.get(token, token) for token in tokens]

    # Remove empty lemmas and strip whitespace in tokens
    for i, lemma in reversed(list(enumerate(lemmas))):
        lemma_tokens[i] = lemma_tokens[i].strip()
        lemmas[i] = lemma.strip()

        if not lemmas[i]:
            del lemmas[i]
            del lemma_tokens[i]

    # Make sure that tokenization is not modified during lemmatization
    i_tokens = 0
    i_lemmas = 0

    len_tokens = len(tokens)
    len_lemmas = len(lemmas)

    if len_tokens != len_lemmas:
        tags_modified = []
        lemmas_modified = []

        while i_tokens < len_tokens and i_lemmas < len_lemmas:
            # Different token
            if len(tokens[i_tokens]) != len(lemma_tokens[i_lemmas]):
                tokens_temp = [tokens[i_tokens]]
                tags_temp = [tags[i_tokens]]
                lemma_tokens_temp = [lemma_tokens[i_lemmas]]
                lemmas_temp = [lemmas[i_lemmas]]

                # Align tokens
                while i_tokens < len_tokens - 1 or i_lemmas < len_lemmas - 1:
                    len_tokens_temp = sum([len(token) for token in tokens_temp])
                    len_lemma_tokens_temp = sum([len(token) for token in lemma_tokens_temp])

                    if len_tokens_temp > len_lemma_tokens_temp:
                        lemma_tokens_temp.append(lemma_tokens[i_lemmas + 1])
                        lemmas_temp.append(lemmas[i_lemmas + 1])

                        i_lemmas += 1
                    elif len_tokens_temp < len_lemma_tokens_temp:
                        tokens_temp.append(tokens[i_tokens + 1])
                        tags_temp.append(tags[i_tokens + 1])

                        i_tokens += 1
                    else:
                        # Use lemmas in one-to-one
                        if len(tokens_temp) == len(lemma_tokens_temp):
                            tags_modified.extend(tags_temp)
                            lemmas_modified.extend(lemmas_temp)
                        # Use original tokens in many-to-one or one-to-many
                        else:
                            tags_modified.extend(tags_temp)
                            lemmas_modified.extend(tokens_temp)

                        tokens_temp = []
                        tags_temp = []
                        lemma_tokens_temp = []
                        lemmas_temp = []

                        break

                if tokens_temp:
                    # Use lemmas in one-to-one
                    if len(tokens_temp) == len(lemma_tokens_temp):
                        tags_modified.extend(tags_temp)
                        lemmas_modified.extend(lemmas_temp)
                    # Use original tokens in many-to-one or one-to-many
                    else:
                        tags_modified.extend(tags_temp)
                        lemmas_modified.extend(tokens_temp)
            else:
                tags_modified.append(tags[i_tokens])
                lemmas_modified.append(lemmas[i_lemmas])

            i_tokens += 1
            i_lemmas += 1

        len_lemmas_modified = len(lemmas_modified)

        if len_tokens < len_lemmas_modified:
            tags = tags_modified[:len_tokens]
            lemmas = lemmas_modified[:len_tokens]
        elif len_tokens > len_lemmas_modified:
            tags = tags_modified + [tags_modified[-1]] * (len_tokens - len_lemmas_modified)
            lemmas = lemmas_modified + [lemmas_modified[-1]] * (len_tokens - len_lemmas_modified)
        else:
            tags = tags_modified.copy()
            lemmas = lemmas_modified.copy()

    # Insert empty lemmas and their tags after alignment of input and output
    for empty_offset in sorted(empty_offsets):
        lemmas.insert(empty_offset, '')
        tags.insert(empty_offset, '')

    return [lemma + tag for lemma, tag in zip(lemmas, tags)]
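
The spaCy branch wraps the already-tokenized input in a Doc and runs each pipeline component over it by hand, so spaCy never re-tokenizes the text. A minimal sketch of that approach, assuming the small English model is installed:

import spacy

# Assumes `python -m spacy download en_core_web_sm` has been run beforehand
nlp = spacy.load('en_core_web_sm')

tokens = ['The', 'cats', 'were', 'running']

# Build the Doc from pre-tokenized words so spaCy's tokenizer is bypassed
doc = spacy.tokens.Doc(nlp.vocab, words=tokens, spaces=[False] * len(tokens))

for pipe_name in nlp.pipe_names:
    doc = nlp.get_pipe(pipe_name)(doc)

print([token.lemma_ for token in doc])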
Example #24
    def run(self):
        new_files = []

        files_detection_error_encoding = []
        files_detection_error_text_type = []
        files_detection_error_lang = []

        if self.file_paths:
            len_file_paths = len(self.file_paths)

            for i, file_path in enumerate(self.file_paths):
                self.progress_updated.emit(
                    self.tr(f'Opening files ... ({i + 1}/{len_file_paths})'))

                default_dir = wl_checking_misc.check_dir(
                    self.main.settings_custom['import']['temp_files']
                    ['default_path'])
                default_encoding = self.main.settings_custom['import'][
                    'temp_files']['default_encoding']

                file_path = wl_misc.get_normalized_path(file_path)
                file_name, file_ext = os.path.splitext(
                    os.path.basename(file_path))
                file_ext = file_ext.lower()

                # Text files
                if file_ext == '.txt':
                    (new_file, detection_success_encoding,
                     detection_success_text_type, detection_success_lang
                     ) = self.main.wl_files._new_file(file_path)

                    new_files.append(new_file)

                    if not detection_success_encoding:
                        files_detection_error_encoding.append(new_file['path'])

                    if not detection_success_text_type:
                        files_detection_error_text_type.append(
                            new_file['path'])

                    if not detection_success_lang:
                        files_detection_error_lang.append(new_file['path'])
                else:
                    if file_ext in ['.docx', '.xlsx', '.xls']:
                        new_path = wl_checking_misc.check_new_path(
                            os.path.join(default_dir, f'{file_name}.txt'))

                        # Word documents
                        if file_ext == '.docx':
                            lines = []

                            with open(new_path, 'w',
                                      encoding=default_encoding) as f:
                                doc = docx.Document(file_path)

                                for block in self.iter_block_items(doc):
                                    if type(block
                                            ) == docx.text.paragraph.Paragraph:
                                        f.write(f'{block.text}\n')
                                    elif type(block) == docx.table.Table:
                                        for row in self.iter_visual_cells(
                                                block):
                                            cells = []

                                            for cell in row:
                                                cells.append(' '.join([
                                                    item.text for item in
                                                    self.iter_cell_items(cell)
                                                ]))

                                            f.write('\t'.join(cells) + '\n')

                        # Excel workbooks
                        elif file_ext == '.xlsx':
                            with open(new_path, 'w',
                                      encoding=default_encoding) as f:
                                workbook = openpyxl.load_workbook(
                                    file_path, data_only=True)

                                for worksheet_name in workbook.sheetnames:
                                    worksheet = workbook[worksheet_name]

                                    for row in worksheet.rows:
                                        f.write('\t'.join([(
                                            str(cell.value) if cell.value is not None
                                            else '') for cell in row]) + '\n')
                        elif file_ext == '.xls':
                            with open(new_path, 'w',
                                      encoding=default_encoding) as f:
                                workbook = xlrd.open_workbook(file_path)

                                for i_sheet in range(workbook.nsheets):
                                    worksheet = workbook.sheet_by_index(
                                        i_sheet)

                                    for row in range(worksheet.nrows):
                                        f.write('\t'.join([
                                            str(worksheet.cell_value(row, col))
                                            for col in range(worksheet.ncols)
                                        ]) + '\n')

                        new_paths = [new_path]
                    else:
                        # Detect encoding
                        if self.main.settings_custom['files'][
                                'auto_detection_settings']['detect_encodings']:
                            encoding_code, _ = wl_detection.detect_encoding(
                                self.main, file_path)
                        else:
                            encoding_code = self.main.settings_custom[
                                'auto_detection']['default_settings'][
                                    'default_encoding']

                        # CSV files
                        if file_ext == '.csv':
                            new_path = wl_checking_misc.check_new_path(
                                os.path.join(default_dir, f'{file_name}.txt'))

                            with open(new_path, 'w',
                                      encoding=default_encoding) as f:
                                with open(file_path,
                                          'r',
                                          newline='',
                                          encoding=encoding_code) as f_csv:
                                    csv_reader = csv.reader(f_csv)

                                    for row in csv_reader:
                                        f.write('\t'.join(row) + '\n')

                            new_paths = [new_path]

                        # HTML files
                        elif file_ext in ['.htm', '.html']:
                            with open(file_path, 'r',
                                      encoding=encoding_code) as f:
                                soup = bs4.BeautifulSoup(f.read(), 'lxml')

                            new_path = wl_checking_misc.check_new_path(
                                os.path.join(default_dir, f'{file_name}.txt'))

                            with open(new_path, 'w',
                                      encoding=default_encoding) as f:
                                f.write(soup.get_text())

                            new_paths = [new_path]

                        # XML files
                        elif file_ext == '.xml':
                            with open(file_path, 'r',
                                      encoding=encoding_code) as f:
                                xml_text = f.read()

                            new_path = wl_checking_misc.check_new_path(
                                os.path.join(default_dir, f'{file_name}.txt'))

                            with open(new_path, 'w',
                                      encoding=default_encoding) as f:
                                f.write(xml_text)

                            new_paths = [new_path]

                        # Translation memory files
                        elif file_ext == '.tmx':
                            lines_src = []
                            lines_target = []

                            with open(file_path, 'r',
                                      encoding=encoding_code) as f:
                                soup = bs4.BeautifulSoup(f.read(), 'lxml-xml')

                                for tu in soup.find_all('tu'):
                                    seg_src, seg_target = tu.find_all('seg')

                                    lines_src.append(seg_src.get_text())
                                    lines_target.append(seg_target.get_text())

                            path_src = wl_checking_misc.check_new_path(
                                os.path.join(default_dir,
                                             f'{file_name}_source.txt'))
                            path_target = wl_checking_misc.check_new_path(
                                os.path.join(default_dir,
                                             f'{file_name}_target.txt'))

                            with open(path_src, 'w',
                                      encoding=default_encoding) as f:
                                f.write('\n'.join(lines_src))
                                f.write('\n')

                            with open(path_target,
                                      'w',
                                      encoding=default_encoding) as f:
                                f.write('\n'.join(lines_target))
                                f.write('\n')

                            new_paths = [path_src, path_target]

                        # Lyrics files
                        elif file_ext == '.lrc':
                            lyrics = {}

                            with open(file_path, 'r',
                                      encoding=encoding_code) as f:
                                for line in f:
                                    time_tags = []

                                    line = line.strip()

                                    # Strip time tags
                                    while re.search(r'^\[[^\]]+?\]', line):
                                        time_tags.append(
                                            re.search(r'^\[[^\]]+?\]',
                                                      line).group())

                                        line = line[len(time_tags[-1]):].strip()

                                    # Strip word time tags
                                    line = re.sub(r'<[^>]+?>', r'', line)
                                    line = re.sub(r'\s{2,}', r' ',
                                                  line).strip()

                                    for time_tag in time_tags:
                                        if re.search(
                                                r'^\[[0-9]{2}:[0-5][0-9]\.[0-9]{2}\]$',
                                                time_tag):
                                            lyrics[time_tag] = line

                            new_path = wl_checking_misc.check_new_path(
                                os.path.join(default_dir, f'{file_name}.txt'))

                            with open(new_path, 'w',
                                      encoding=default_encoding) as f:
                                for _, lyrics_line in sorted(lyrics.items()):
                                    f.write(f'{lyrics_line}\n')

                            new_paths = [new_path]

                    for new_path in new_paths:
                        (new_file, detection_success_encoding,
                         detection_success_text_type, detection_success_lang
                         ) = self.main.wl_files._new_file(new_path)

                        new_files.append(new_file)

                        if not detection_success_encoding:
                            files_detection_error_encoding.append(
                                new_file['path'])

                        if not detection_success_text_type:
                            files_detection_error_text_type.append(
                                new_file['path'])

                        if not detection_success_lang:
                            files_detection_error_lang.append(new_file['path'])

            self.main.settings_custom['import']['files'][
                'default_path'] = wl_misc.get_normalized_dir(
                    self.file_paths[0])

        self.progress_updated.emit(self.tr('Updating table ...'))

        time.sleep(0.1)

        self.worker_done.emit(new_files, files_detection_error_encoding,
                              files_detection_error_text_type,
                              files_detection_error_lang)
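
# --- Illustrative sketch (not part of the original source) ---
# A minimal, self-contained rework of the .lrc handling above, using made-up
# sample lines: leading time tags are peeled off one at a time, word-level
# <...> tags are dropped, and only standard [mm:ss.xx] tags are kept so the
# lyrics can be returned in playback order.
import re


def parse_lrc_lines(lines):
    lyrics = {}

    for line in lines:
        time_tags = []
        line = line.strip()

        # Peel off every leading [...] tag
        while re.search(r'^\[[^\]]+?\]', line):
            time_tags.append(re.search(r'^\[[^\]]+?\]', line).group())
            line = line[len(time_tags[-1]):].strip()

        # Drop word-level time tags and collapse runs of whitespace
        line = re.sub(r'<[^>]+?>', r'', line)
        line = re.sub(r'\s{2,}', r' ', line).strip()

        # Keep only standard [mm:ss.xx] time tags
        for time_tag in time_tags:
            if re.search(r'^\[[0-9]{2}:[0-5][0-9]\.[0-9]{2}\]$', time_tag):
                lyrics[time_tag] = line

    # Zero-padded tags sort chronologically as plain strings
    return [text for _, text in sorted(lyrics.items())]


# Hypothetical usage with made-up lines:
# parse_lrc_lines(['[ar: Artist]', '[00:12.00][00:45.50]Hello <00:12.20>world'])
# -> ['Hello world', 'Hello world']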
Example #25
def wl_lemmatize_text(main, text, lang, tokenized, tagged, lemmatizer):
    lemmas = []

    # spaCy
    if lemmatizer.startswith('spacy_'):
        if not lang.startswith('srp_'):
            lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)

        lemmas = [token.lemma_ for token in doc]
    # English
    elif lemmatizer == 'nltk_wordnet':
        word_net_lemmatizer = nltk.WordNetLemmatizer()

        for token, pos in wl_pos_tagging.wl_pos_tag(
            main, text,
            lang = 'eng_us',
            pos_tagger = 'nltk_perceptron',
            tagset = 'universal'
        ):
            if pos == 'ADJ':
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADJ))
            elif pos in ['NOUN', 'PROPN']:
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.NOUN))
            elif pos == 'ADV':
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADV))
            elif pos in ['VERB', 'AUX']:
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.VERB))
            else:
                lemmas.append(word_net_lemmatizer.lemmatize(token))
    # Japanese
    elif lemmatizer == 'sudachipy_jpn':
        lemmas = [
            token.dictionary_form()
            for token in main.sudachipy_word_tokenizer.tokenize(text)
        ]
    # Russian & Ukrainian
    elif lemmatizer == 'pymorphy2_morphological_analyzer':
        if lang == 'rus':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_rus
        elif lang == 'ukr':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_ukr

        tokens = wl_word_tokenization.wl_word_tokenize_flat(main, text, lang = lang)

        for token in tokens:
            lemmas.append(morphological_analyzer.parse(token)[0].normal_form)
    # Tibetan
    elif lemmatizer == 'botok_bod':
        tokens = main.botok_word_tokenizer.tokenize(text)

        for token in tokens:
            if token.lemma:
                lemmas.append(token.lemma)
            else:
                lemmas.append(token.text)
    # Lemmatization Lists
    elif lemmatizer.startswith('lemmatization_lists_'):
        mapping_lemmas = {}

        lang = wl_conversion.to_iso_639_1(main, lang)
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        with open(wl_misc.get_normalized_path(f'lemmatization/Lemmatization Lists/lemmatization-{lang}.txt'), 'r', encoding = 'utf_8_sig') as f:
            for line in f:
                try:
                    lemma, word = line.rstrip().split('\t')

                    mapping_lemmas[word] = lemma
                except ValueError:
                    pass

        tokens = wl_word_tokenization.wl_word_tokenize_flat(main, text, lang = lang)
        lemmas = [mapping_lemmas.get(token, token) for token in tokens]

    # Remove empty lemmas and strip surrounding whitespace
    lemmas = [
        str(lemma).strip()
        for lemma in lemmas
        if str(lemma).strip()
    ]

    return lemmas
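
# --- Illustrative sketch (not part of the original source) ---
# A minimal, standalone version of the "Lemmatization Lists" strategy above:
# each line of a list file maps "lemma<TAB>word form", so lemmatization is a
# plain dictionary lookup with unknown tokens falling back to themselves.
# The file path and tokens in the usage note are assumptions for illustration.
def load_lemmatization_list(file_path):
    mapping_lemmas = {}

    with open(file_path, 'r', encoding='utf_8_sig') as f:
        for line in f:
            try:
                lemma, word = line.rstrip().split('\t')

                mapping_lemmas[word] = lemma
            except ValueError:
                # Skip malformed lines that do not contain exactly one tab
                pass

    return mapping_lemmas


def lemmatize_with_list(tokens, mapping_lemmas):
    return [mapping_lemmas.get(token, token) for token in tokens]


# Hypothetical usage:
# mapping = load_lemmatization_list('lemmatization/Lemmatization Lists/lemmatization-en.txt')
# lemmatize_with_list(['cats', 'ran'], mapping)  # e.g. ['cat', 'run'] if listed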
Example #26
def get_path(file_name):
    return wl_misc.get_normalized_path(
        f'wl_tests/files/wl_checking/wl_checking_file/{file_name}')
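
# Hypothetical usage (the file name is an assumption for illustration):
# get_path('example.txt') returns the normalized path of that file under
# wl_tests/files/wl_checking/wl_checking_file/.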