示例#1
0
    def __init__(
        self,
        language_code: str,
        threshold: float = 0.9,
        model_path: str = '',
        model_url: str = '',
        delimiter: Optional[str] = None,
        delimited_position: int = -1,
    ):
        """Store the settings and load the language-identification model.

        When ``model_path`` does not exist on disk, the model is fetched
        from ``model_url`` into that path before it is loaded.

        Args:
            language_code: Stored on the instance; not used in this method.
            threshold: Stored on the instance; not used in this method.
            model_path: Location of the model file. An empty value is
                replaced by the path returned from ``get_temp_file_path()``.
            model_url: Download source for the model. An empty value is
                replaced by the fastText ``lid.176.bin`` URL.
            delimiter: Stored on the instance; not used in this method.
            delimited_position: Stored on the instance; not used in this
                method.
        """
        # Plain settings, consumed elsewhere in the class.
        self.language_code = language_code
        self.threshold = threshold
        self.delimiter = delimiter
        self.delimited_position = delimited_position

        # Fall back to the published fastText language-identification model:
        # https://fasttext.cc/docs/en/language-identification.html
        self.model_url = model_url or 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin'

        # TODO: log model_path
        self.model_path = model_path or get_temp_file_path()

        # Download only when nothing is present at the target path yet.
        model_is_present = os.path.exists(self.model_path)
        if not model_is_present:
            download_file(self.model_url, self.model_path)

        # NOTE(review): load_model is presumably fastText's loader - confirm.
        self.ft = load_model(self.model_path)
    def file_processing(self, stage: str) -> None:
        """Run every file-level processor configured for ``stage``.

        Each processor named in ``self.config[stage]`` is instantiated from
        ``processors_dict`` and asked to transform ``self.input_file`` into a
        fresh temporary file. After a successful step the new file becomes
        ``self.input_file`` for the next processor.

        Args:
            stage: Key into ``self.config`` whose value is an iterable of
                processor names.

        Raises:
            TDCOSError: If a processor raises ``OSError``; the original
                exception is attached as the cause.
            TDCRuntimeError: If a processor returns a falsy result.
        """
        # TODO: tqdm + logger
        for processor_name in self.config[stage]:
            processor = processors_dict[processor_name]()

            temp_file_path = get_temp_file_path(self.config)
            try:
                result = processor.process_file(self.input_file, temp_file_path)
            except OSError as exc:
                # TODO: logging
                # The processor may have failed before creating its output
                # file; a missing file must not mask the original error with
                # an unrelated FileNotFoundError.
                try:
                    os.remove(temp_file_path)
                except FileNotFoundError:
                    pass
                raise TDCOSError(exc) from exc

            if not result:
                raise TDCRuntimeError(f'After "{stage}" stage by "{processor_name}" processor result file is empty')

            # TODO: log lines count

            # The output of this step is the input of the next one.
            self._remove_previous_temp(temp_file_path)
            self.input_file = temp_file_path
    def line_processing(self) -> None:
        """Pipe every line of ``self.input_file`` through the line processors.

        Lines are read as UTF-8 and passed through each processor returned by
        ``self._get_line_processors()`` in order. A line is dropped as soon as
        a processor returns a falsy value (empty string or ``None``); lines
        that survive every processor are written, followed by ``'\\n'``, to a
        new temporary file, which then becomes ``self.input_file``.
        """
        output_path = get_temp_file_path(self.config)
        line_processors = self._get_line_processors()

        # TODO: codecs?
        with open(self.input_file, encoding='utf-8') as source, \
                open(output_path, 'w', encoding='utf-8') as target:
            # TODO: tqdm + logger.debug
            # NOTE(review): file iteration keeps each line's trailing newline
            # and another one is appended on write, so processors are
            # presumably expected to strip it - confirm.
            for current in source:
                if not current:
                    continue

                for processor in line_processors:
                    current = processor.process_line(current)  # type: ignore
                    # TODO: log processed line
                    if not current:  # empty or is None
                        break
                else:
                    # No processor rejected the line: save the final value.
                    target.write(f'{current}\n')

        # TODO: check need remove old input_file
        self.input_file = output_path
示例#4
0
    def __init__(self, language_code: str, mode: str, replace_with: str = ' '):
        """Validate the options and compile the stop-word regex for a language.

        The stop-word list is downloaded as JSON from the
        ``6/stopwords-json`` GitHub repository and compiled into a single
        case-insensitive alternation anchored on word boundaries
        (``self.stop_words_re``).

        Args:
            language_code: Code of the stop-word list to download; must be
                one of the codes in ``allowed_language`` below.
            mode: Either ``'remove_line'`` or ``'replace'``; stored on the
                instance.
            replace_with: Stored on the instance; presumably the replacement
                text for ``'replace'`` mode (it is not used in this method).

        Raises:
            TDCValueError: If ``language_code`` or ``mode`` is not allowed.
        """
        allowed_language = [
            # https://github.com/6/stopwords-json/tree/master/dist
            # Run in Dev Browser Console:
            # var languages = '';
            # $x("//a[starts-with(@href, '/6/stopwords-json/blob/master/dist/')]/@href").forEach(function(el) {
            #   var code = el.textContent.replace('/6/stopwords-json/blob/master/dist/', '').replace('.json', '');
            #   languages = languages + "'" + code + "',\n";
            # });
            # console.log(languages);
            'af',
            'ar',
            'bg',
            'bn',
            'br',
            'ca',
            'cs',
            'da',
            'de',
            'el',
            'en',
            'eo',
            'es',
            'et',
            'eu',
            'fa',
            'fi',
            'fr',
            'ga',
            'gl',
            'ha',
            'he',
            'hi',
            'hr',
            'hu',
            'hy',
            'id',
            'it',
            'ja',
            'ko',
            'la',
            'lv',
            'mr',
            'nl',
            'no',
            'pl',
            'pt',
            'ro',
            'ru',
            'sk',
            'sl',
            'so',
            'st',
            'sv',
            'sw',
            'th',
            'tr',
            'yo',
            'zh',
            'zu',
        ]
        if language_code not in allowed_language:
            msg = f'Wrong language for {self.name} processor: {language_code}, allowed only: {allowed_language}'
            raise TDCValueError(msg)

        # Validate every argument before any network or disk I/O happens.
        allowed = ['remove_line', 'replace']
        if mode not in allowed:
            raise TDCValueError(
                f'Wrong mode for {self.name} processor: {mode}, allowed only: {allowed}'
            )

        self.language_code = language_code
        self.mode = mode
        self.replace_with = replace_with

        url = f'https://raw.githubusercontent.com/6/stopwords-json/master/dist/{self.language_code}.json'
        temp_file = get_temp_file_path()

        # FIXME: write & read? Better download to variable
        download_file(url, temp_file)
        with open(temp_file, encoding='utf-8') as fd:
            raw_stop_words = fd.read()

        # The JSON document is iterated as a collection of strings.
        stop_words = json.loads(raw_stop_words)

        # '|' is stripped from every word, as before. An entry that ends up
        # empty would add an empty alternative matching at every word
        # boundary, so it is discarded.
        stop_words_uniq = {word.replace('|', '') for word in stop_words}
        stop_words_uniq.discard('')

        # Set iteration order over strings is not stable between runs, and
        # alternation order decides which word wins when one stop word is a
        # prefix of another. Sorting longest-first makes the pattern
        # deterministic and prefers the longest match.
        ordered_words = sorted(stop_words_uniq, key=lambda word: (-len(word), word))

        # Escape each word so regex metacharacters inside a stop word are
        # matched literally instead of altering (or breaking) the pattern.
        alternation = '|'.join(re.escape(word) for word in ordered_words)
        stop_words_regex = rf'\b({alternation})\b'
        self.stop_words_re = re.compile(stop_words_regex,
                                        flags=re.UNICODE | re.IGNORECASE)