def __init__(
    self,
    language_code: str,
    threshold: float = 0.9,
    model_path: str = '',
    model_url: str = '',
    delimiter: Optional[str] = None,
    delimited_position: int = -1,
):
    """Store the processor settings and load the language-identification model.

    Args:
        language_code: Stored on the instance as ``language_code``.
        threshold: Stored on the instance as ``threshold``.
        model_path: Local path of the model file. When empty, a path from
            ``get_temp_file_path()`` is used instead.
        model_url: Where to download the model from when it is not present
            at ``model_path``. When empty, the public ``lid.176.bin`` URL
            is used.
        delimiter: Stored on the instance as ``delimiter``.
        delimited_position: Stored on the instance as ``delimited_position``.
    """
    self.language_code = language_code
    self.threshold = threshold
    self.delimiter = delimiter
    self.delimited_position = delimited_position

    # Default model, see https://fasttext.cc/docs/en/language-identification.html
    default_model_url = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin'
    self.model_url = model_url or default_model_url

    # TODO: log model_path
    self.model_path = model_path or get_temp_file_path()

    # Download only when nothing exists at the chosen path yet.
    model_is_missing = not os.path.exists(self.model_path)
    if model_is_missing:
        download_file(self.model_url, self.model_path)

    self.ft = load_model(self.model_path)
def file_processing(self, stage: str) -> None:
    """Run every file-level processor configured for ``stage``, in order.

    Each processor reads ``self.input_file`` and writes to a fresh temporary
    path; on success that path becomes ``self.input_file`` for the next
    processor.

    Args:
        stage: Key into ``self.config`` whose value lists processor names.

    Raises:
        TDCOSError: A processor raised ``OSError`` (chained as the cause).
        TDCRuntimeError: A processor returned a falsy result.
    """
    # TODO: tqdm + logger
    for processor_name in self.config[stage]:
        processor = processors_dict[processor_name]()
        temp_file_path = get_temp_file_path(self.config)
        try:
            result = processor.process_file(self.input_file, temp_file_path)
        except OSError as exc:
            # TODO: logging
            try:
                os.remove(temp_file_path)
            except FileNotFoundError:
                # The processor failed before creating its output file.
                # Nothing to clean up, and the cleanup must not replace
                # the original error with an unrelated FileNotFoundError.
                pass
            raise TDCOSError(exc) from exc
        if not result:
            raise TDCRuntimeError(f'After "{stage}" stage by "{processor_name}" processor result file is empty')
        # TODO: log lines count
        self._remove_previous_temp(temp_file_path)
        self.input_file = temp_file_path
def line_processing(self) -> None:
    """Pipe every line of ``self.input_file`` through the line processors.

    Lines are read from ``self.input_file``, passed through each processor
    returned by ``self._get_line_processors()`` in order, and written one
    per line to a fresh temporary file. A line is dropped as soon as a
    processor returns an empty string or ``None``. Afterwards
    ``self.input_file`` points at the temporary file.
    """
    temp_file_path = get_temp_file_path(self.config)
    processors = self._get_line_processors()
    # TODO: codecs?
    with open(self.input_file, encoding='utf-8') as fdr, \
            open(temp_file_path, 'w', encoding='utf-8') as fdw:
        # TODO: tqdm + logger.debug
        for raw_line in fdr:
            # File iteration keeps the trailing newline. Strip it so blank
            # lines are really skipped and the write below does not emit a
            # second newline per line.
            line = raw_line.rstrip('\n')
            if not line:
                continue
            for proc in processors:
                line = proc.process_line(line)  # type: ignore
                # TODO: log processed line
                if not line:  # empty or is None
                    break
            # save after all processors
            if line:  # not empty and is not None
                fdw.write(f'{line}\n')
    # TODO: check need remove old input_file
    self.input_file = temp_file_path
def __init__(self, language_code: str, mode: str, replace_with: str = ' '):
    """Validate the settings, download the stop-word list and compile it.

    Args:
        language_code: Language of the stop-word list; must be one of the
            codes in ``allowed_language`` below.
        mode: Either ``'remove_line'`` or ``'replace'``.
        replace_with: Stored on the instance as ``replace_with``.

    Raises:
        TDCValueError: ``language_code`` or ``mode`` is not an allowed value.
    """
    allowed_language = [
        # https://github.com/6/stopwords-json/tree/master/dist
        # Run in Dev Browser Console:
        # var languages = '';
        # $x("//a[starts-with(@href, '/6/stopwords-json/blob/master/dist/')]/@href").forEach(function(el) {
        #     var code = el.textContent.replace('/6/stopwords-json/blob/master/dist/', '').replace('.json', '');
        #     languages = languages + "'" + code + "',\n";
        # });
        # console.log(languages);
        'af', 'ar', 'bg', 'bn', 'br', 'ca', 'cs', 'da', 'de', 'el',
        'en', 'eo', 'es', 'et', 'eu', 'fa', 'fi', 'fr', 'ga', 'gl',
        'ha', 'he', 'hi', 'hr', 'hu', 'hy', 'id', 'it', 'ja', 'ko',
        'la', 'lv', 'mr', 'nl', 'no', 'pl', 'pt', 'ro', 'ru', 'sk',
        'sl', 'so', 'st', 'sv', 'sw', 'th', 'tr', 'yo', 'zh', 'zu',
    ]
    if language_code not in allowed_language:
        msg = f'Wrong language for {self.name} processor: {language_code}, allowed only: {allowed_language}'
        raise TDCValueError(msg)

    # Check the mode before any network work so a bad argument fails fast
    # and does not leave a downloaded temp file behind.
    allowed = ['remove_line', 'replace']
    if mode not in allowed:
        raise TDCValueError(
            f'Wrong mode for {self.name} processor: {mode}, allowed only: {allowed}'
        )

    self.language_code = language_code
    self.mode = mode
    self.replace_with = replace_with

    url = f'https://raw.githubusercontent.com/6/stopwords-json/master/dist/{self.language_code}.json'
    temp_file = get_temp_file_path()
    # FIXME: write & read? Better download to variable
    download_file(url, temp_file)
    with open(temp_file, encoding='utf-8') as fd:
        raw_words = json.load(fd)

    # '|' is still removed from words (as before). Words that end up empty
    # are dropped: an empty alternative would match at every word boundary.
    unique_words = {word.replace('|', '') for word in raw_words}
    unique_words.discard('')

    # Longest first, then alphabetical: the pattern is the same on every run
    # and an entry like "don't" is tried before its prefix "don".
    ordered_words = sorted(unique_words, key=lambda word: (-len(word), word))

    # Escape each word so regex metacharacters in the list match literally.
    # '(?!)' never matches; it keeps the pattern valid for an empty list.
    alternation = '|'.join(re.escape(word) for word in ordered_words) or '(?!)'
    stop_words_regex = rf'\b({alternation})\b'
    self.stop_words_re = re.compile(stop_words_regex, flags=re.UNICODE | re.IGNORECASE)