def get_datetime(self, line: str):
    """Extract a ``DDMMYYYY`` date from the first digit run of *line*.

    The line is filtered through ``clean_string`` with
    ``self.ALLOWED_CHARS_ALPHANUM`` and every run of 1-8 digits is
    collected. The first run is only considered when at least two runs
    were found.

    Args:
        line: Raw text that may contain a date.

    Returns:
        A ``datetime`` parsed with the ``%d%m%Y`` format, or ``None`` when
        fewer than two digit runs exist, the first run is shorter than
        eight digits, or the digits do not form a valid calendar date.
    """
    line_clean = clean_string(line, charset=self.ALLOWED_CHARS_ALPHANUM)
    date_items = re.findall(r"\d{1,8}", line_clean)
    if len(date_items) < 2:
        return None

    # The regex only yields digits, so just the length needs checking.
    date_value = date_items[0]
    if len(date_value) < 8:
        return None

    try:
        return datetime.strptime(date_value, '%d%m%Y')
    except ValueError:
        # Eight digits that are not a real date (e.g. day 99) are treated
        # like any other unparseable line instead of raising.
        return None
def _is_words_and_pay_line(self, first_desire_word: str, line: str) -> bool:
    """Check whether *line* is "word(s) + amount" starting with a given word.

    The line is normalised with ``self._remove_spaces_and_to_lower`` and
    ``clean_string`` (lower-case letters, digits and ``.`` only). It must
    contain letters followed by either a two-decimal number or at least
    three digits. The text before that number is then compared with
    *first_desire_word* using ``self._is_it_similar`` and, failing that,
    ``self._is_it_phonetic_similar``.

    Args:
        first_desire_word: Word the line is expected to start with.
        line: Raw line to inspect.

    Returns:
        ``False`` when the shape check fails or no leading word is found;
        otherwise the result of the similarity helpers.
    """
    normalized = clean_string(
        self._remove_spaces_and_to_lower(line),
        charset="abcdefghijklmnopqrstuvwxyz0123456789.")

    # The body must be letters followed by a decimal amount or 3+ digits.
    body_match = (re.search(r"[a-z]+\d+\.\d{2}", normalized)
                  or re.search(r"[a-z]+\d{3,}", normalized))
    if not body_match:
        return False

    # Reject lines that are only letters, only digits, or a bare decimal.
    for rejected_shape in (r"^[a-z]+$", r"^\d+$", r"^\d+\.\d+$"):
        if re.search(rejected_shape, normalized):
            return False

    # Everything before the number is the candidate leading word.
    if "." in normalized:
        number_pattern = r"\d+\.\d{2}"
    else:
        number_pattern = r"\d{3,}"
    leading_word = re.split(number_pattern, normalized)[0]
    if not leading_word:
        return False

    similar = self._is_it_similar(leading_word, first_desire_word)
    if similar:
        return similar
    return self._is_it_phonetic_similar(leading_word, first_desire_word)
def clean_line(self, line: str, charset: str = None) -> str:
    """Upper-case *line* and filter it through ``clean_string``.

    Args:
        line: Text to clean.
        charset: Characters passed to ``clean_string`` as ``charset``.
            When omitted or empty, ``A-Z`` and ``0-9`` are used.

    Returns:
        Whatever ``clean_string`` returns for the upper-cased line.
    """
    upper_line = line.upper()
    if charset:
        allowed_chars = charset
    else:
        allowed_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
    return clean_string(upper_line, charset=allowed_chars)
def get_amount_format(self, line):
    """Format the first number found in *line* with two decimal places.

    The line is filtered through ``clean_string`` with
    ``self.ALLOWED_CHARS_ALPHANUM`` and passed to ``self.get_numbers``.
    A ``.`` is inserted before the last two characters of the first
    number's string form.

    Args:
        line: Raw text expected to contain an amount.

    Returns:
        The amount as a string with a ``.`` before its last two digits
        (``"1234"`` -> ``"12.34"``), or ``None`` when ``get_numbers``
        returns nothing.
    """
    value = clean_string(line, charset=self.ALLOWED_CHARS_ALPHANUM)
    amount = self.get_numbers(value)
    if not amount:
        return None

    # Pad to two digits so a single digit becomes ".05" rather than ".5":
    # without padding the insert point would be -1 and the slice would put
    # the digit in the tenths place.
    amount_str = str(amount[0]).zfill(2)
    insert_point = len(amount_str) - 2
    return amount_str[:insert_point] + "." + amount_str[insert_point:]
def get_next_line(self, list_data: list):
    """Pop the first entry of *list_data* and wrap it in a ``Line``.

    Args:
        list_data: List of mapping entries read via ``.get("text")`` and
            ``.get("uuid")``. The first entry is removed from the list.

    Returns:
        A ``Line`` built from the popped entry (raw text, upper-cased
        cleaned text and raw text length), or ``Line(uuid="-1")`` when
        the list is empty.
    """
    if len(list_data) == 0:
        # Nothing left to consume: hand back the placeholder line.
        return Line(uuid="-1")

    entry = list_data.pop(0)
    raw_text = entry.get("text")
    entry_uuid = entry.get("uuid")
    cleaned = clean_string(raw_text, charset=self.ALLOWED_CHARS_ALPHANUM)
    return Line(
        uuid=entry_uuid,
        text=raw_text,
        text_clean=cleaned.upper(),
        text_len=len(raw_text),
    )
def get_date(self, line: str) -> str:
    """Parse a date and time such as ``FECHA 26/11/2020 09:35:53``.

    The line is upper-cased and reduced to digits and the separators
    `` :-/+.``. Date tokens come from splitting on ``-``, ``/``,
    whitespace or ``+``. Time tokens come from splitting the output of
    ``self._translate_line`` on ``:`` or ``.``; the last three are used
    as hour, minute and second.

    NOTE: despite the ``-> str`` annotation, this returns a ``datetime``
    or ``None``.

    Args:
        line: Raw text containing the date and time.

    Returns:
        A ``datetime`` built from the parsed parts, or ``None`` when the
        line is empty or any part is missing or invalid.
    """
    if line == "":
        return None

    cleaned_line = clean_string(line.upper(), charset="0123456789 :-/+.")
    date = re.split(r'-|/|\s|\+', cleaned_line)
    time_items = re.split(r":|\.", self._translate_line(cleaned_line))

    # With four or more tokens the final one is presumably the time, so the
    # year sits second to last; otherwise the year is the last token.
    # Month and day are the two tokens just before the year.
    year_index = -2 if len(date) >= 4 else -1

    try:
        # NOTE(review): a two-digit year is used as-is (e.g. 20, not 2020).
        year = re.findall(r"\d{4}|\d{2}", date[year_index])[0]
        return datetime(
            year=int(year),
            month=int(date[year_index - 1]),
            day=int(date[year_index - 2]),
            hour=int(time_items[-3]),
            minute=int(time_items[-2]),
            second=int(time_items[-1]),
        )
    except (ValueError, IndexError):
        # Missing tokens, non-numeric parts or out-of-range values all mean
        # the line holds no usable date.
        return None
def get_quantity_without_text(self, text: str, number_zero: int):
    """Filter *text* to digits, dots and spaces, then format the quantity.

    Args:
        text: Raw text containing a quantity.
        number_zero: Forwarded unchanged to ``self.format_quantity``.

    Returns:
        Whatever ``self.format_quantity`` returns for the filtered text.
    """
    numeric_text = clean_string(text, "0123456789. ")
    formatted = self.format_quantity(numeric_text, number_zero)
    return formatted
def clean_headers(headers):
    """Clean each header and convert it with ``title_to_snake``.

    Every header is first passed through ``clean_string``; the cleaned
    values are then passed through ``title_to_snake``.

    Args:
        headers: Iterable of header values.

    Returns:
        A list of converted headers in the original order.
    """
    # Two separate passes: all headers are cleaned before any is converted.
    cleaned = list(map(clean_string, headers))
    return list(map(title_to_snake, cleaned))