Пример #1
0
 def __init__(self, path_: Union[str, Path], content_file: Union[Sequence, str, Any] = None):
     """
     Set up the file object from a path and optional initial content.

     :param path_: file location; anything that is not already a ``Path`` is passed
         through ``normalize_path`` and wrapped in ``Path``.
     :param content_file: initial content; a falsy value is replaced by an empty list
         before being handed to ``normalize_data``.
     """
     self._last_update = None
     self.__path = path_ if isinstance(path_, Path) else Path(normalize_path(path_))
     if self.__path.exists:
         # an already existing target must not be a directory
         assert not self.__path.is_dir, f'Path is a directory. Change your path. {self.__path}'
         self._last_update = self.updated_at
     if self._allowed_ext:
         assert self.ext in self._allowed_ext, ValueError(
             f'type of file {self.ext} not allowed. Only allowed {self._allowed_ext}')
     assert self.ext != '', ValueError(
         'You need to inform the file extension on path e.g (.json, .txt, .xyz, etc.).')
     self._lines = self.normalize_data(content_file or [])
     if self.is_empty:
         self._new_line_sep = self._default_new_line_sep
     else:
         self.set_new_line_sep(self._default_new_line_sep)
     # change-history bookkeeping; the initial content becomes the first entry
     self._current_change = 0
     self._max_history_length = 50
     self._change_history = []
     self._set_change('_lines', self._lines.copy())
Пример #2
0
 def __init__(self,
              path_: Union[str, Path],
              content_file: Union[Sequence, str, Any] = None,
              **kwargs):
     """
     Set up the file object from a path and optional initial content.

     :param path_: file location; anything that is not already a ``Path`` is passed
         through ``normalize_path`` and wrapped in ``Path``.
     :param content_file: initial content; a falsy value is replaced by an empty list
         before being handed to ``normalize_data``.
     :param kwargs:
         is_byte: flag stored as ``_is_byte`` (default False); editing is enabled
         only when it is falsy.
     """
     self._last_update = None
     self._is_byte = kwargs.get('is_byte', False)
     self._can_edit = not self._is_byte
     self.__path = path_ if isinstance(path_, Path) else Path(normalize_path(path_))
     if self.__path.exists:
         # an already existing target must not be a directory
         assert not self.__path.is_dir, f'Path is a directory. Change your path. {self.__path}'
         self._last_update = self.updated_at
     if self._allowed_ext:
         assert self.ext in self._allowed_ext, ValueError(
             f'type of file {self.ext} not allowed. Only allowed {self._allowed_ext}'
         )
     self._lines = self.normalize_data(content_file or [])
     if self.is_empty:
         self._new_line_sep = self._default_new_line_sep
     else:
         self.set_new_line_sep(self._default_new_line_sep)
     # change-history bookkeeping; the initial content becomes the first entry
     self._current_change = 0
     self._max_history_length = 50
     self._change_history = []
     self._set_change('_lines', self._lines.copy())
Пример #3
0
 def test_sanity(self):
     """Check name and parent lookup, equality with a string, containment and joining."""
     p_test = 'cereja/test/sanity'
     p = Path(p_test)
     # assertTrue(value, msg) only checks that ``value`` is truthy and treats the second
     # argument as the failure message, so the names must be compared with assertEqual.
     self.assertEqual(p.name, 'sanity')
     self.assertEqual(p.parent.name, 'test')
     self.assertTrue(p == p_test)
     self.assertTrue('sanity' in p)
     p = p + ['con', 'cat']
     p_test = Path('cereja/test/sanity').join('con', 'cat')
     self.assertTrue(p == p_test)
Пример #4
0
    def __new__(cls, path_: str, *args,
                **kwargs) -> Union[FileBase, JsonFile, CsvFile]:
        """
        Build the file object that matches the extension of ``path_``.

        :param path_: File path
        :param args: when given, the first positional value is used as the content
        :param kwargs:
            content_file: file content depends on the type of file
            data: file content depends on the type of file (cannot be combined with content_file)
            fieldnames: only .csv data.
        :raises ValueError: if both ``content_file`` and ``data`` are passed as keywords
        """
        path_ = Path(path_)
        if args:
            content_file = args[0]
        else:
            if 'content_file' in kwargs and 'data' in kwargs:
                raise ValueError("Cannot send content_file and data")
            content_file = kwargs.get('content_file') or kwargs.get('data')

        if path_.exists:
            logger.warning(
                f'You are creating a new file, but file path {str(path_)} already exists. \nIf you want to read or '
                f'write the content of file use <File.read>')

        # dispatch on the extension; unknown extensions fall back to FileBase
        suffix = path_.suffix
        if suffix == '.csv':
            return CsvFile(path_, fieldnames=kwargs.get('fieldnames'), data=content_file)
        if suffix == '.json':
            return JsonFile(path_, data=content_file)
        if suffix == '.txt':
            return TxtFile(path_, content_file=content_file)
        return FileBase(path_=path_, content_file=content_file)
Пример #5
0
 def load_files(cls, path_, ext, contains_in_name: List = (), not_contains_in_name=(), take_empty=True,
                recursive=False):
     """
     Load every file under ``path_`` that matches the extension and name filters.

     :param path_: a file or a directory; a directory is expanded with ``listdir``.
     :param ext: wanted extension, with or without the leading dot.
     :param contains_in_name: keep a file only if its name (without extension) contains
         at least one of these strings. Empty means no filtering.
     :param not_contains_in_name: drop a file if its name (without extension) contains
         any of these strings. Empty means no filtering.
     :param take_empty: when falsy, files whose ``is_empty`` is true are skipped.
     :param recursive: when truthy, sub-directories are searched with the same filters.
     :return: list of loaded file objects.
     """
     path_ = Path(path_)
     if path_.is_dir:
         path_ = [i for i in listdir(path_)]
     if not isinstance(path_, list):
         path_ = [path_]
     wanted_ext = f'.{ext.strip(".")}'  # normalise once instead of on every iteration
     loaded = []
     for p in path_:
         if recursive and p.is_dir:
             # forward every filter: the previous call passed only (p, ext), so nested
             # directories ignored the name filters and were searched one level deep only
             loaded.extend(cls.load_files(p, ext,
                                          contains_in_name=contains_in_name,
                                          not_contains_in_name=not_contains_in_name,
                                          take_empty=take_empty,
                                          recursive=recursive))
             continue
         if not p.exists or p.is_dir:
             continue
         file_ = cls.load(p)
         if file_ is None:
             continue
         # the previous condition was inverted: it skipped empty files when take_empty was True
         if not take_empty and file_.is_empty:
             continue
         if file_.ext != wanted_ext:
             continue
         name = file_.file_name_without_ext
         if contains_in_name and not any(token in name for token in contains_in_name):
             continue
         if not_contains_in_name and any(token in name for token in not_contains_in_name):
             continue
         loaded.append(file_)
     return loaded
Пример #6
0
    def load(cls, path_: Union[str, Path], **kwargs):
        """
        Read a file from disk and wrap its content in a new instance of ``cls``.

        :param path_: File Path
        :param kwargs:
                encoding: utf-8 is default
                mode: r+ is default
                newline: '' is default
                any remaining keyword is forwarded to ``open``
        :return: File object, or None when the extension is listed in ``_dont_read``
                 or a PermissionError occurs while reading
        """
        path_ = Path(path_)
        assert path_.exists, FileNotFoundError('No such file', path_)
        open_options = {
            'mode': kwargs.pop('mode', 'r+'),
            'encoding': kwargs.pop('encoding', 'utf-8'),
            'newline': kwargs.pop('newline', ''),
        }
        if path_.suffix in cls._dont_read:
            logger.warning(f"I can't read this file. See class attribute <{cls.__name__}._dont_read>")
            return None
        try:
            with open(path_, **open_options, **kwargs) as fp:
                content = fp.read()
        except PermissionError as err:
            logger.error(err)
            return None

        return cls(path_, content)
Пример #7
0
 def load(cls, path_: Union[str, Path], **kwargs):
     """
     Parse a ``.json`` file and wrap the result in a new instance of ``cls``.

     :param path_: File path; must exist and have the ``.json`` suffix.
     :param kwargs:
             encoding: utf-8 is default
             any remaining keyword is forwarded to ``open``
     :return: ``cls(path_, data=<parsed json>)``
     """
     encoding = kwargs.pop('encoding', 'utf-8')
     path_ = Path(path_)
     assert path_.exists, FileNotFoundError('No such file', path_)
     assert path_.suffix == '.json', "isn't .json file."
     with open(path_, encoding=encoding, **kwargs) as fp:
         parsed = json.load(fp)
     return cls(path_, data=parsed)
Пример #8
0
    def save_freq(self, save_on: str, prefix='freq', ext: str = 'json', probability=False):
        """
        Write the word and phrase frequencies into ``save_on``.

        Two files are produced, ``<prefix>_words.<ext>`` and ``<prefix>_phrases.<ext>``,
        each through the ``to_json`` method of the matching ``*_freq`` attribute
        with ``exist_ok=True``.

        :param save_on: directory path the files are joined onto.
        :param prefix: leading part of both file names.
        :param ext: file extension; surrounding dots are stripped.
        :param probability: forwarded unchanged to ``to_json``.
        """
        ext = ext.strip('.')  # normalize
        save_on = Path(save_on)

        for kind in ('words', 'phrases'):
            target = save_on.join(f'{prefix}_{kind}.{ext}')
            getattr(self, f'{kind}_freq').to_json(target, probability=probability, exist_ok=True)
Пример #9
0
 def load(cls, path_: str, has_col=True, encoding='utf-8', **kwargs):
     """
     Read a ``.csv`` file and wrap its rows in a new instance of ``cls``.

     :param path_: File path; must exist and have the ``.csv`` suffix.
     :param has_col: when truthy, the first row is taken as the field names.
     :param encoding: encoding used to open the file.
     :param kwargs: accepted but not used here.
     :raises ValueError: if the suffix is not ``.csv``.
     :return: ``cls(path_, fieldnames=<first row or None>, data=<remaining rows>)``
     """
     path_ = Path(path_)
     assert path_.exists, FileNotFoundError('No such file', path_)
     if path_.suffix != '.csv':
         raise ValueError("isn't .csv file.")
     with open(path_.path, encoding=encoding, newline='') as fp:
         rows = csv.reader(fp)
         header = next(rows) if has_col else None
         body = list(rows)
     return cls(path_, fieldnames=header, data=body)
Пример #10
0
    def load(cls, path_: str, **kwargs):
        """
        Read a file and build the instance that matches its extension.

        The loader class is looked up by suffix in ``__classes_by_ext``;
        ``FileBase`` is used when the suffix is not registered.

        :param path_: File path
        :param kwargs:
                encoding: utf-8 is default
                mode: r+ is default
                newline: '' is default
                any remaining keyword is forwarded to the selected ``load``
        :return: whatever the selected class' ``load`` returns
        """
        encoding = kwargs.pop('encoding', 'utf-8')
        mode = kwargs.pop('mode', 'r+')
        newline = kwargs.pop('newline', '')
        path_ = Path(path_)
        assert path_.exists, FileNotFoundError('No such file', path_)
        loader_cls = cls.__classes_by_ext.get(path_.suffix, FileBase)
        return loader_cls.load(path_=path_, mode=mode, encoding=encoding, newline=newline, **kwargs)
Пример #11
0
    def save(self, save_on_dir: str, take_split: bool = True, test_max_size: int = None, source_vocab_size: int = None,
             target_vocab_size: int = None, shuffle=True, prefix=None, ext='align', **kwargs):
        """
        Write the source and target data as files inside ``save_on_dir``.

        With ``take_split`` the data is first divided by ``split_data`` and four files are
        written (train/test for each language); otherwise the full source and target data
        are written as two files. File names are ``<prefix>_<language>.<ext>``.

        :param save_on_dir: directory path the file names are joined onto.
        :param take_split: write train/test splits instead of the full data.
        :param test_max_size: forwarded to ``split_data``.
        :param source_vocab_size: forwarded to ``split_data``.
        :param target_vocab_size: forwarded to ``split_data``.
        :param shuffle: forwarded to ``split_data``.
        :param prefix: optional leading part of the file names; in split mode it is
            combined with ``train``/``test``.
        :param ext: file extension; surrounding dots are stripped.
        :param kwargs: forwarded to each ``File.save`` call.
        """
        save_on_dir = Path(save_on_dir)
        if take_split:
            x_train, y_train, x_test, y_test = self.split_data(test_max_size=test_max_size,
                                                               source_vocab_size=source_vocab_size,
                                                               target_vocab_size=target_vocab_size,
                                                               take_parallel_data=False,
                                                               shuffle=shuffle)
            train_prefix, test_prefix = (f'{prefix}_train', f'{prefix}_test') if prefix is not None else (
                'train', 'test')
            data_to_save = ((train_prefix, x_train, y_train), (test_prefix, x_test, y_test))
        else:
            data_to_save = ((prefix, self.source.data, self.target.data),)

        ext = ext.strip(".")  # normalise once
        for name_prefix, x, y in data_to_save:
            # Without a prefix the name used to be rendered as "None_<lang>.<ext>";
            # leave the prefix out instead, as the split branch already does.
            lead = '' if name_prefix is None else f'{name_prefix}_'
            save_on = save_on_dir.join(f'{lead}{self.source_language}.{ext}')
            File(save_on, content_file=x).save(**kwargs)
            save_on = save_on_dir.join(f'{lead}{self.target_language}.{ext}')
            File(save_on, content_file=y).save(**kwargs)
Пример #12
0
class FileBase(metaclass=ABCMeta):
    """
    High-level API for creating and manipulating files.

    The content is kept in memory as a list of lines (``_lines``) and written to disk
    only by ``save``. Attribute assignments and line edits are recorded in a bounded
    history that ``undo``/``redo`` navigate.
    """
    # divisor used by ``size`` for each supported unit
    __size_map = {"B": 1.e0, "KB": 1.e3, "MB": 1.e6, "GB": 1.e9, "TB": 1.e12}

    _new_line_sep_map = _NEW_LINE_SEP_MAP.copy()
    _str_new_line_sep_map = _STR_NEW_LINE_SEP_MAP.copy()
    _default_new_line_sep = DEFAULT_NEW_LINE_SEP
    _dont_read = [".pyc"]  # suffixes that ``load`` refuses to read
    _ignore_dir = [".git"]
    _allowed_ext = ()  # when non-empty, ``__init__`` asserts the extension is listed here
    _date_format = "%Y-%m-%d %H:%M:%S"
    _is_deleted = False

    def __init__(self,
                 path_: Union[str, Path],
                 content_file: Union[Sequence, str, Any] = None,
                 **kwargs):
        """
        Set up the file object from a path and optional initial content.

        :param path_: file location; anything that is not already a ``Path`` is passed
            through ``normalize_path`` and wrapped in ``Path``.
        :param content_file: initial content; a falsy value is replaced by an empty list
            before being handed to ``normalize_data``.
        :param kwargs:
            is_byte: flag stored as ``_is_byte`` (default False); editing is enabled
            only when it is falsy.
        """
        self._last_update = None
        self._is_byte = kwargs.get('is_byte', False)
        self._can_edit = not self._is_byte
        if isinstance(path_, Path):
            self.__path = path_
        else:
            self.__path = Path(normalize_path(path_))
        if self.__path.exists:
            assert not self.__path.is_dir, f'Path is a directory. Change your path. {self.__path}'
            self._last_update = self.updated_at
        if self._allowed_ext:
            assert self.ext in self._allowed_ext, ValueError(
                f'type of file {self.ext} not allowed. Only allowed {self._allowed_ext}'
            )
        if not content_file:
            content_file = []
        self._lines = self.normalize_data(content_file)
        if not self.is_empty:
            line_sep_ = self._default_new_line_sep
            self.set_new_line_sep(line_sep_)
        else:
            self._new_line_sep = self._default_new_line_sep
        # history bookkeeping; the initial content becomes the first entry
        self._current_change = 0
        self._max_history_length = 50
        self._change_history = []
        self._set_change('_lines', self._lines.copy())

    def __setattr__(self, key, value):
        """
        Assign the attribute and, once the history exists, record the assignment.

        The three bookkeeping attributes of the history itself are never recorded.
        """
        object.__setattr__(self, key, value)
        if hasattr(self, '_change_history') and key not in (
                '_current_change', '_max_history_length', '_change_history'):
            self._set_change(key, object.__getattribute__(
                self, key))  # append last_value of attr

    def __sizeof__(self):
        """Return the size of the content string minus the size of an empty string."""
        return self.string.__sizeof__() - ''.__sizeof__(
        )  # subtracts the size of the python string object

    def __str__(self):
        return f'{self.__class__.__name__}<{self.file_name}>'

    def __repr__(self):
        return f'{self.__str__()}'

    def __getitem__(self, item) -> str:
        """Return the line (or slice of lines) at ``item``."""
        return self._lines[item]

    def __setitem__(self, key, value):
        """
        Insert ``value`` at line ``key`` through ``_insert``.

        :raises ValueError: if ``key`` is a tuple.
        """
        if isinstance(key, Tuple):
            raise ValueError("invalid assignment.")
        self._insert(key, value)

    def __iter__(self):
        """Iterate over the lines."""
        for i in self._lines:
            yield i

    def __len__(self):
        # NOTE: this is the value of ``__sizeof__``, not the number of lines (see ``n_lines``)
        return self.__sizeof__()

    def _set_change(self, key, value):
        """
        Append ``(key, value)`` to the history.

        Entries after the current position are discarded first, and the oldest entry is
        dropped when the history already holds ``_max_history_length`` items.
        """
        self._change_history = self._change_history[:self._current_change + 1]
        if len(self._change_history) >= self._max_history_length:
            self._change_history.pop(0)
        self._change_history.append((key, value))
        self._current_change = len(self._change_history)

    def _select_change(self, index):
        """
        Re-apply the history entry ``index`` steps away from the current position.

        The stored value is shallow-copied before being assigned, and the assignment
        bypasses ``__setattr__`` so it is not recorded as a new change.
        """
        try:

            key, value = self._change_history[self._current_change + index]
            object.__setattr__(self, key, copy.copy(value))
            self._current_change += index
            logger.warning(
                f'You selected amendment {self._current_change + 1}')
        except IndexError:
            logger.info("It's not possible")

    def _save(self, encoding='utf-8', **kwargs):
        """
        Write the content to ``self.path`` and refresh ``_last_update``.

        Byte files are written in binary mode using the first element of ``_lines``;
        otherwise the joined ``string`` is written as text.

        :param encoding: text encoding; ignored for byte files.
        :param kwargs: forwarded unchanged to ``open``.
        """
        encoding = None if self._is_byte else encoding
        newline = None if self._is_byte else ''
        mode = 'w+b' if self._is_byte else 'w'
        content = self._lines[0] if self._is_byte else self.string
        with open(self.path,
                  mode=mode,
                  newline=newline,
                  encoding=encoding,
                  **kwargs) as fp:
            fp.write(content)
        self._last_update = self.updated_at

    @property
    def history(self):
        """The recorded ``(attribute, value)`` changes."""
        return self._change_history

    @property
    def data(self) -> Union[List[str], dict]:
        """Alias of ``lines``."""
        return self.lines

    @property
    def lines(self) -> List[str]:
        """A copy of the lines."""
        return self._lines.copy()

    @property
    def string(self) -> str:
        """The lines joined with the current line separator."""
        return f'{self._new_line_sep}'.join(self._lines)

    @property
    def content_str(self):
        """Same value as ``string``; emits a DeprecationWarning."""
        warnings.warn(
            f"This property will be deprecated in future versions. "
            "you can use property `File.string`", DeprecationWarning, 2)
        return f'{self._new_line_sep}'.join(self._lines)

    @property
    def content_file(self) -> List[str]:
        """The internal list of lines (not a copy); emits a DeprecationWarning."""
        warnings.warn(
            f"This property will be deprecated in future versions. "
            "you can use property `File.lines`", DeprecationWarning, 2)
        return self._lines

    @property
    def base64(self):
        """The encoded ``string`` as base64 bytes."""
        return b64encode(self.string.encode())

    @property
    def path(self):
        return self.__path

    @property
    def file_name(self):
        return self.__path.name

    @property
    def file_name_without_ext(self):
        return self.__path.stem

    @property
    def n_lines(self):
        """Number of lines held in memory."""
        return len(self._lines)

    @property
    def is_empty(self):
        """True when there are no lines."""
        return not bool(self.n_lines)

    @property
    def dir_name(self):
        return self.__path.parent.name

    @property
    def dir_path(self):
        return self.__path.parent.path

    @property
    def is_link(self):
        return self.__path.is_link

    @property
    def ext(self):
        """The suffix of the path."""
        return self.__path.suffix

    @property
    def updated_at(self):
        """``st_mtime`` of the file on disk, formatted with ``_date_format``."""
        return datetime.fromtimestamp(os.stat(str(
            self.path)).st_mtime).strftime(self._date_format)

    @property
    def created_at(self):
        """``st_ctime`` of the file on disk, formatted with ``_date_format``."""
        return datetime.fromtimestamp(os.stat(str(
            self.path)).st_ctime).strftime(self._date_format)

    @property
    def last_access(self):
        """``st_atime`` of the file on disk, formatted with ``_date_format``."""
        return datetime.fromtimestamp(os.stat(str(
            self.path)).st_atime).strftime(self._date_format)

    @property
    def new_line_sep(self) -> str:
        return self._new_line_sep

    @property
    def new_line_sep_repr(self):
        """The entry of ``_new_line_sep_map`` for the current separator."""
        return self._new_line_sep_map[self._new_line_sep]

    @classmethod
    def normalize_unix_line_sep(cls, content: str) -> str:
        """Replace the 'CRLF' and 'CR' separators in ``content`` with the default one."""
        return content.replace(cls._str_new_line_sep_map['CRLF'],
                               cls._default_new_line_sep).replace(
                                   cls._str_new_line_sep_map['CR'],
                                   cls._default_new_line_sep)

    @classmethod
    def normalize_data(cls, data: Any, *args,
                       **kwargs) -> Union[List[str], Any]:
        """
        Convert ``data`` into the internal representation.

        Falsy data is returned unchanged. A sequence becomes a list of strings with line
        separators removed, a string is split into lines, an int becomes its string form
        and bytes are wrapped in a one-element list.

        :raises ValueError: if ``data`` is neither iterable nor an int.
        """
        if not data:
            return data
        if is_iterable(data) or isinstance(data, int):
            if is_sequence(data) and not isinstance(data, int):
                data = [
                    str(line).replace(CRLF, '').replace(CR,
                                                        '').replace(LF, '')
                    for line in data
                ]
            elif isinstance(data, str):
                data = data.splitlines()
            elif isinstance(data, int):
                data = str(data)
            elif isinstance(data, bytes):
                data = [data]
            return data
        else:
            raise ValueError(f"{data} Invalid value. Send other ")

    @classmethod
    def parse_new_line_sep(cls, line: str) -> Union[str, None]:
        """
        Resolve a line separator from ``line``.

        Returns the first key of ``_new_line_sep_map`` contained in ``line``; otherwise
        the value ``_str_new_line_sep_map`` holds for ``line``; otherwise None.
        """
        if is_iterable(line):
            for ln in cls._new_line_sep_map:
                if ln in line:
                    return ln
        try:
            if line in cls._str_new_line_sep_map:
                return cls._str_new_line_sep_map[line]
        except TypeError:
            # unhashable input cannot be a key of the map
            return None
        return None

    @classmethod
    def load(cls, path_: Union[str, Path], **kwargs):
        """
        Read and create new file object.

        If the text read fails with UnicodeDecodeError the file is read again in
        binary mode and the object is created with ``is_byte=True``.

        :param path_: File Path
        :param kwargs:
                encoding: utf-8 is default
                mode: r+ is default
                newline: '' is default
                any remaining keyword is forwarded to ``open``
        :return: File object, or None when the extension is listed in ``_dont_read``
                 or a PermissionError occurs while reading
        """
        path_ = Path(path_)
        assert path_.exists, FileNotFoundError('No such file', path_)
        encoding = kwargs.pop('encoding', 'utf-8')
        mode = kwargs.pop('mode', 'r+')
        newline = kwargs.pop('newline', '')
        if path_.suffix in cls._dont_read:
            logger.warning(
                f"I can't read this file. See class attribute <{cls.__name__}._dont_read>"
            )
            return
        try:
            # binary modes accept neither an encoding nor a newline argument
            encoding = None if 'b' in mode else encoding
            newline = None if 'b' in mode else newline
            with open(path_,
                      mode=mode,
                      encoding=encoding,
                      newline=newline,
                      **kwargs) as fp:
                content = fp.read()
        except PermissionError as err:
            logger.error(err)
            return
        except UnicodeDecodeError:
            encoding = None
            newline = None
            mode = 'r+b'
            with open(path_,
                      mode=mode,
                      encoding=encoding,
                      newline=newline,
                      **kwargs) as fp:
                content = fp.read()
        return cls(path_, content, is_byte='b' in mode)

    @classmethod
    def load_files(cls,
                   path_,
                   ext,
                   contains_in_name: List = (),
                   not_contains_in_name=(),
                   take_empty=True,
                   recursive=False):
        """
        Load every file returned by ``Path.list_files`` that passes the filters.

        :param path_: location handed to ``Path.list_files``.
        :param ext: wanted extension, with or without the leading dot.
        :param contains_in_name: keep a file only if its name (without extension) contains
            at least one of these strings. Empty means no filtering.
        :param not_contains_in_name: drop a file if its name (without extension) contains
            any of these strings. Empty means no filtering.
        :param take_empty: when falsy, files whose ``is_empty`` is true are skipped.
        :param recursive: forwarded to ``Path.list_files``; directories found in the
            listing are searched with the same filters.
        :return: list of loaded file objects.
        """
        path_ = Path.list_files(path_,
                                ext=ext,
                                contains_in_name=contains_in_name,
                                not_contains_in_name=not_contains_in_name,
                                recursive=recursive)
        wanted_ext = f'.{ext.strip(".")}'  # normalise once instead of on every iteration
        loaded = []
        for p in path_:
            if recursive and p.is_dir:
                # forward every filter: the previous call passed only (p, ext)
                loaded.extend(cls.load_files(p, ext,
                                             contains_in_name=contains_in_name,
                                             not_contains_in_name=not_contains_in_name,
                                             take_empty=take_empty,
                                             recursive=recursive))
                continue
            if not p.exists or p.is_dir:
                continue
            file_ = cls.load(p)
            if file_ is None:
                continue
            # the previous condition was inverted: it skipped empty files when take_empty was True
            if not take_empty and file_.is_empty:
                continue
            if file_.ext != wanted_ext:
                continue
            name = file_.file_name_without_ext
            if contains_in_name and not any(token in name for token in contains_in_name):
                continue
            if not_contains_in_name and any(token in name for token in not_contains_in_name):
                continue
            loaded.append(file_)
        return loaded

    @classmethod
    def walk(cls, root_dir: str) -> Iterator[Tuple[str, int, list]]:
        """
        It works in a similar way to os.walk, but yields loaded File instances.

        Symbolic links are skipped, files that ``load`` returns None for are left out,
        and any exception raised while loading a file is logged and ignored.

        :param root_dir: Root directory you want to start browsing
        :return: yields ``(directory base name, number of loaded files, loaded files)``
        """
        for dir_name, _, files in os.walk(root_dir):
            files_ = []
            if files:
                for file_name in files:
                    file_path = os.path.join(dir_name, file_name)
                    if not os.path.islink(file_path):
                        try:
                            file_obj = cls.load(file_path)
                            if file_obj is not None:
                                files_.append(file_obj)
                        except Exception as err:
                            logger.error(
                                f'Error reading the file {file_name}: {err}')
            yield os.path.basename(dir_name), len(files_), files_

    def set_new_line_sep(self, new_line_: str):
        """Set the line separator, falling back to the default when it cannot be parsed."""
        self._new_line_sep = self.parse_new_line_sep(
            new_line_) or self._default_new_line_sep

    def undo(self):
        """Step back in the change history, if there is a previous position."""
        if self._current_change > 0:
            # right after a recorded change the position is one past the last entry,
            # so two steps back are needed to reach the previous state
            index = -2 if self._current_change == len(
                self._change_history) else -1
            self._select_change(index)

    def redo(self):
        """Step forward in the change history, if the position is not at the end."""
        if self._current_change < len(self._change_history):
            self._select_change(+1)

    def set_path(self, path_):
        """Point this object at a new location, wrapping ``path_`` in ``Path``."""
        self.__path = Path(path_)

    def size(self, unit: str = "KB"):
        """
        Return the size of the in-memory content (``__sizeof__``) in the given unit.

        :param unit: choose anyone in ('B', 'KB', 'MB', 'GB', 'TB'); case-insensitive

        """
        assert isinstance(
            unit, str), f"expected {str.__name__} not {type(unit).__name__}."

        unit = unit.upper()

        assert unit in self.__size_map, f"{repr(unit)} is'nt valid. Choose anyone in {tuple(self.__size_map)}"

        return self.__sizeof__() / self.__size_map[unit]

    def _insert(self, line: int, data: Union[Sequence, str, int], **kwargs):
        """
        Insert ``data`` at position ``line``; ``-1`` appends to the end.

        Every edit is recorded exactly once in the history as a copy of the lines.

        :param line: index to insert at, or -1 to append.
        :param data: content to insert; normalised with ``normalize_data`` first.
        """
        assert self._can_edit, "can't edit file type."
        data = self.normalize_data(data, **kwargs)
        if is_sequence(data):
            if line == -1:
                # extend in place: ``self._lines += ...`` re-assigns the attribute, which
                # made ``__setattr__`` store the live list (not a copy) in the history
                self._lines.extend(data)
            else:
                for pos, i in enumerate(data, line):
                    self._lines.insert(pos, i)
        if isinstance(data, str):
            if line == -1:
                self._lines.append(data)
            else:
                self._lines.insert(line, data)
        # no early return above, so appends are recorded like any other edit
        self._set_change('_lines', self._lines.copy())

    def remove(self, line: Union[int, str]):
        """Pop the entry at ``line`` from the lines and record the change."""
        self._lines.pop(line)
        self._set_change('_lines', self._lines.copy())

    def delete(self):
        """Remove the file through ``Path.rm`` and mark this object as deleted."""
        self.__path.rm()
        self._is_deleted = True

    def save(self,
             on_new_path: Union[os.PathLike, None] = None,
             encoding='utf-8',
             exist_ok=False,
             overwrite=False,
             **kwargs):
        """
        Write the content to disk.

        :param on_new_path: when given, the object is re-pointed to this path before writing.
        :param encoding: text encoding handed to ``_save``.
        :param exist_ok: must be true to write when the current path already exists.
        :param overwrite: when False, saving is refused if the modification time on disk
            differs from the one last seen by this object.
        :param kwargs: forwarded to ``_save`` (and from there to ``open``).
        :raises AssertionError: on a detected external change, or when the path exists
            and ``exist_ok`` is false.
        :return: self
        """
        if (self._last_update is not None
                and overwrite is False) and not self._is_deleted:
            if self._last_update != self.updated_at:
                raise AssertionError(
                    f"File change detected (last change {self.updated_at}), if you want to overwrite "
                    f"set overwrite=True")
        # NOTE(review): the existence check runs against the current path, before
        # ``on_new_path`` is applied - confirm this is intended.
        assert exist_ok or not self.path.exists, FileExistsError(
            "File exists. If you want override, please send 'exist_ok=True'")
        if on_new_path is not None:
            self.set_path(on_new_path)
        self._save(encoding=encoding, **kwargs)
        return self

    def replace_file_sep(self, new, save: bool = True):
        """
        Switch the line separator and optionally rewrite the file.

        :param new: separator, in any form ``parse_new_line_sep`` understands.
        :param save: when True the file is written immediately with ``_save``.
        :raises ValueError: if ``new`` cannot be parsed into a separator.
        :return: self
        """
        parsed = self.parse_new_line_sep(new)
        if parsed is None:
            # report the caller's value; it used to be overwritten by the parse result,
            # so the message always read "None is'nt valid."
            raise ValueError(f"{new} is'nt valid.")
        try:
            self.set_new_line_sep(parsed)
            if save is True:
                # ``_save`` forwards its keyword arguments to ``open``, which rejects
                # ``exist_ok``; ``_save`` itself performs no existence check
                self._save()
        except UnicodeDecodeError:
            logger.error(f'Not possibility convert {self.file_name}')
        return self
Пример #13
0
 def set_path(self, path_):
     """Point this object at a new location, wrapping ``path_`` in ``Path``."""
     new_location = Path(path_)
     self.__path = new_location