Example #1
def load_dataframe_per_json_file(cls, glob_pattern, key="", nrows=None):
    fnames = set(just.glob(glob_pattern))
    name = glob_pattern + "_" + normalize_name(cls.__name__)
    processed_files = get_processed_files(name)
    to_process = fnames.difference(processed_files)
    objects = []
    if nrows is not None:
        # sampling: if nothing is new, re-read the most recent processed files
        if not to_process:
            to_process = list(processed_files)[-nrows:]
        else:
            to_process = list(to_process)[-nrows:]
    if to_process:
        print("processing {} files".format(len(to_process)))
        for fname in to_process:
            data = read_array_of_dict_from_json(fname, key, nrows)
            data = cls.handle_dataframe_per_file(data, fname)
            if data is None:
                continue
            objects.append(data)
        data = pd.concat(objects)
        if processed_files and nrows is None:
            # merge newly processed files with the cached result
            data = pd.concat((data, load_df(name)))
        # sort by the first time-like column that exists
        for x in ["time", "start", "end"]:
            if x in data:
                data = data.sort_values(x)
                break
        if nrows is None:
            save_df(data, name)
            save_processed_files(fnames | processed_files, name)
    else:
        data = load_df(name)
    if nrows is not None:
        data = data.iloc[-nrows:]
    return data
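
A minimal usage sketch for the pattern above, assuming the method is a @classmethod on a loader base class and that the helpers it relies on (just.glob, load_df, save_df, get_processed_files) are importable; the Tweets subclass, its base class name, and the glob pattern are hypothetical:

import pandas as pd

class Tweets(JsonGlobLoader):  # hypothetical base class exposing the method above
    @classmethod
    def handle_dataframe_per_file(cls, data, fname):
        # hypothetical post-processing: keep only rows that have a timestamp
        return data.dropna(subset=["time"]) if "time" in data else data

# only files not seen on previous runs get parsed; older results come from cache
df = Tweets.load_dataframe_per_json_file("~/exports/tweets/*.json", key="tweets")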
Example #2
def load_image_texts(cls, glob_pattern_s, nrows=None):
    import pytesseract
    from PIL import Image

    if isinstance(glob_pattern_s, list):
        fnames = set()
        for glob_pattern in glob_pattern_s:
            fnames.update(set(just.glob(glob_pattern)))
        glob_pattern = "_".join(glob_pattern_s)
    else:
        glob_pattern = glob_pattern_s
        fnames = set(just.glob(glob_pattern))
    name = glob_pattern + "_" + normalize_name(cls.__name__)
    processed_files = get_processed_files(name)
    to_process = fnames.difference(processed_files)
    objects = []

    # OCR results are cached so each image only goes through tesseract once
    cache = get_cache("tesseract")

    if nrows is not None:
        if not to_process:
            return load_df(name).iloc[-nrows:]
        else:
            to_process = list(to_process)[-nrows:]
    if to_process:
        for fname in to_process:
            if fname in cache:
                text = cache[fname]
            else:
                try:
                    text = pytesseract.image_to_string(
                        Image.open(just.make_path(fname)))
                except OSError as e:
                    print("ERR", fname, e)
                    continue
                cache[fname] = text
            # the file's modification time serves as the row timestamp
            time = datetime_from_timestamp(os.path.getmtime(fname), "utc")
            data = {
                "text": text,
                "path": fname,
                "title": fname.split("/")[-1],
                "time": time
            }
            objects.append(data)
        data = pd.DataFrame(objects)
        if processed_files and nrows is None:
            data = pd.concat((data, load_df(name)))
        for x in ["time", "start", "end"]:
            if x in data:
                data = data.sort_values(x)
                break
        if nrows is None:
            save_df(data, name)
            save_processed_files(fnames | processed_files, name)
    else:
        data = load_df(name)
    if nrows is not None:
        data = data.iloc[-nrows:]
    return data
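
The heart of this example is memoized OCR. A standalone sketch of just that part, using a plain dict in place of get_cache (pytesseract and Pillow are the real dependencies; the image path is hypothetical):

import pytesseract
from PIL import Image

cache = {}  # stands in for the persistent get_cache("tesseract") above

def ocr_once(path):
    # return the cached text if this image was already OCRed
    if path not in cache:
        cache[path] = pytesseract.image_to_string(Image.open(path))
    return cache[path]

print(ocr_once("screenshots/receipt.png"))  # hypothetical path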
Example #3
def load_object_per_newline(cls, fname, nrows=None):
    """
    Iterates over a file containing one object per line (e.g. .jsonl or .txt).
    Only lines not seen earlier are handled; this is detected by storing the
    number of objects seen so far.
    You should implement `object_to_row(cls, row)` on your class, returning a dictionary.
    """
    data = []
    name = fname + "_" + normalize_name(cls.__name__)
    newline_count = get_newline_count(name)
    for i, x in enumerate(just.iread(fname)):
        if nrows is None:
            # skip lines that were already processed on a previous run
            if i < newline_count:
                continue
        row = cls.object_to_row(x)
        if row is None:
            continue
        data.append(row)
        # when sampling, stop once roughly nrows rows have been read
        if nrows is not None and i > nrows:
            break
    if data:
        data = pd.DataFrame(data)
        if newline_count and nrows is None:
            data = pd.concat((data, load_df(name)))
        if nrows is None:
            data = save_df(data, name)
            n = i + 1
            save_newline_count(n, name)
    else:
        data = load_df(name)
    if nrows is not None:
        data = data.iloc[-nrows:]
    return data
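
A sketch of the `object_to_row` hook this method expects, assuming just.iread yields one parsed dict per line for a .jsonl file; the Messages class, its base class, and the field names are hypothetical:

class Messages(LineLoader):  # hypothetical base class exposing the method above
    @classmethod
    def object_to_row(cls, obj):
        # returning None makes load_object_per_newline skip the line
        if "text" not in obj:
            return None
        return {"text": obj["text"], "time": obj.get("timestamp")}

# only lines past the stored newline count are parsed on repeat runs
df = Messages.load_object_per_newline("~/exports/messages.jsonl")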
Example #4
def load_data_file_modified_time(cls,
                                 fname,
                                 key_name="",
                                 nrows=None,
                                 from_cache=True,
                                 **kwargs):
    """
    Loads from cache if the file has not changed since the last run (and a cache exists).
    If it has changed, the file is reprocessed and saved to cache (including the modified time).
    Currently handles csv, mbox and json.
    key_name is only used for json.
    nrows enables quickly loading a sample.
    from_cache=False ignores the cache and reprocesses the file.

    Loading the csv, json or mbox file yields a DataFrame.
    IMPORTANT: assumes you implement `handle_dataframe_per_file`,
    the post-processing required after the file is loaded, e.g. converting
    times, dropping and adding columns.
    """
    name = fname + "_" + normalize_name(cls.__name__)
    modified_time = os.path.getmtime(os.path.expanduser(fname))
    last_modified = get_last_mod_time(name)
    if modified_time != last_modified or not from_cache:
        if fname.endswith(".csv"):
            # NB: error_bad_lines was deprecated in pandas 1.3 and removed
            # in 2.0; newer versions use on_bad_lines="skip"
            data = pd.read_csv(fname,
                               error_bad_lines=False,
                               nrows=nrows,
                               **kwargs)
        elif fname.endswith(".mbox"):
            import mailbox

            m = mailbox.mbox(fname)
            data = pd.DataFrame(
                [{l: x[l]
                  for l in ["from", "to", "date", "subject"]} for x in m])
        else:
            data = read_array_of_dict_from_json(fname, key_name, nrows,
                                                **kwargs)
        data = cls.handle_dataframe_per_file(data, fname)
        if nrows is None:
            save_df(data, name)
            save_last_mod_time(modified_time, name)
    else:
        data = load_df(name, nrows)
    if nrows is not None:
        data = data.iloc[-nrows:]
    return data
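
A usage sketch, again assuming the method is a @classmethod on a loader base class; the Payments subclass, its base class, and the file path are hypothetical:

import pandas as pd

class Payments(FileLoader):  # hypothetical base class exposing the method above
    @classmethod
    def handle_dataframe_per_file(cls, data, fname):
        # hypothetical post-processing: parse the date column
        data["date"] = pd.to_datetime(data["date"])
        return data

# the first call parses the csv; later calls hit the cache until the file's
# modification time changes on disk
df = Payments.load_data_file_modified_time("~/exports/payments.csv")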
Example #5
def load_json_file_modified_time(cls,
                                 fname,
                                 nrows=None,
                                 from_cache=True,
                                 **kwargs):
    name = fname + "_" + normalize_name(cls.__name__)
    modified_time = os.path.getmtime(os.path.expanduser(fname))
    last_modified = get_last_mod_time(name)
    # reprocess when the file changed on disk or the cache is explicitly bypassed
    if modified_time != last_modified or not from_cache:
        data = just.read(fname)
        data = cls.handle_json(data, **kwargs)
        data = pd.DataFrame(data)
        if nrows is None:
            save_df(data, name)
            save_last_mod_time(modified_time, name)
    else:
        data = load_df(name)
    if nrows is not None:
        data = data.iloc[-nrows:]
    return data
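
Unlike Example #4, this variant delegates all parsing to a `handle_json` hook. A sketch of such a hook, assuming a hypothetical JSON layout with a top-level "locations" list; the Locations class and its base class are also hypothetical:

class Locations(JsonLoader):  # hypothetical base class exposing the method above
    @classmethod
    def handle_json(cls, data, **kwargs):
        # data is whatever just.read returned for the file; the rows
        # returned here become the DataFrame
        return [{"lat": x["lat"], "lon": x["lon"], "time": x["ts"]}
                for x in data.get("locations", [])]

df = Locations.load_json_file_modified_time("~/exports/history.json")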
Example #6
def save_df(cls, df, name=None):
    # falls back to the class's normalized name when no explicit name is given
    return save_df(df, name or cls.get_normalized_name())
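
When no name is passed, the cache entry is keyed by the class's normalized name; e.g. (hypothetical class from Example #1):

Tweets.save_df(df)               # stored under the name derived from "Tweets"
Tweets.save_df(df, "tweets_v2")  # stored under an explicit name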