import os
import re
import shutil

import just


def get_data():
    post_data_dir = "data-java-post"
    # Start from a clean output directory.
    if os.path.exists(post_data_dir):
        shutil.rmtree(post_data_dir)
    os.mkdir(post_data_dir)
    for i, file in enumerate(just.multi_read("data-java/**/*.txt").values()):
        new_file = os.path.join(post_data_dir, "j%s.txt" % i)
        new_line = ''
        with open(new_file, 'w') as post:
            for line in file.split('\n'):
                # Blank out annotations, comments, imports, and package lines.
                if line.strip().startswith(('@', '/', 'import', 'package', '*')):
                    line = ''
                # Replace class names, if-conditions, and constant names
                # with placeholder tokens (^C^, ^E^, ^P^).
                line = re.sub(r'(class) (\S+)', r'\1 ^C^', line)
                line = re.sub(r"if.?\(([a-zA-Z\.\(\)\!0-9]+).[\)|=|<|>|!|&|\|]?", 'if (^E^ ', line)
                line = re.sub(r"(static final int) ([A-Z0-9_]+)", r"\1 ^P^", line)
                line = re.sub(r"(static final String) ([A-Z0-9_]+)", r"\1 ^P^", line)
                new_line += line + '\n'
            # Collapse runs of blank lines before writing.
            post.write(re.sub(r'\n\s*\n', '\n', new_line))
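# A quick sanity check of the substitutions above, on made-up Java lines
# (the inputs here are illustrative, not taken from the data set):
import re

print(re.sub(r'(class) (\S+)', r'\1 ^C^', 'public class Foo {'))
# -> public class ^C^ {
print(re.sub(r"(static final int) ([A-Z0-9_]+)", r"\1 ^P^", 'static final int MAX_SIZE = 10;'))
# -> static final int ^P^ = 10;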
import pandas as pd
import just
# datetime_from_format comes from the surrounding codebase; see the stand-in below.


@classmethod
def load(cls, nrows=None):
    file_glob = "~/nostalgia_data/input/fitbit/*/sleep/*.json"
    objects = []
    for d in just.multi_read(file_glob).values():
        if not d:
            continue
        for x in d:
            # Append a sentinel row at endTime so the last level gets an end bound.
            data = pd.DataFrame(
                x["levels"]["data"]
                + [{'dateTime': x['endTime'], 'level': None, 'seconds': None}]
            )
            data["dateTime"] = [
                datetime_from_format(t, "%Y-%m-%dT%H:%M:%S.%f") for t in data.dateTime
            ]
            # Consecutive timestamps become (start, end) intervals.
            start = data.dateTime.iloc[:-1]
            end = data.dateTime.iloc[1:]
            interval_index = pd.IntervalIndex.from_arrays(start, end)
            data = pd.DataFrame(data.iloc[:-1])
            data = data.set_index(interval_index)
            data["start"] = data.index.left
            data["end"] = data.index.right
            objects.append(data)
            if nrows is not None and data.shape[0] > nrows:
                break
    data = pd.concat(objects).drop("dateTime", axis=1)
    return cls(data)
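# The loader above assumes a datetime_from_format helper from the surrounding
# codebase. A minimal stand-in (name and exact behavior are assumptions; the
# real helper may also attach timezone information):
from datetime import datetime


def datetime_from_format(s, fmt):
    return datetime.strptime(s, fmt)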
import os

import just


def test_multi_read():
    obj = ["a", "b"]
    fnames = ["a.txt", "b.txt"]
    just.multi_write(obj, fnames)
    try:
        # multi_read returns a dict, so iterate over its items.
        for name, data in just.multi_read("*.txt").items():
            assert fnames.index(name.split("/")[-1]) == obj.index(data)
    finally:
        for fname in fnames:
            os.remove(fname)
import os

import just


def test_multi_read():
    obj = ["a", "b"]
    fnames = ["a.txt", "b.txt"]
    just.multi_write(obj, fnames)
    try:
        full_names = just.glob("*.txt")
        multi_content = just.multi_read("*.txt")
        for o, f in zip(obj, fnames):
            # Match each written file by suffix against the globbed full paths.
            full_name = [x for x in full_names if x.endswith(f)][0]
            assert multi_content[full_name] == o
    finally:
        for fname in fnames:
            os.remove(fname)
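# For reference, a sketch of what the tests above rely on: multi_read returns
# a dict mapping each matched path to its parsed contents (the exact key form,
# e.g. './a.txt', depends on how the glob expands):
import just

just.multi_write(["a", "b"], ["a.txt", "b.txt"])
for path, value in just.multi_read("*.txt").items():
    print(path, value)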
from datetime import timedelta

import pandas as pd
import just
# datetime_from_format and flatten come from the surrounding codebase;
# a minimal flatten stand-in is sketched below.


@classmethod
def load(cls, nrows=None):
    files = "~/nostalgia_data/input/spotify/StreamingHistory*.json"
    spotify = pd.DataFrame(
        [
            (
                # The start time is the end time minus the played duration.
                datetime_from_format(x["endTime"], "%Y-%m-%d %H:%M")
                - timedelta(milliseconds=x["msPlayed"]),
                datetime_from_format(x["endTime"], "%Y-%m-%d %H:%M"),
                x["trackName"],
                x["artistName"],
                x["msPlayed"] / 1000,
            )
            for x in flatten(just.multi_read(files).values())
        ],
        columns=["time_start", "time_end", "title", "artist", "seconds"],
    )
    return cls(spotify)
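# A minimal stand-in for the flatten helper assumed above: each
# StreamingHistory*.json file parses to a list of play records, and this
# chains those lists into one iterable (name and signature are assumptions):
from itertools import chain


def flatten(list_of_lists):
    return chain.from_iterable(list_of_lists)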
import just


def get_data():
    return list(just.multi_read("data/**/*.py").values())
from multiprocessing import Pool

import just
from auto_extract import parse_article

pool = Pool(4)

# extracts = pool.map(extruct.extract, file_contents)

data = just.multi_read("~/.nostalgia_chrome/html/*.json")
file_names, file_contents = data.keys(), data.values()


def extract_and_save(args):
    file_name, file_content = args
    url = file_content["url"]
    html = file_content["html"]
    parsed = parse_article(html, url)
    just.write(parsed, "~/.nostalgia_chrome/metadata/" + file_name.split("/")[-1])


# Serial and parallel variants of the same extraction pass.
zz = [extract_and_save(x) for x in zip(file_names, file_contents)]
z = pool.map(extract_and_save, zip(file_names, file_contents))


def recurser(obj, contain_str, container, parent=None):
    # NOTE: everything past the first `if contain_str in k` is an assumed
    # completion: collect values whose key contains contain_str, recursing
    # into nested dicts and lists.
    if isinstance(obj, dict):
        for k, v in obj.items():
            if contain_str in k:
                container.append(v)
            recurser(v, contain_str, container, parent=k)
    elif isinstance(obj, list):
        for item in obj:
            recurser(item, contain_str, container, parent=parent)
    return container
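# Hypothetical usage of recurser, assuming the completion above: collect every
# value stored under a key containing "url" anywhere in a nested metadata dict.
metadata = {"page": {"url": "https://example.com", "links": [{"image_url": "x.png"}]}}
urls = []
recurser(metadata, "url", urls)
print(urls)  # ['https://example.com', 'x.png']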
import just


def get_data():
    return list(just.multi_read("data-java-post/**/*.txt").values())