Exemplo n.º 1
0
# Stripped lines starting with any of these prefixes are blanked out
# (annotations, comments, imports, package declarations, Javadoc bodies).
_JAVA_DROPPED_PREFIXES = ('@', '/', 'import', 'package', '*')

# (pattern, replacement) pairs applied, in order, to every line.
_JAVA_MASKS = (
    (r'(class) (\S+)', r'\1 ^C^'),
    (r"if.?\(([a-zA-Z\.\(\)\!0-9]+).[\)|=|<|>|!|&|\|]?", 'if (^E^ '),
    (r"(static final int) ([A-Z0-9_]+)", r"\1 ^P^"),
    (r"(static final String) ([A-Z0-9_]+)", r"\1 ^P^"),
)


def _mask_java_line(line):
    """Return *line* blanked if it has a dropped prefix, else with masks applied."""
    if line.strip().startswith(_JAVA_DROPPED_PREFIXES):
        # None of the mask patterns can match an empty string, so a dropped
        # line needs no further processing.
        return ''
    for pattern, replacement in _JAVA_MASKS:
        line = re.sub(pattern, replacement, line)
    return line


def get_data():
    """Rewrite every ``data-java/**/*.txt`` file into ``data-java-post``.

    The output directory is removed (if present) and recreated. Each input
    file, as returned by ``just.multi_read``, is written to ``j<index>.txt``
    with:

    * lines whose stripped text starts with ``@``, ``/``, ``import``,
      ``package`` or ``*`` blanked out;
    * the token after ``class`` replaced by ``^C^``;
    * the matched start of an ``if (...)`` condition replaced by
      ``if (^E^ ``;
    * ``static final int`` / ``static final String`` constant names replaced
      by ``^P^``;
    * whitespace-only gaps between newlines collapsed to one newline.

    Returns:
        None. The function only writes files.
    """
    post_data_dir = "data-java-post"
    # Always start from an empty output directory.
    if os.path.exists(post_data_dir):
        shutil.rmtree(post_data_dir)
    os.mkdir(post_data_dir)

    sources = just.multi_read("data-java/**/*.txt").values()
    for i, file in enumerate(sources):
        new_file = os.path.join(post_data_dir, "j%s.txt" % i)
        # Every processed line is terminated with a newline, as before.
        processed = ''.join(
            _mask_java_line(line) + '\n' for line in file.split('\n'))
        with open(new_file, 'w') as post:
            post.write(re.sub(r'\n\s*\n', '\n', processed))
Exemplo n.º 2
0
    def load(cls, nrows=None):
        """Build an instance from the Fitbit sleep JSON exports.

        Every record found under the sleep glob contributes its
        ``levels.data`` entries plus one closing row stamped with the
        record's ``endTime``. Consecutive timestamps become the interval
        index of the frame; the bounds are also stored in the ``start`` and
        ``end`` columns. The per-record frames are concatenated and the
        ``dateTime`` column is dropped before being passed to ``cls``.

        Args:
            nrows: When not ``None``, the remaining records of the current
                file are skipped as soon as one record's frame has more than
                ``nrows`` rows.
                NOTE(review): this only leaves the inner loop and looks at a
                single frame's size, not the running total — confirm intent.

        Returns:
            ``cls`` called with the concatenated frame.
        """
        file_glob = "~/nostalgia_data/input/fitbit/*/sleep/*.json"
        timestamp_format = "%Y-%m-%dT%H:%M:%S.%f"
        frames = []
        for sleep_logs in just.multi_read(file_glob).values():
            if not sleep_logs:
                continue
            for log in sleep_logs:
                # Closing row: supplies the right bound of the last interval.
                closing_row = {
                    'dateTime': log['endTime'],
                    'level': None,
                    'seconds': None
                }
                levels = pd.DataFrame(log["levels"]["data"] + [closing_row])
                levels["dateTime"] = [
                    datetime_from_format(stamp, timestamp_format)
                    for stamp in levels.dateTime
                ]
                intervals = pd.IntervalIndex.from_arrays(
                    levels.dateTime.iloc[:-1], levels.dateTime.iloc[1:])
                # The closing row only served as an interval bound; drop it.
                levels = pd.DataFrame(levels.iloc[:-1]).set_index(intervals)
                levels["start"] = levels.index.left
                levels["end"] = levels.index.right
                frames.append(levels)
                if nrows is not None and levels.shape[0] > nrows:
                    break

        combined = pd.concat(frames).drop("dateTime", axis=1)
        return cls(combined)
Exemplo n.º 3
0
def test_multi_read():
    """Check that ``just.multi_read`` pairs each written file with its content.

    Two files are written with ``just.multi_write``, read back through the
    ``*.txt`` glob, and removed again even if an assertion fails.
    """
    obj = ["a", "b"]
    fnames = ["a.txt", "b.txt"]
    just.multi_write(obj, fnames)
    try:
        result = just.multi_read("*.txt")
        # Iterating a mapping yields only its keys, so (name, data) pairs
        # must come from .items(); an iterable of pairs is used as is.
        pairs = result.items() if hasattr(result, "items") else result
        seen = set()
        for name, data in pairs:
            base = name.split("/")[-1]
            if base not in fnames:
                # The glob may also match unrelated *.txt files in the
                # working directory; those are not part of this test.
                continue
            assert fnames.index(base) == obj.index(data)
            seen.add(base)
        # Guard against passing vacuously when nothing was read back.
        assert seen == set(fnames)
    finally:
        for fname in fnames:
            os.remove(fname)
Exemplo n.º 4
0
def test_multi_read():
    """Verify ``just.multi_read`` maps each globbed path to the written text.

    Writes two files, looks up each one's full path among the ``just.glob``
    results and checks the content stored under that path. The files are
    removed afterwards whether or not the assertions hold.
    """
    obj = ["a", "b"]
    fnames = ["a.txt", "b.txt"]
    just.multi_write(obj, fnames)
    try:
        globbed_paths = just.glob("*.txt")
        content_by_path = just.multi_read("*.txt")
        expected_by_name = dict(zip(fnames, obj))
        for short_name, expected in expected_by_name.items():
            # First globbed path that ends with the short file name.
            candidates = [
                path for path in globbed_paths if path.endswith(short_name)
            ]
            assert content_by_path[candidates[0]] == expected
    finally:
        for written in fnames:
            os.remove(written)
Exemplo n.º 5
0
 def load(cls, nrows=None):
     """Build an instance from the Spotify streaming-history exports.

     Each play read from the ``StreamingHistory*.json`` files becomes one
     row: ``time_end`` is the parsed ``endTime``, ``time_start`` is
     ``time_end`` minus ``msPlayed`` milliseconds, ``title`` and ``artist``
     come from ``trackName`` / ``artistName``, and ``seconds`` is
     ``msPlayed / 1000``.

     Args:
         nrows: Accepted but not used by this loader.
             NOTE(review): confirm whether it should limit the rows read.

     Returns:
         ``cls`` called with the resulting DataFrame.
     """
     files = "~/nostalgia_data/input/spotify/StreamingHistory*.json"
     end_format = "%Y-%m-%d %H:%M"
     rows = []
     for play in flatten(just.multi_read(files).values()):
         # Parse the end timestamp once; the start time is derived from it.
         time_end = datetime_from_format(play["endTime"], end_format)
         ms_played = play["msPlayed"]
         rows.append((
             time_end - timedelta(milliseconds=ms_played),
             time_end,
             play["trackName"],
             play["artistName"],
             ms_played / 1000,
         ))
     spotify = pd.DataFrame(
         rows,
         columns=["time_start", "time_end", "title", "artist", "seconds"],
     )
     return cls(spotify)
Exemplo n.º 6
0
def get_data():
    """Return the contents of all files matching ``data/**/*.py`` as a list."""
    contents_by_path = just.multi_read("data/**/*.py")
    return list(contents_by_path.values())
Exemplo n.º 7
0
from multiprocessing import Pool

import just
from auto_extract import parse_article

data = just.multi_read("~/.nostalgia_chrome/html/*.json")
file_names, file_contents = data.keys(), data.values()


def extract_and_save(args):
    """Parse one saved page and write the result to the metadata directory.

    Args:
        args: A ``(file_name, file_content)`` pair. ``file_content`` must
            provide the ``"url"`` and ``"html"`` keys; the output file keeps
            the last ``/``-separated component of ``file_name``.

    Returns:
        None. The parsed article is written with ``just.write``.
    """
    file_name, file_content = args
    url = file_content["url"]
    html = file_content["html"]
    parsed = parse_article(html, url)
    just.write(parsed,
               "~/.nostalgia_chrome/metadata/" + file_name.split("/")[-1])


zz = [extract_and_save(x) for x in zip(file_names, file_contents)]

# The pool is created only after extract_and_save is defined: forked workers
# are copies of the process as it was when Pool() was called, so a function
# defined later could not be resolved by them when pool.map dispatches it.
pool = Pool(4)

# NOTE(review): the sequential pass above and this parallel pass process the
# same files, so every page is parsed and written twice — confirm that both
# are intended.
z = pool.map(extract_and_save, zip(file_names, file_contents))


def recurser(obj, contain_str, container, parent=None):
    if isinstance(obj, dict):
        for k, v in obj.items():
            if contain_str in k:
Exemplo n.º 8
0
def get_data():
    """Return the contents of all ``data-java-post/**/*.txt`` files as a list."""
    contents_by_path = just.multi_read("data-java-post/**/*.txt")
    return list(contents_by_path.values())