-
Notifications
You must be signed in to change notification settings - Fork 1
/
util.py
153 lines (124 loc) · 4.02 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import codecs
import ujson as json
import math
import gensim
import collections
import functools
import pandas as pd
from datetime import datetime, timedelta
from collections import defaultdict
def load_items_by_line(path):
    """Return the distinct lines of a UTF-8 text file as a set.

    Every line is stripped of surrounding whitespace before it is added,
    so duplicate lines collapse into one entry and the order of the lines
    in the file is not preserved.

    path -- location of the text file to read.
    """
    with codecs.open(path, 'r', 'utf8') as f:
        # Set comprehension: avoids building an intermediate list first.
        return {line.strip() for line in f}
def load_json_by_line(path):
    """Decode each distinct, non-empty line of a file as a JSON document.

    The lines come from ``load_items_by_line``, so they are stripped,
    de-duplicated and returned in no particular order.  Empty lines (for
    example a blank line at the end of the file) are skipped instead of
    being handed to the JSON decoder, which would reject them.

    path -- location of the line-delimited JSON file.

    Returns a list of decoded objects (a real list on both Python 2 and 3).
    """
    return [json.loads(line) for line in load_items_by_line(path) if line]
def load_id2obj_dict(path, id_key):
    """Build a mapping from record id to record dict out of a table file.

    The file is first read as JSON; when that fails with ``ValueError`` or
    ``IOError`` it is read as a pickle instead.  Each row of the resulting
    frame is stored under the value it holds in column ``id_key``; a later
    row with the same id replaces an earlier one.

    The result is a ``defaultdict``: looking up an id that is not present
    yields (and stores) a placeholder record with 'unknown' id and name and
    empty subject and body.
    """
    def _placeholder():
        return {'id': 'unknown', 'name': 'unknown',
                'subject': '', 'body': ''}

    try:
        frame = pd.read_json(path)
    except (ValueError, IOError):
        frame = pd.read_pickle(path)

    id2obj = defaultdict(_placeholder)
    id2obj.update(
        (row[id_key], row.to_dict()) for _, row in frame.iterrows()
    )
    return id2obj
try:  # Python 2: text and long-integer types are separate builtins
    _TEXT_TYPES = (basestring,)
    _LONG_TYPES = (long,)
except NameError:  # Python 3: ``str`` is the text type, ``long`` is gone
    _TEXT_TYPES = (str,)
    _LONG_TYPES = ()

# Accepted textual layouts, tried in this order.
_DATETIME_PATTERNS = ('%Y-%m-%d %X.%f', '%Y-%m-%d %X')


def get_datetime(obj):
    """Coerce ``obj`` into a ``datetime``.

    Accepted inputs:

    * a ``datetime`` -- returned unchanged;
    * a non-NaN ``float`` or ``int`` -- passed to
      ``datetime.fromtimestamp`` as is;
    * a Python 2 ``long`` -- divided by 1000 before conversion
      (presumably such values are milliseconds; confirm with callers);
    * a string in ``'%Y-%m-%d %X.%f'`` or ``'%Y-%m-%d %X'`` layout.

    Raises ``ValueError`` for a string that matches neither layout and
    ``TypeError`` for any other input (including a NaN float).
    """
    if isinstance(obj, datetime):
        return obj
    if isinstance(obj, (float, int)) and not math.isnan(obj):
        return datetime.fromtimestamp(obj)
    if isinstance(obj, _LONG_TYPES):
        # Never true on Python 3, where _LONG_TYPES is empty.
        return datetime.fromtimestamp(obj / 1000)
    if isinstance(obj, _TEXT_TYPES):
        for pattern in _DATETIME_PATTERNS:
            try:
                # Stop at the first layout that parses.
                return datetime.strptime(obj, pattern)
            except ValueError:
                continue
        # Report the offending value as well as the accepted layouts.
        raise ValueError(
            'Bad datetime format for {!r}, expected one of {}'.format(
                obj, list(_DATETIME_PATTERNS)
            )
        )
    raise TypeError('Unacceptable type {}, {}'.format(type(obj), obj))
def compose(*functions):
    """Chain callables left to right into one single-argument function.

    ``compose(f, g)(x)`` evaluates ``g(f(x))``.  With no functions given,
    the returned callable hands its argument back unchanged.
    """
    def inner(arg):
        # Fold the argument through each function in the order given.
        return functools.reduce(
            lambda value, step: step(value), functions, arg
        )
    return inner
def json_dump(obj, path):
    """Serialize ``obj`` to JSON and write it to ``path`` as UTF-8.

    The object is serialized before the file is opened: opening in 'w'
    mode truncates the target, so serializing first means an object that
    cannot be encoded leaves an existing file untouched.

    obj  -- value to serialize with ``json.dumps``.
    path -- destination file; created or overwritten.
    """
    payload = json.dumps(obj)
    with codecs.open(path, 'w', 'utf8') as f:
        f.write(payload)
def json_load(path):
    """Read the UTF-8 file at ``path`` and return its decoded JSON value."""
    with codecs.open(path, 'r', 'utf8') as f:
        text = f.read()
    return json.loads(text)
def load_summary_related_data(interactions_path, people_path,
                              corpus_dict_path, lda_model_path):
    """Load the four inputs and return them as one tuple.

    interactions_path -- JSON file; if it does not decode as JSON
                         (``ValueError``) it is read with
                         ``pd.read_pickle`` instead.
    people_path       -- JSON file; on ``ValueError`` it is read as a
                         pickled frame and converted to a list of
                         per-row dicts.
    corpus_dict_path  -- saved gensim ``Dictionary``.
    lda_model_path    -- saved gensim ``LdaMallet`` model.

    Returns ``(interactions, people_info, dictionary, lda)``.

    NOTE(security): the pickle fallbacks can execute arbitrary code while
    loading; only point this function at trusted files.
    """
    try:
        # ``with`` closes the handle even when decoding fails.
        with open(interactions_path) as f:
            interactions = json.load(f)
    except ValueError:
        interactions = pd.read_pickle(interactions_path)

    try:
        with open(people_path) as f:
            people_info = json.load(f)
    except ValueError:
        people_info = pd.read_pickle(people_path).to_dict(orient='records')

    dictionary = gensim.corpora.dictionary.Dictionary.load(
        corpus_dict_path
    )
    # Loaded through the LdaMallet wrapper class; the plain
    # gensim.models.ldamodel.LdaModel loader once used here is disabled.
    lda = gensim.models.wrappers.LdaMallet.load(lda_model_path)
    return interactions, people_info, dictionary, lda
class memoized(object):
    """
    Decorator. Caches a function's return value each time it is called.
    If called later with the same arguments, the cached value is returned
    (not reevaluated).

    Only positional arguments are supported.  Calls whose arguments cannot
    be hashed (a list, for instance) are passed straight through to the
    wrapped function without caching.  The cache is unbounded and, for
    decorated methods, shared by all instances (the instance is part of
    the cache key).
    """

    def __init__(self, func):
        self.func = func
        self.cache = {}

    def __call__(self, *args):
        # ``args`` is always a tuple, and a tuple always passes an
        # ``isinstance(..., Hashable)`` check even when it contains
        # unhashable items, so probe hashability by actually hashing it.
        try:
            hash(args)
        except TypeError:
            # uncacheable. a list, for instance.
            # better to not cache than blow up.
            return self.func(*args)
        if args in self.cache:
            return self.cache[args]
        value = self.func(*args)
        self.cache[args] = value
        return value

    def __repr__(self):
        """Return the function's docstring.

        Falls back to the wrapped function's own repr when it has no
        docstring, because ``__repr__`` must return a string.
        """
        doc = self.func.__doc__
        return doc if doc is not None else repr(self.func)

    def __get__(self, obj, objtype=None):
        """Support instance methods.
        """
        if obj is None:
            # Accessed on the class: nothing to bind, expose the decorator.
            return self
        return functools.partial(self.__call__, obj)
def format_timestamp(s, format='%Y-%m-%d'):
    """Render the timestamp ``s`` as text.

    ``s`` is converted with ``datetime.fromtimestamp`` and the result is
    formatted with the ``strftime`` pattern ``format``.
    """
    moment = datetime.fromtimestamp(s)
    return moment.strftime(format)
def smart_read_df(path):
    """Load a DataFrame, choosing the reader from the file name.

    Paths ending in '.json' go through ``pd.read_json``; every other path
    is read with ``pd.read_pickle``.
    """
    reader = pd.read_json if path.endswith('.json') else pd.read_pickle
    return reader(path)
def parse_time_delta(s):
    """Turn a ``'<number>-<unit>'`` string into a ``timedelta``.

    ``unit`` is used as a ``timedelta`` keyword argument, so it must be
    one of the names ``timedelta`` accepts ('days', 'hours', 'weeks', ...).
    The string is split on its last '-' only, which lets the number carry
    a leading minus sign: ``'-2-weeks'`` is minus two weeks.

    Raises ``ValueError`` when there is no '-' separator or the number is
    not an integer, and ``TypeError`` when the unit is not a ``timedelta``
    keyword.
    """
    number, unit = s.rsplit('-', 1)
    return timedelta(**{unit: int(number)})