This repository has been archived by the owner on Oct 3, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
models.py
392 lines (301 loc) · 11.6 KB
/
models.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
"""This file contains the schema for the abandoned elaborated-repo sqlite
database. It is still used when choosing a new random sample."""
from base64 import b64encode, b64decode
import cPickle as pickle
import datetime
import json
import os
import msgpack
from peewee import SqliteDatabase, Model
from peewee import BooleanField, DateTimeField, IntegerField, TextField
from recordtype import recordtype
from config import config
from features import all_features as real_features
from features import _support_features
import utils
# User-visible features merged with internal support features into one
# name -> callable dict (Python 2 dict-merge idiom: dict(a.items() + b.items())).
all_features = dict(real_features.items() + _support_features.items())
# NOTE(review): threadlocals=True presumably gives each thread its own
# connection state (a peewee 2.x option) -- confirm against the pinned version.
erepo_db = SqliteDatabase('erepo.db', threadlocals=True)
erepo_db.connect()
class _Serializable(object):
    """Mixin defining the pack/unpack protocol used for serialization.

    The defaults round-trip an instance through the dict produced by
    recordtype's _asdict; subclasses override _pack/_unpack when they
    need a different wire format.
    """
    __slots__ = ()

    @classmethod
    def _pack(cls, obj):
        """Reduce obj to a plain dict for serialization."""
        return obj._asdict()

    @classmethod
    def _unpack(cls, data):
        """Inverse of _pack: rebuild an instance from a field dict."""
        return cls(**data)
class _MsgpackMeta(type):
    """Set on a class (via __metaclass__) to enable serialization with msgpack.

    _Serializable becomes a base, so classes can override _un/pack.
    Every class created through this metaclass is recorded in a registry
    so load()/dump() can round-trip instances of any registered class.
    Note that msgpack encodes Unicodes to utf8."""
    def __new__(cls, name, bases, dct):
        #Insert our methods.
        # load/dump become attributes of the created class, so callers can
        # write e.g. Repo.load() / Repo.dump().
        dct['load'] = cls.load
        dct['dump'] = cls.dump
        bases = bases + (_Serializable,)
        c = super(_MsgpackMeta, cls).__new__(cls, name, bases, dct)
        #Subclasses get registered so we know how to pack/unpack them.
        # The registry lives on the metaclass itself; _names maps
        # class name -> class for _loader's '__cls__' tag lookup.
        classes = getattr(_MsgpackMeta, '_reg_classes', set())
        classes.add(c)
        _MsgpackMeta._names = {"%s" % reg.__name__: reg for reg in classes}
        _MsgpackMeta._reg_classes = classes
        return c
    #load/dump are the user interface - they can handle all registered classes
    @classmethod
    def load(cls, filepath=None):
        """Load the contents of the given filepath.
        If None, assume '<current_snapshot>/repos.msgpack'"""
        if filepath is None:
            filepath = os.path.join(config['current_snapshot'], 'repos.msgpack')
        with open(filepath, 'rb') as f:
            # use_list=False keeps msgpack arrays as tuples rather than lists.
            records = msgpack.load(f, object_hook=cls._loader, use_list=False)
        return records
    @classmethod
    def dump(cls, records, filepath=None):
        """Dump in the same fashion as load."""
        if filepath is None:
            filepath = os.path.join(config['current_snapshot'], 'repos.msgpack')
        # NOTE(review): FaultTolerantFile presumably writes to a temp file and
        # swaps it in, protecting against partial writes -- see utils.
        with utils.FaultTolerantFile(filepath) as f:
            msgpack.dump(records, f, default=cls._dumper)
    #behind the scenes, _loader and _dumper do the work
    @classmethod
    def _loader(cls, obj):
        # Dicts tagged with '__cls__' are reconstituted via the registered
        # class's _unpack; untagged dicts pass through unchanged.
        reg_class = cls._names.get(obj.get('__cls__'))
        if reg_class:
            return reg_class._unpack(obj['data'])
        return obj
    @classmethod
    def _dumper(cls, obj):
        # Registered instances are wrapped in a tagged dict that _loader
        # knows how to undo; everything else is returned as-is.
        if obj.__class__ in cls._reg_classes:
            reg_cls = obj.__class__
            return {
                "__cls__": reg_cls.__name__,
                'data': reg_cls._pack(obj)
            }
        return obj
# Bare mutable record (year, month, day); wrapped by the YMD class below
# to gain msgpack serialization.
_YMD = recordtype(
    'YMD',
    'year month day'
)
class YMD(_YMD):
    """Same purpose as datetime.date, but small and serializable."""
    __metaclass__ = _MsgpackMeta

    @staticmethod
    def from_date(date):
        """Factory from anything exposing year/month/day attributes
        (datetime.date or datetime.datetime)."""
        y, m, d = date.year, date.month, date.day
        return YMD(y, m, d)
# Mutable record underlying Author; one slot per GitHub user field, all
# defaulting to None.
_Author = recordtype('Author', (
    # default to None
    # fields from http://developer.github.com/v3/users/#get-a-single-user
    [(key, None) for key in
     ['login', # "octocat "
      'id', # 1
      'avatar_url', # "https://github.com/images/error/octocat_happy.gif"
      'gravatar_id', # "somehexcode"
      'url', # "https://api.github.com/users/octocat"
      'name', # "monalisa octocat"
      'company', # "GitHub"
      'blog', # "https://github.com/blog"
      'location', # "San Francisco"
      'email', # "octocat@github.com"
      'hireable', # false
      'bio', # "There once was..."
      'public_repos', # 2
      'public_gists', # 1
      'followers', # 20
      'following', # 0
      'html_url', # "https://github.com/octocat"
      'created_at', # "2008-01-14T04:33:35Z"
      'type', # "User"
      ]
     ]
))
class Author(_Author):
    """Represents a GitHub user.

    Fields mirror the GitHub API v3 single-user payload (see _Author above);
    serializable via msgpack through the _MsgpackMeta registry.
    """
    __metaclass__ = _MsgpackMeta
# Mutable record underlying Repo.  Three field groups: our own bookkeeping
# fields, the GitHub API v3 repo fields, and one None-defaulted slot per
# feature (real and support) from the features module.
_Repo = recordtype('Repo', (
    ['name', # in 'user/repo' format
     'stars',
     'fetch_ymd', # YMD of data acquisition
     ] +
    # these are all GitHub apiv3 names:
    ['clone_url',
     'created_at',
     'description',
     'fork',
     'forks',
     'git_url',
     'has_downloads',
     'has_issues',
     'has_wiki',
     'homepage',
     'html_url',
     'id',
     'language',
     'master_branch',
     'open_issues',
     'private',
     'pushed_at',
     'size',
     'ssh_url',
     'svn_url',
     'updated_at',
     'url',
     ] +
    [(fname, None) for fname in all_features]
))
class Repo(_Repo):
    """A repo stores a snapshot of GitHub repo metadata retrieved from
    `http://developer.github.com/v3/repos/#get` on some date, and any features
    calculated on that repo's code/metadata.
    """
    __metaclass__ = _MsgpackMeta

    def __str__(self):
        # Python 2 __str__ must return bytes; repo names may be Unicode.
        return self.name.encode('utf-8')

    @classmethod
    def _pack(cls, obj):
        """Don't write out support features."""
        d = {k: v for (k, v) in obj._asdict().iteritems()
             if k not in _support_features}
        return d

    def _calc(self, feature_name, overwrite=False):
        """Perform one-time calculation of a feature.

        Returns the (possibly memoized) value and stores it on the record.
        If overwrite is True, a currently memoized value will be overwritten.
        Raises ValueError for an unknown feature name.
        """
        #even though __getattribute__ is cleaner, sometimes you do want the value
        #without calculating (eg when writing out)
        if feature_name not in all_features:
            raise ValueError("%s is not a valid feature name" % feature_name)
        val = getattr(self, feature_name)
        if val is None or overwrite:
            val = all_features[feature_name](self)
            setattr(self, feature_name, val)
        return val

    def _clear_support_features(self):
        """Set all support features to None.
        This can be used during calculation to limit memory use."""
        for k in _support_features:
            setattr(self, k, None)

    @property
    def username(self):
        """The GitHub user half of the 'user/repo' name."""
        return self.name.split('/')[0]

    @property
    def reponame(self):
        """The repository half of the 'user/repo' name."""
        return self.name.split('/')[1]

    @property
    def creation_date(self):
        """created_at's leading 'YYYY-MM-DD' as a datetime (at midnight)."""
        return datetime.datetime(*[int(x) for x in self.created_at[:10].split('-')])

    def calculate_features(self, features=None, overwrite=False):
        """Change to this repo's directory in the current snapshot,
        then calculate the given features.
        If features is None, calculate all features."""
        if features is None:
            features = all_features.keys()
        code_dir = os.path.join(
            os.path.abspath(os.path.dirname(__file__)),
            config['current_snapshot'], 'code', self.username, self.reponame
        )
        with utils.cd(code_dir):
            for f in features:
                self._calc(f, overwrite)

    @classmethod
    def load_sample(cls, sample_path=None, separate=False):
        """Load only repos in the given sample.
        If sample_path is None, assume '<current_snapshot>/<current_sample>'.
        If separate is true, return a dict mapping class name to repo lists."""
        repos = cls.load()
        # memoize for probable write_update
        cls._last_loaded = repos
        if sample_path is None:
            sample_path = os.path.join(config['current_snapshot'], config['current_sample'])
        with open(sample_path, 'rb') as f:
            separated_names = json.load(f)
        assert isinstance(separated_names, dict), 'did you try to load an old flat sample?'
        if separate:
            # Build each name set once up front; the previous code rebuilt
            # set(names) inside the comprehension condition for every repo,
            # which was accidentally quadratic.
            separated = {}
            for clsname, names in separated_names.items():
                nameset = set(names)
                separated[clsname] = [r for r in repos if r.name in nameset]
            return separated
        else:
            nameset = set()
            for names in separated_names.values():
                nameset.update(names)
            return [r for r in repos if r.name in nameset]

    @classmethod
    def write_update(cls, records, filepath=None):
        """Like dump, but updates repos with a duplicate name.

        NOTE(review): if load_sample memoized repos loaded from the default
        path, passing a different filepath here still merges against that
        memoized list -- confirm callers always use a single path.
        """
        loaded = getattr(cls, '_last_loaded', None)
        if loaded is None:
            loaded = cls.load(filepath)
        cur_repos = {r.name: r for r in loaded}
        new_repos = {r.name: r for r in records}
        # New records win on name collisions.
        cur_repos.update(new_repos)
        cls.dump(cur_repos.values(), filepath)
        cls._last_loaded = None

    @staticmethod
    def from_erepo(erepo):
        """Build a Repo from an ERepo row, renaming fields as needed
        (_user_repo -> name, watchers -> stars) and skipping the legacy
        bookkeeping columns."""
        grepo_to_erepo = {'name': '_user_repo',
                          'stars': 'watchers'}
        # _data is the peewee row dict; copy every remaining column 1:1.
        grepo_to_erepo.update({f: f for f in erepo.__dict__['_data']
                               if f not in ('_user_repo', '_stars', 'watchers',
                                            '_elaborated_at', '_elaborated',
                                            '_error', '_features', '_flagged')})
        kwargs = {g_f: getattr(erepo, e_f) for (g_f, e_f) in grepo_to_erepo.items()}
        kwargs['fetch_ymd'] = YMD.from_date(erepo._elaborated_at)
        return Repo(**kwargs)
class ERepoModel(Model):
    """Base peewee model binding all subclasses to the erepo sqlite db."""
    class Meta:
        # peewee convention: Meta.database selects the backing connection.
        database = erepo_db
class ERepo(ERepoModel):
    """One row of the abandoned 'elaborated repo' sqlite table.

    Leading-underscore columns are our own bookkeeping (the prefix avoids
    name collisions with GitHub field names); the remaining columns mirror
    the GitHub API v3 repo payload and are nullable because they are only
    filled in at elaboration time.
    """
    #start initially populated fields
    #_form to avoid name collisions with github fields
    _user_repo = TextField(primary_key=True) # eg simon/awesome-repo.
    _stars = IntegerField(null=True) # no longer used as of 2013/03, use watchers instead
    _elaborated = BooleanField(default=False) # ie 'has been processed'
    _error = BooleanField(default=False) # eg download failed
    #base64 encoded python pickle of a dict
    _features = TextField(default='KGRwMAou') # default=b64(pickle.dumps({}))
    _flagged = BooleanField(default=False) # ie 'need to process'
    #end init populated fields
    #all others are set at elaboration-time
    _elaborated_at = DateTimeField(null=True)
    #start github api3 names
    clone_url = TextField(null=True)
    created_at = DateTimeField(null=True)
    description = TextField(null=True)
    fork = BooleanField(null=True)
    forks = IntegerField(null=True)
    git_url = TextField(null=True)
    has_downloads = BooleanField(null=True)
    has_issues = BooleanField(null=True)
    has_wiki = BooleanField(null=True)
    homepage = TextField(null=True)
    html_url = TextField(null=True)
    id = IntegerField(null=True)
    language = TextField(null=True)
    master_branch = TextField(null=True)
    open_issues = IntegerField(null=True)
    private = BooleanField(null=True)
    pushed_at = DateTimeField(null=True)
    size = IntegerField(null=True)
    ssh_url = TextField(null=True)
    svn_url = TextField(null=True)
    updated_at = DateTimeField(null=True)
    url = TextField(null=True)
    watchers = IntegerField(null=True) # added before 2013/03 elaboration
    #to handle feature encoding/decoding.
    def get_features(self):
        """Decode the _features column (base64-wrapped pickle) into a dict."""
        # NOTE(review): pickle.loads on stored DB contents -- safe only as
        # long as the sqlite file is trusted/local.
        return pickle.loads(b64decode(self._features))
    def set_features(self, val):
        """Pickle and base64-encode val into the _features column."""
        self._features = b64encode(pickle.dumps(val))
    def get_vis(self):
        """Return a pretty-printed string to visualize the data."""
        lines = []
        lines.append("%s" % self._user_repo)
        # NOTE(review): _data appears to be peewee's internal field-name ->
        # value dict for the row -- confirm against the pinned peewee version.
        for k in sorted(self._data.iterkeys()):
            lines.append(" %s: %s" % (k, self._data[k]))
        res = '\n'.join(lines)
        # Python 2: return bytes, since values may contain Unicode.
        return res.encode('utf-8')
    def __repr__(self):
        # Python 2 __repr__ must return bytes; repo names may be Unicode.
        quoted = "'%s'" % self._user_repo
        return quoted.encode('utf-8')