def load_repository_descriptions(path, summary=True, show_fork=False):
    """Map repository ids to their description text.

    Reads ``RepositorySummary.jsonl`` (when ``summary`` is true) or
    ``Repository.jsonl`` from the directory ``path``.

    :param path: directory that contains the data file.
    :param summary: choose the summary file instead of the full one.
    :param show_fork: when false, records whose ``fork`` field is truthy
        are left out of the result.
    :return: dict of record ``id`` -> ``description``; a missing or falsy
        description is stored as ``''``.
    """
    if summary:
        filename = 'RepositorySummary.jsonl'
    else:
        filename = 'Repository.jsonl'
    descriptions = {}
    with JsonLineData(os.path.join(path, filename)) as records:
        for record in records:
            # The ``fork`` field is only consulted when forks are hidden.
            if show_fork or not record.get('fork'):
                text = record.get('description') or ''
                descriptions[record.get('id')] = text
        return descriptions
def load_language_graph(path):
    """Build the repository/language graph from ``Languages.jsonl``.

    Every record contributes one edge per entry of its ``languages``
    mapping, joining ``Repository(record['repo']['id'])`` to
    ``Language(name)``. The mapped value is stored as the edge ``weight``.

    :param path: directory that contains ``Languages.jsonl``.
    :return: an undirected ``nx.Graph``.
    :raises KeyError: if a record lacks ``repo``, ``repo.id`` or
        ``languages``.
    """
    path = os.path.join(path, 'Languages.jsonl')
    graph = nx.Graph()
    with JsonLineData(path) as data:
        for item in data:
            repo = Repository(item['repo']['id'])
            # ``items()`` is available on both Python 2 and Python 3 dicts;
            # ``iteritems()`` raises AttributeError on Python 3.
            for name, amount in item['languages'].items():
                graph.add_edge(repo, Language(name), weight=amount)
    return graph
def _load_datasets(path, *filenames):
    """Merge the records of several data files by their ``id`` field.

    Files are read in the order given. Records that share an ``id`` are
    folded into one dict with ``dict.update``, so a later record overrides
    the fields of an earlier one. Records without an ``id`` (or with
    ``id`` set to ``None``) are ignored.

    :param path: directory that contains the data files.
    :param filenames: names of the files to read, relative to ``path``.
    :return: ``defaultdict(dict)`` mapping ``id`` -> merged record.
    """
    merged = defaultdict(dict)
    for name in filenames:
        with JsonLineData(os.path.join(path, name)) as records:
            for record in records:
                key = record.get('id')
                if key is not None:
                    merged[key].update(record)
    return merged
def load_language_co_occurrence(path):
    """Build the language co-occurrence graph from ``Languages.jsonl``.

    Nodes are the keys of each record's ``languages`` mapping. Each node
    carries two attributes:

    * ``size`` - sum of the mapped values over all records that list it;
    * ``occurrence`` - number of records that list it.

    Two languages are joined by an edge when they appear in the same
    record; the edge ``weight`` counts how many records they share.

    :param path: directory that contains ``Languages.jsonl``.
    :return: an undirected ``nx.Graph``.
    :raises KeyError: if a record has no ``languages`` field.
    """
    path = os.path.join(path, 'Languages.jsonl')
    graph = nx.Graph()
    # Node totals are accumulated outside the graph and attached through
    # ``add_node`` at the end: ``graph.node`` was removed in networkx 2.4,
    # while ``add_node(n, **attrs)`` works on both 1.x and 2.x.
    sizes = {}
    occurrences = {}
    with JsonLineData(path) as data:
        for item in data:
            languages = item['languages']
            # ``items()`` works on Python 2 and 3; ``iteritems()`` is
            # Python 2 only.
            for name, amount in languages.items():
                # Register the node right away so a language that never
                # shares a record with another one is still in the graph.
                graph.add_node(name)
                sizes[name] = sizes.get(name, 0) + amount
                occurrences[name] = occurrences.get(name, 0) + 1
            for first, second in combinations(languages, 2):
                if graph.has_edge(first, second):
                    graph[first][second]['weight'] += 1
                else:
                    graph.add_edge(first, second, weight=1)
    for name, size in sizes.items():
        # Re-adding an existing node only updates its attributes.
        graph.add_node(name, size=size, occurrence=occurrences[name])
    return graph
def load_follow_features(data_path, users):
    """Vectorize the follower and followee sets of the given users.

    Reads ``Follow.jsonl`` from ``data_path``. Each record is expected to
    have ``follower`` and ``followee`` entries with an ``id``; records
    where either entry is missing or ``None`` are skipped.

    :param data_path: directory that contains ``Follow.jsonl``.
    :param users: objects exposing an ``id`` attribute; one feature row is
        produced per user, in iteration order.
    :return: ``(followers, followees)`` as returned by
        ``DictVectorizer.fit_transform``. The vectorizer is fitted once
        per matrix, so (assuming scikit-learn's refit-on-``fit_transform``
        semantics) the two matrices do not share a column layout.
    """
    followers_of = defaultdict(set)
    followees_of = defaultdict(set)
    with JsonLineData(os.path.join(data_path, 'Follow.jsonl')) as records:
        for record in records:
            source = record.get('follower')
            target = record.get('followee')
            if source is None or target is None:
                continue
            source_id, target_id = source['id'], target['id']
            followers_of[target_id].add(source_id)
            followees_of[source_id].add(target_id)

    vectorizer = DictVectorizer(sparse=True)
    # Every id in a user's set becomes a feature with the constant value 1.
    follower_rows = [dict.fromkeys(followers_of[user.id], 1)
                     for user in users]
    follower_matrix = vectorizer.fit_transform(follower_rows)
    followee_rows = [dict.fromkeys(followees_of[user.id], 1)
                     for user in users]
    followee_matrix = vectorizer.fit_transform(followee_rows)
    return follower_matrix, followee_matrix
def load_graph(path, graph_type, item_filter=None):
    """Load the graph registered under ``graph_type`` in ``GRAPH_METADATA``.

    The metadata entry supplies the file name (``filename``), whether the
    graph is directed (``directed``), and for both ``head`` and ``tail``
    the record field to read (``name``) and the node type to build
    (``class``).

    :param path: directory that contains the data file.
    :param graph_type: key into ``GRAPH_METADATA``.
    :param item_filter: optional predicate; records for which it returns a
        falsy value are skipped.
    :return: ``nx.DiGraph`` or ``nx.Graph``, or ``None`` when
        ``graph_type`` has no (or an empty) metadata entry.
    """
    metadata = GRAPH_METADATA.get(graph_type)
    if not metadata:
        return None

    path = os.path.join(path, metadata['filename'])
    graph = nx.DiGraph() if metadata['directed'] else nx.Graph()
    with JsonLineData(path) as records:
        for record in records:
            if item_filter is not None and not item_filter(record):
                continue
            # Both endpoints are popped so that only the remaining fields
            # of the record end up as edge attributes.
            head = record.pop(metadata['head']['name'], None)
            tail = record.pop(metadata['tail']['name'], None)
            if head is None or tail is None:
                continue
            head_node = metadata['head']['class'](head['id'])
            tail_node = metadata['tail']['class'](tail['id'])
            # Edges run from tail to head.
            graph.add_edge(tail_node, head_node, **record)
    return graph
def load_repository_languages(path):
    """Map repository ids to their ``languages`` entry.

    :param path: directory that contains ``Languages.jsonl``.
    :return: dict of ``record['repo']['id']`` -> ``record['languages']``;
        if an id occurs more than once, the last record wins.
    :raises KeyError: if a record lacks ``repo``, ``repo.id`` or
        ``languages``.
    """
    languages_by_repo = {}
    with JsonLineData(os.path.join(path, 'Languages.jsonl')) as records:
        for record in records:
            repo_id = record['repo']['id']
            languages_by_repo[repo_id] = record['languages']
        return languages_by_repo