示例#1
0
def load_repository_descriptions(path, summary=True, show_fork=False):
    """Map repository ids to their description text.

    Reads ``RepositorySummary.jsonl`` when ``summary`` is true, otherwise
    ``Repository.jsonl``, from the directory ``path``. Records whose
    ``fork`` field is truthy are skipped unless ``show_fork`` is set. A
    missing or empty description is stored as ``''``.
    """
    if summary:
        filename = 'RepositorySummary.jsonl'
    else:
        filename = 'Repository.jsonl'
    descriptions = {}
    with JsonLineData(os.path.join(path, filename)) as records:
        for record in records:
            # Forks are only kept when the caller explicitly asks for them.
            if not show_fork and record.get('fork'):
                continue
            descriptions[record.get('id')] = record.get('description') or ''
    return descriptions
示例#2
0
def load_language_graph(path):
    """Build an undirected graph linking repositories to their languages.

    Reads ``Languages.jsonl`` from the directory ``path``. For every record
    an edge is added between ``Repository(record['repo']['id'])`` and
    ``Language(name)`` for each entry of ``record['languages']``; the
    mapped value is stored as the edge's ``weight`` attribute.

    Returns the populated ``nx.Graph``.
    """
    path = os.path.join(path, 'Languages.jsonl')
    graph = nx.Graph()
    with JsonLineData(path) as data:
        for item in data:
            repo = Repository(item['repo']['id'])
            # dict.items() exists on both Python 2 and 3, whereas
            # iteritems() raises AttributeError on Python 3.
            for name, weight in item['languages'].items():
                graph.add_edge(repo, Language(name), weight=weight)
    return graph
示例#3
0
def _load_datasets(path, *filenames):
    """Merge records from several JSON-lines files, keyed by their ``id``.

    Each file in ``filenames`` is read from the directory ``path``. Records
    sharing an ``id`` are combined into one dict, with later records
    overwriting overlapping fields; records without an ``id`` are ignored.

    Returns a ``defaultdict(dict)`` mapping id -> merged record.
    """
    merged = defaultdict(dict)
    for name in filenames:
        with JsonLineData(os.path.join(path, name)) as records:
            for record in records:
                key = record.get('id')
                if key is not None:
                    merged[key].update(record)
    return merged
示例#4
0
def load_language_co_occurrence(path):
    """Build the language co-occurrence graph from ``Languages.jsonl``.

    Every language name seen in a record's ``languages`` mapping becomes a
    node carrying two attributes:

    * ``size`` -- sum of the values mapped to that language over all records
    * ``occurrence`` -- number of records that mention the language

    Two languages appearing in the same record are joined by an edge whose
    ``weight`` counts the records in which they appear together.

    Returns the populated ``nx.Graph``.
    """
    path = os.path.join(path, 'Languages.jsonl')
    graph = nx.Graph()
    # Node totals are accumulated locally and attached at the end through
    # add_node(**attrs). This avoids the ``Graph.node`` accessor, which
    # newer networkx releases no longer provide.
    sizes = {}
    occurrences = {}
    with JsonLineData(path) as data:
        for item in data:
            languages = item['languages']
            # .items() works on Python 2 and 3; iteritems() is Python 2 only.
            for name, size in languages.items():
                graph.add_node(name)
                sizes[name] = sizes.get(name, 0) + size
                occurrences[name] = occurrences.get(name, 0) + 1
            for m, n in combinations(languages, 2):
                graph.add_edge(m, n)
                edge = graph[m][n]
                edge['weight'] = edge.get('weight', 0) + 1
    for name, total in sizes.items():
        # add_node on an existing node only updates its attributes.
        graph.add_node(name, size=total, occurrence=occurrences[name])
    return graph
示例#5
0
def load_follow_features(data_path, users):
    """Vectorize the follower and followee sets of ``users``.

    Reads ``Follow.jsonl`` from ``data_path``; each record contributes one
    follower -> followee relation (records missing either side are
    skipped). For every user in ``users`` (looked up by ``user.id``) two
    indicator dicts are built, one from the ids following the user and one
    from the ids the user follows, and each list of dicts is passed through
    ``DictVectorizer.fit_transform``.

    Returns a ``(followers, followees)`` pair of the transformed results.
    Note that the vectorizer is re-fitted for the second list, so the two
    results do not share a feature space.
    """
    followed_by = defaultdict(set)
    following = defaultdict(set)
    with JsonLineData(os.path.join(data_path, 'Follow.jsonl')) as records:
        for record in records:
            source = record.get('follower')
            target = record.get('followee')
            if source is None or target is None:
                continue
            source_id = source['id']
            target_id = target['id']
            followed_by[target_id].add(source_id)
            following[source_id].add(target_id)

    vectorizer = DictVectorizer(sparse=True)
    follower_rows = [dict.fromkeys(followed_by[user.id], 1) for user in users]
    follower_matrix = vectorizer.fit_transform(follower_rows)
    followee_rows = [dict.fromkeys(following[user.id], 1) for user in users]
    followee_matrix = vectorizer.fit_transform(followee_rows)
    return follower_matrix, followee_matrix
示例#6
0
def load_graph(path, graph_type, item_filter=None):
    """Load the graph described by ``GRAPH_METADATA[graph_type]``.

    The metadata entry supplies the file name (relative to ``path``),
    whether the graph is directed, and for both edge endpoints the record
    field name and the class used to wrap the endpoint's ``id``. Records
    rejected by ``item_filter`` (when given) or missing either endpoint are
    skipped. Each edge runs from the tail node to the head node, and the
    record's remaining fields become edge attributes.

    Returns the graph, or ``None`` when ``graph_type`` has no metadata.
    """
    metadata = GRAPH_METADATA.get(graph_type)
    if not metadata:
        return None
    path = os.path.join(path, metadata['filename'])
    graph = nx.DiGraph() if metadata['directed'] else nx.Graph()
    with JsonLineData(path) as data:
        for item in data:
            if item_filter is not None and not item_filter(item):
                continue
            # Pop the endpoints so only the remaining fields end up as
            # edge attributes.
            head_raw = item.pop(metadata['head']['name'], None)
            tail_raw = item.pop(metadata['tail']['name'], None)
            if head_raw is None or tail_raw is None:
                continue
            head_node = metadata['head']['class'](head_raw['id'])
            tail_node = metadata['tail']['class'](tail_raw['id'])
            graph.add_edge(tail_node, head_node, **item)
    return graph
示例#7
0
def load_repository_languages(path):
    """Map each repository id to its ``languages`` entry.

    Reads ``Languages.jsonl`` from the directory ``path``; when an id
    appears more than once, the last record wins.
    """
    languages_by_repo = {}
    with JsonLineData(os.path.join(path, 'Languages.jsonl')) as records:
        for record in records:
            repo_id = record['repo']['id']
            languages_by_repo[repo_id] = record['languages']
    return languages_by_repo