def check_property_null(repo_path):
    """Scan every repo JSON under *repo_path* and record, per dependency
    property, which repositories have that property set to null.

    :param repo_path: directory containing one JSON file per repository
    :return: dict mapping property name -> set of nameWithOwner strings
    """
    null_property_dic = {
        "hasDependencies": set(),
        "packageManager": set(),
        "packageName": set(),
        "repository": set(),
        "requirements": set(),
    }
    for repo_file in os.listdir(repo_path):
        if os.path.splitext(repo_file)[1] != ".json":
            continue
        # named json_dic (not json) so the stdlib json module is not shadowed
        json_dic = read_json_file(os.path.join(repo_path, repo_file))
        try:
            dependencyGraphManifest_nodes = json_dic["data"]["repository"][
                "dependencyGraphManifests"]["nodes"]
            for node in dependencyGraphManifest_nodes:
                for dep_node in node["dependencies"]["nodes"]:
                    for key, val in dep_node.items():
                        if val is None:
                            null_property_dic[key].add(
                                json_dic["data"]["repository"]
                                ["nameWithOwner"])
        except Exception as e:
            # best-effort scan: report the offending file and keep going
            # (was missing the path separator between dir and file name)
            print(e)
            print("exception at: " + os.path.join(repo_path, repo_file))
    return null_property_dic
def get_arxivId_paperTitle_dic(self):
    """Build a mapping from arXiv id to paper title out of the
    papers-with-abstracts dump (later duplicates overwrite earlier ones,
    as before)."""
    paper_items = read_json_file(
        r"C:\Disk_Dev\Repository\github-KG\github-KG-python\tx_data\resource\paperswithcode\papers-with-abstracts.json"
    )
    return {item["arxiv_id"]: item["title"] for item in paper_items}
def load_repo_dir_jsons(repo_dir_path):
    """Load every .json file in *repo_dir_path*.

    :param repo_dir_path: directory of repository JSON files
    :return: OrderedDict of file name -> parsed JSON, in os.listdir order
    """
    res = OrderedDict()
    for repo_file in os.listdir(repo_dir_path):
        if os.path.splitext(repo_file)[1] != ".json":
            continue
        # os.path.join instead of manual "/" concatenation, consistent
        # with the rest of the file; json_dic avoids shadowing the json module
        json_dic = read_json_file(os.path.join(repo_dir_path, repo_file))
        res[repo_file] = json_dic
    return res
def load_pwc_json(dir_path):
    """Load the paperswithcode dump files that live in *dir_path*.

    datasets.json is optional; its entry is None when the file is absent.
    """
    def _load(file_name):
        # tiny helper: read one dump file from the directory
        return read_json_file(os.path.join(dir_path, file_name))

    evaluation_tables = _load('evaluation-tables.json')
    links_between_papers_and_code = _load(
        'links-between-papers-and-code.json')
    methods = _load('methods.json')
    papers_with_abstracts = _load('papers-with-abstracts.json')
    datasets = None
    if os.path.exists(os.path.join(dir_path, 'datasets.json')):
        datasets = _load('datasets.json')
    return {
        "evaluation-tables": evaluation_tables,
        "links_between_papers_and_code": links_between_papers_and_code,
        "methods": methods,
        "papers_with_abstracts": papers_with_abstracts,
        'datasets': datasets
    }
def get_exist_repo_list_by_info(repo_dir_path):
    """Collect nameWithOwner from every info file in *repo_dir_path*.

    NOTE(review): files with a .md extension are also parsed as JSON here —
    presumably the crawler saved some info files under .md; confirm before
    tightening the filter.
    """
    name_list = []
    for info_file in os.listdir(repo_dir_path):
        if os.path.splitext(info_file)[1] in (".json", '.md'):
            json_dic = read_json_file(os.path.join(repo_dir_path, info_file))
            nameWithOwner = jsonpath.jsonpath(
                json_dic, "$.data.repository.nameWithOwner")[0]
            name_list.append(nameWithOwner)
    return name_list
def split_portrait_train_test_by_count(kwargs):
    """Split each served repo's portrait entities into a train and a test
    set, picking a fixed number of items for training.

    :param kwargs: dict of options — 'to_reco_csv_file',
        'repo_portrait_json_file', 'total_entities', 'payload_entities',
        'init_input_count', 'split_seed'
    :return: {nameWithOwner: {'train': {entity: [dict, ...]},
                              'test': {entity: [key, ...]}}}
    """
    # Fetch the repos that recommendations should be served for
    df_to_reco_csv = pd.read_csv(kwargs.get('to_reco_csv_file'))
    relation_row_list = df_to_reco_csv.iloc[:, 0:2].values.tolist()
    payload_repo_dic = defaultdict(list)
    for row in relation_row_list:
        payload_repo_dic[row[0]].append(row[1])
    payload_repo_set = payload_repo_dic.keys()
    # Read the repo portrait information (used for experiments)
    total_repo_portrait_json_dic = read_json_file(
        kwargs.get('repo_portrait_json_file'))
    # Keep both train and test sets so metrics can be computed later
    repo_portrait_train_test_dic = defaultdict(dict)
    for i, nameWithOwner in enumerate(payload_repo_set):
        # Train/test split by fixed selection count
        nameWithOwner_portrait_info = total_repo_portrait_json_dic[
            nameWithOwner]
        repo_portrait_train_test_dic[nameWithOwner] = defaultdict(dict)
        repo_portrait_train_test_dic[nameWithOwner]['train'] = defaultdict(
            list)
        for entity in kwargs.get('total_entities'):
            # Entities missing from the portrait default to an empty list
            entity_list = nameWithOwner_portrait_info.get(entity, [])
            # Initialize key:[] explicitly so the key exists even when the
            # list is empty; defaultdict(list) alone would leave it missing
            repo_portrait_train_test_dic[nameWithOwner]['train'][entity] = []
            repo_portrait_train_test_dic[nameWithOwner]['test'][entity] = []
            # If this entity is not part of the training payload, put all of
            # it in the test set; otherwise split it
            if entity not in kwargs.get('payload_entities'):
                repo_portrait_train_test_dic[nameWithOwner]['test'][
                    entity] = entity_list
                continue
            # Randomly pick init_input_count objects as the train set;
            # returns empty lists when there are not enough.
            # entity_list comes from a fixed repo_portrait_json file, so the
            # sorted() here is technically optional (kept for determinism).
            train_entity_list, test_entity_list = split_list_by_sub_count(
                sorted(entity_list), kwargs.get('init_input_count'),
                kwargs.get('split_seed'))
            # The test set stays a plain list of keys
            repo_portrait_train_test_dic[nameWithOwner]['test'][
                entity] = test_entity_list
            # The train set is a list of dicts carrying per-target
            # attributes such as tf (uniform term frequency)
            entity_portrait_dic_list = []
            for entityKey in train_entity_list:
                train_target_dic = {
                    'key': entityKey,
                    'tf': 1 / len(train_entity_list)
                }
                entity_portrait_dic_list.append(train_target_dic)
            repo_portrait_train_test_dic[nameWithOwner]['train'][
                entity] = entity_portrait_dic_list
    # dump of the train/test sets (disabled)
    # to_reco_csv_file_name = kwargs.get('to_reco_csv_file').split('\\')[-1:][0]
    # out_dir = os.path.join(kwargs.get('out_dir'), os.path.splitext(to_reco_csv_file_name)[0], 'init_' + str(kwargs.get('init_input_count')))
    # dump_repo_train_test_path = os.path.join(out_dir, 'repo_portrait_init_train_test' + '_' + str(uuid1()) + '.json')
    # write_json_file(out_dir, dump_repo_train_test_path, repo_portrait_train_test_dic)
    return repo_portrait_train_test_dic
def get_data_one_topic_repo(topic_path):
    """Gather full_name of all usable repos from one topic's page files.

    Pre-filter: empty, forked and private repositories are ignored.
    """
    full_names = set()
    for page_file in os.listdir(topic_path):
        page_json = read_json_file(os.path.join(topic_path, page_file))
        for item in page_json["items"]:
            exclude_flag = item["size"] == 0 or item["fork"] or item["private"]
            if exclude_flag is True:
                continue
            full_names.add(item["full_name"])
    return full_names
def handle_raw_json():
    """Copy per-language byte sizes from languages.edges onto the matching
    languages.nodes entries of a sample repo JSON, then serialize it."""
    json_dic = read_json_file(
        r"C:\Disk_Data\Small_Data\Neo4j\tensorflow-$-tensorflow.json")
    edges = jsonpath.jsonpath(json_dic,
                              "$.data.repository.languages.edges[*]")
    nodes = jsonpath.jsonpath(json_dic,
                              "$.data.repository.languages.nodes[*]")
    # edges[i] carries the size for nodes[i]
    for i, node in enumerate(nodes):
        node["size"] = edges[i]["size"]
    json_dic["data"]["repository"]["languages"]["nodes"] = nodes
    json_str = json.dumps(json_dic)
    pass
def check_errors_repos(repo_path):
    """Return {file_name: errors} for every repo JSON whose payload
    carries a top-level "errors" field (failed GraphQL queries)."""
    error_repo_dic = {}
    for repo_file in os.listdir(repo_path):
        if os.path.splitext(repo_file)[1] != ".json":
            continue
        payload = read_json_file(os.path.join(repo_path, repo_file))
        # jsonpath returns False when the key does not exist
        errors = jsonpath.jsonpath(payload, "$.errors")
        if errors is not False:
            error_repo_dic[repo_file] = errors
    return error_repo_dic
def get_needless_repo_set(repo_dir, raw_file_list, min_dependency_count):
    """Classify repo JSON files that should be dropped from the dataset.

    1. Files whose repos share a nameWithOwner are the same repo under
       another file name; only the first copy is kept.
    2. Repos without dependencies are isolated nodes, useless for
       recommendation.

    :param repo_dir: directory holding the repo JSON files
    :param raw_file_list: candidate file names to examine
    :param min_dependency_count: minimum number of distinct packages a
        repo must depend on to be kept
    :return: (dup_set, no_dependency_set, less_dependency_set) of file names
    """
    no_dup_nwo_set = set()
    dup_set = set()
    no_dependency_set = set()
    less_dependency_set = set()
    for repo_file in raw_file_list:
        if os.path.splitext(repo_file)[1] != ".json":
            continue
        json_dic = read_json_file(os.path.join(repo_dir, repo_file))
        nameWithOwner = jsonpath.jsonpath(
            json_dic, "$.data.repository.nameWithOwner")[0]
        if nameWithOwner in no_dup_nwo_set:
            dup_set.add(repo_file)
            continue
        no_dup_nwo_set.add(nameWithOwner)
        # Filter repos without any dependency manifest
        dgm_count = jsonpath.jsonpath(
            json_dic,
            "$.data.repository.dependencyGraphManifests.totalCount")[0]
        if dgm_count == 0:
            no_dependency_set.add(repo_file)
            continue
        packageName_list = jsonpath.jsonpath(
            json_dic, "$.data.repository.dependencyGraphManifests.nodes["
            "*].dependencies.nodes[*].packageName")
        # jsonpath returns False (not a list) when nothing matches;
        # identity check instead of the previous '== False'
        if packageName_list is False:
            no_dependency_set.add(repo_file)
            continue
        # Filter repos whose distinct dependency count is too small
        if len(set(packageName_list)) < min_dependency_count:
            less_dependency_set.add(repo_file)
            continue
    return dup_set, no_dependency_set, less_dependency_set
def get_file_name_not_match_nameWithOwner(repo_dir):
    """List (actual_file_name, expected_file_name) pairs for repo JSONs
    whose file name disagrees with their nameWithOwner payload."""
    mismatches = []
    for repo_file in os.listdir(repo_dir):
        if os.path.splitext(repo_file)[1] != '.json':
            continue
        json_dic = read_json_file(os.path.join(repo_dir, repo_file))
        nameWithOwner = jsonpath.jsonpath(
            json_dic, '$.data.repository.nameWithOwner')[0]
        # File-name convention: '/' in nameWithOwner is encoded as '-$-'
        expected_name = nameWithOwner.replace('/', '-$-') + '.json'
        if expected_name != repo_file:
            mismatches.append((repo_file, expected_name))
    return mismatches
def get_data_topic_repo_set(topic_repo_path):
    """Union of usable repo full_names over every topic directory.

    Non-JSON page files are skipped; empty, forked and private
    repositories are ignored.
    """
    full_names = set()
    for topic_dir in os.listdir(topic_repo_path):
        topic_path = os.path.join(topic_repo_path, topic_dir)
        for page_file in os.listdir(topic_path):
            if os.path.splitext(page_file)[1] != ".json":
                continue
            page_json = read_json_file(os.path.join(topic_path, page_file))
            for item in page_json["items"]:
                exclude_flag = (item["size"] == 0 or item["fork"]
                                or item["private"])
                if exclude_flag is True:
                    continue
                full_names.add(item["full_name"])
    return full_names
def construct_label_data(recommender_result_json_file_dic, out_csv):
    """Merge several recommenders' labeled results into one flat CSV.

    :param recommender_result_json_file_dic: recommender name -> result file
    :param out_csv: path of the CSV file to write
    """
    recommender_result_json_dic = {}
    for recommender, result_file in recommender_result_json_file_dic.items():
        recommender_result_json_dic[recommender] = read_json_file(result_file)
    # Incrementally-updated nested dict: repo -> package -> column values
    repo_package_row_dic = defaultdict(dict)
    for recommender, reco_result_json_dic in (
            recommender_result_json_dic.items()):
        for repo_i, record_dic in reco_result_json_dic.items():
            for record in record_dic['package']:
                package_j = record['key']
                if repo_package_row_dic[repo_i].get(package_j) is None:
                    repo_package_row_dic[repo_i][package_j] = defaultdict(dict)
                    # Default every recommender's score column to NaN
                    # (np.nan — np.NAN was removed in NumPy 2.0)
                    for key in recommender_result_json_file_dic.keys():
                        repo_package_row_dic[repo_i][package_j][key] = np.nan
                repo_package_row_dic[repo_i][package_j][recommender] = record[
                    'score']
                if record.get('repoDegree') is None:
                    # was a bare debug print(1); make the anomaly readable
                    print('missing repoDegree for record: ' + str(package_j))
                repo_package_row_dic[repo_i][package_j]['repoDegree'] = record[
                    'repoDegree']
                repo_package_row_dic[repo_i][package_j]['hit'] = record['hit']
    # Flatten the nested dict into CSV rows
    res_row_list = []
    for repo_i, package_dic in repo_package_row_dic.items():
        for package_j, col_dic in package_dic.items():
            # repo and package columns first, then the attribute columns
            res_row_dic = {'repo_i': repo_i, 'package_j': package_j}
            res_row_dic.update(col_dic)
            res_row_list.append(res_row_dic)
    pd.DataFrame(res_row_list).to_csv(out_csv, index=False)
def stat_dependencyGraphManifests(repo_path):
    """Count how many repo JSONs have at least one dependency manifest.

    :return: {"has_dfm": count_with, "no_dfm": count_without}
    """
    res = {
        "has_dfm": 0,
        "no_dfm": 0,
    }
    for repo_file in os.listdir(repo_path):
        if os.path.splitext(repo_file)[1] != ".json":
            continue
        json_dic = read_json_file(os.path.join(repo_path, repo_file))
        manifest_total = jsonpath.jsonpath(
            json_dic,
            "$.data.repository.dependencyGraphManifests.totalCount")[0]
        bucket = "no_dfm" if manifest_total == 0 else "has_dfm"
        res[bucket] += 1
    return res
def repo_property():
    """Exploratory helper: extract a few property lists from the sample
    tensorflow repo JSON (computes local sets, returns nothing)."""
    json_dic = read_json_file(
        r"C:\Disk_Data\Small_Data\Neo4j\tensorflow-$-tensorflow.json")
    package_names = jsonpath.jsonpath(
        json_dic,
        "$.data.repository.dependencyGraphManifests.nodes[*].dependencies.nodes[*].packageName"
    )
    # tensorflow has two entries with "repository": null
    repo_names = jsonpath.jsonpath(
        json_dic,
        "$.data.repository.dependencyGraphManifests.nodes[*].dependencies.nodes[*].repository.nameWithOwner"
    )
    topic_names = jsonpath.jsonpath(
        json_dic, "$.data.repository.repositoryTopics.nodes[*].topic.name")
    distinct_packages = set(package_names)
    distinct_repos = set(repo_names)
def stat_dependency_count_dic(repo_dir, nameWithOwner, train_package_set):
    """Count how often each dependency appears for one repo, and how many
    of its dependency entries fall inside *train_package_set*.

    :param repo_dir: directory holding repo JSON files
    :param nameWithOwner: repo identifier 'owner/name'
    :param train_package_set: set of 'manager/name' keys in the train set
    :return: (depended_count_dic, train_package_depended_count)
    """
    repo_file = nameWithOwner.replace('/', '-$-') + '.json'
    file_path = os.path.join(repo_dir, repo_file)
    json_dic = read_json_file(file_path)
    ground_truth_dependency_nodes_list = jsonpath.jsonpath(
        json_dic,
        "$.data.repository.dependencyGraphManifests.nodes[*].dependencies.nodes[*]"
    )
    # Hoisted out of the loop: the original re.compile'd on every iteration
    whitespace_pattern = re.compile(r'\s+')
    depended_count_dic = defaultdict(int)
    train_package_depended_count = 0
    for node in ground_truth_dependency_nodes_list:
        # Normalize to 'manager/name' with all whitespace stripped
        nameWithManager = whitespace_pattern.sub(
            '', node.get("packageManager") + "/" + node.get("packageName"))
        if nameWithManager in train_package_set:
            train_package_depended_count += 1
        depended_count_dic[nameWithManager] += 1
    return depended_count_dic, train_package_depended_count
def check_multi_page_100(repo_path):
    """Find repos where a paginated GraphQL collection exceeds the
    100-items-per-page limit (so further pages would have been needed).

    :param repo_path: directory of repo JSON files
    :return: dict of category name -> set of nameWithOwner
    """
    over_100_dic = {
        "dependencyGraphManifests_count": set(),
        "max_dependency_count": set(),
        "language_count": set(),
        "topic_count": set()
    }
    for repo_file in os.listdir(repo_path):
        if os.path.splitext(repo_file)[1] != ".json":
            continue
        # json_dic rather than json so the stdlib module is not shadowed
        json_dic = read_json_file(os.path.join(repo_path, repo_file))
        try:
            repo = json_dic["data"]["repository"]
            repoName = repo["nameWithOwner"]
            exclude = repo["isEmpty"] or repo["isFork"] or repo[
                "isLocked"] or repo["isPrivate"]
            if exclude:
                # (message previously lacked the path separator)
                print("should be excluded: " +
                      os.path.join(repo_path, repo_file))
            dependencyGraphManifests_count = repo["dependencyGraphManifests"][
                "totalCount"]
            if dependencyGraphManifests_count > 100:
                over_100_dic["dependencyGraphManifests_count"].add(repoName)
            # Largest dependency list over all manifests of this repo
            max_dependency_count = 0
            for node in repo["dependencyGraphManifests"]["nodes"]:
                dependency_count = node["dependencies"]["totalCount"]
                if dependency_count > max_dependency_count:
                    max_dependency_count = dependency_count
            if max_dependency_count > 100:
                over_100_dic["max_dependency_count"].add(repoName)
            if repo["languages"]["totalCount"] > 100:
                over_100_dic["language_count"].add(repoName)
            if repo["repositoryTopics"]["totalCount"] > 100:
                over_100_dic["topic_count"].add(repoName)
        except Exception as e:
            # best-effort scan: report and continue with the next file
            print(e)
            print("exception at: " + os.path.join(repo_path, repo_file))
    return over_100_dic
def stat_topic_set(repo_dir_path, out_csv=None):
    """Collect the sorted set of topics across repos that have at least
    one dependency manifest, and dump it to a CSV.

    :param repo_dir_path: directory of repo JSON files
    :param out_csv: target CSV path; defaults to the historical
        hard-coded location for backward compatibility
    :return: sorted list of distinct topic names
    """
    if out_csv is None:
        out_csv = r"C:\Disk_Dev\Repository\github-KG\github-KG-python\tx_data\resource\paperswithcode\sorted_topic_set_hasdfm.csv"
    topic_list = []
    for repo_file in os.listdir(repo_dir_path):
        if os.path.splitext(repo_file)[1] != ".json":
            continue
        json_dic = read_json_file(os.path.join(repo_dir_path, repo_file))
        dependencyGraphManifests_count = jsonpath.jsonpath(
            json_dic,
            "$.data.repository.dependencyGraphManifests.totalCount")[0]
        if dependencyGraphManifests_count == 0:
            continue
        repo_topic_list = jsonpath.jsonpath(
            json_dic, "$.data.repository.repositoryTopics.nodes[*].topic.name")
        # jsonpath returns False when the repo has no topics
        if repo_topic_list is False:
            continue
        topic_list.extend(repo_topic_list)
    sorted_topic_set = sorted(set(topic_list))
    df = pd.DataFrame(columns=["topic"], data=sorted_topic_set)
    df.to_csv(out_csv, encoding='utf-8')
    return sorted_topic_set
def jsonpath_not_exist_key():
    """Probe jsonpath's behavior for a key that may be absent: it yields
    False instead of raising."""
    sample_dic = read_json_file(
        r"C:\Disk_Data\Small_Data\Neo4j\tensorflow-$-tensorflow.json")
    return jsonpath.jsonpath(sample_dic, "$.errors")
def add_recommender_attr_and_label(reco_result_json_file, package_degree_csv,
                                   **kwargs):
    """Annotate a recommender's result file with popularity (repoDegree)
    and hit labels, then dump the labeled results.

    The whole dataset is used as the test set.

    :param reco_result_json_file: JSON file of recommendation results
    :param package_degree_csv: CSV mapping nameWithManager -> repoDegree
    :param kwargs: split/dump options (to_reco_csv_file, out_dir,
        init_input_count, interaction_iter_count, topN, ...)
    """
    if kwargs.get('topN') < 1000:
        print('topN param error')
        exit(0)
    # BUGFIX: split_portrait_train_test_by_count takes its options dict as a
    # single positional argument; calling it with **kwargs raised TypeError.
    init_train_test_dic = split_portrait_train_test_by_count(kwargs)
    vali_repo_list = init_train_test_dic.keys()
    payload_repo_set = set()
    for nameWithOwner in vali_repo_list:
        # Keep only repos that actually have a (package) test set
        if len(init_train_test_dic[nameWithOwner]['test']['package']) > 0:
            payload_repo_set.add(nameWithOwner)
    payload_repo_set = sorted(payload_repo_set)
    reco_result_json_dic = read_json_file(reco_result_json_file)
    package_degree_records = pd.read_csv(package_degree_csv).to_dict(
        orient='records')
    package_degree_dic = {}
    for dic in package_degree_records:
        package_degree_dic[dic['nameWithManager']] = dic['repoDegree']
    add_label_result_json_dic = OrderedDict()
    # Check the recommendation results for every repo in the test set
    for index, nameWithOwner in enumerate(payload_repo_set):
        print('-' * 100)
        print("index: " + str(index + 1) + "/" + str(len(payload_repo_set)) +
              ", repo: " + str(nameWithOwner))
        # Take the original result entry; modifications/additions happen
        # on this shared reference
        add_label_result_json_dic[nameWithOwner] = reco_result_json_dic[
            nameWithOwner]
        test_package_set = set(
            init_train_test_dic[nameWithOwner]['test']['package'])
        train_package_set = set([
            dic['key']
            for dic in init_train_test_dic[nameWithOwner]['train']['package']
        ])
        reco_package_set = set()
        # Check each recommended package against the test set
        reco_package_record_list = add_label_result_json_dic[nameWithOwner][
            'package']
        for record in reco_package_record_list:
            nameWithManager = record['key']
            reco_package_set.add(nameWithManager)
            # Attach popularity
            record['repoDegree'] = package_degree_dic[nameWithManager]
            # hit=1 iff the package appears in the test set (mutates the
            # original dict in place)
            if nameWithManager in test_package_set:
                record['hit'] = 1
            else:
                record['hit'] = 0
        # Packages in the test set but absent from both the recommendation
        # list and the train set get new records with score '?' and hit 1,
        # so the final labeled set is the union of the recommendation list
        # (precision) and the test set (recall)
        not_in_reco_and_train_but_in_test_package_set = (
            test_package_set - train_package_set - reco_package_set)
        for package in not_in_reco_and_train_but_in_test_package_set:
            reco_package_record_list.append({
                'score': '?',
                'key': package,
                'repoDegree': package_degree_dic[package],
                'hit': 1
            })
    # Dump the labeled results
    to_reco_csv_file_name = kwargs.get('to_reco_csv_file').split('\\')[-1:][0]
    out_dir = os.path.join(kwargs.get('out_dir'),
                           os.path.splitext(to_reco_csv_file_name)[0],
                           'init_' + str(kwargs.get('init_input_count')),
                           'iter_' + str(kwargs.get('interaction_iter_count')),
                           'top' + str(kwargs.get('topN')), 'label')
    dump_file_name = os.path.splitext(
        reco_result_json_file.split('\\')[-1:][0])[0] + '_label_' + str(
            uuid1()) + '.json'
    write_json_file(out_dir, os.path.join(out_dir, dump_file_name),
                    add_label_result_json_dic)