def getDependencies(input_file, output_file, git_hub_key, hop, partitions):
    """Resolve dependencies for every repository link using a local Spark job.

    Each non-blank line of ``input_file`` is treated as a repository link of
    the form ``...<host>.com/<owner>/<repo>``. The collected mapping of
    ``link -> [package names]`` is written to ``output_file`` as indented JSON.
    Links without any dependency are stored with the placeholder list
    ``["No dependency"]``.

    If the Spark job fails, the error is printed and an empty JSON object is
    written, so the function stays best-effort.

    Args:
        input_file: Path of the text file with one repository link per line.
        output_file: Path of the JSON file to (over)write.
        git_hub_key: Key handed to ``xkcd2347.GitHub``.
        hop: Passed as ``depth`` to ``get_dependencies``.
        partitions: Forwarded to ``SparkContext.textFile`` as its second
            argument (the minimum number of partitions).
    """
    gh = xkcd2347.GitHub(key=git_hub_key)

    def process_line(link):
        """Return ``(link, package_names)`` for a single repository link."""
        link = link.rstrip()
        owner, repo = link.split(".com/")[1].split("/")
        packages = [
            dep['packageName'] for dep in gh.get_dependencies(
                repo_owner=owner, repo_name=repo, depth=hop)
        ]
        return (link, packages if packages else ["No dependency"])

    result = []
    # Track the context explicitly so cleanup never touches an unbound name
    # when SparkContext construction itself fails.
    sc = None
    try:
        sc = SparkContext("local[*]", "PySpark Tutorial")
        result = (
            sc.textFile(input_file, partitions)
            # Blank lines carry no link; without this they would fail the job.
            .filter(lambda line: line.strip())
            .map(process_line)
            .collect()
        )
    except Exception as exc:
        # Best-effort: report the failure and fall through to write "{}".
        print("Error running spark job:", exc)
    finally:
        # Always release the context, even if the job raised.
        if sc is not None:
            sc.stop()

    with open(output_file, 'w') as fp:
        json.dump(dict(result), fp, indent=4)
def getDependencies(input_file, output_file, git_hub_key, hop):
    """Resolve dependencies for each repository link, one JSON object per line.

    Every non-blank line of ``input_file`` is treated as a repository link of
    the form ``...<host>.com/<owner>/<repo>``. For each link that has at least
    one dependency, a ``{link: [package names]}`` object is appended to
    ``output_file`` followed by a newline. Links without dependencies are
    collected and written as a JSON list to a sibling file named
    ``no_result_data_<basename of output_file>`` in the same directory as
    ``output_file``.

    Args:
        input_file: Path of the text file with one repository link per line.
        output_file: Path of the file that results are appended to.
        git_hub_key: Key handed to ``xkcd2347.GitHub``.
        hop: Passed as ``depth`` to ``get_dependencies``.
    """
    import os

    no_dependency_data = []
    gh = xkcd2347.GitHub(key=git_hub_key)

    with open(input_file, 'r') as fp:
        gitLink = fp.readlines()

    with open(output_file, 'a') as fp:
        for link in tqdm(gitLink):
            link = link.rstrip()
            if not link:
                # A blank line has no ".com/" part and would raise IndexError.
                continue
            owner, repo = link.split(".com/")[1].split("/")
            result = [
                dep['packageName'] for dep in gh.get_dependencies(
                    repo_owner=owner, repo_name=repo, depth=hop)
            ]
            if result:
                json.dump({link: result}, fp)
                fp.write("\n")
            else:
                no_dependency_data.append(link)

    # Prefix only the file name: prefixing the whole path would corrupt any
    # output_file that contains a directory component.
    no_result_path = os.path.join(
        os.path.dirname(output_file),
        'no_result_data_' + os.path.basename(output_file),
    )
    with open(no_result_path, 'w') as fp:
        json.dump(no_dependency_data, fp)
def test_cache():
    """Check that dependencies can be fetched through a disk-backed cache.

    Recreates the ``test-cache`` directory from scratch, builds a
    ``diskcache.Cache`` on it, and passes that cache to the GitHub client
    before requesting the dependencies of ``docnow/twarc``.
    """
    scratch = pathlib.Path('test-cache')
    # Start from an empty directory so earlier runs cannot influence this one.
    if scratch.exists():
        shutil.rmtree(scratch)
    scratch.mkdir()

    client = xkcd2347.GitHub(key=key, cache=diskcache.Cache(scratch))
    found = [dep for dep in client.get_dependencies('docnow', 'twarc')]

    assert len(found) > 0
    first = found[0]
    assert first['repository']['owner']['login']
def getDependencies(input_file, output_file, git_hub_key, hop):
    """Resolve dependencies for the first repository link in ``input_file``.

    Only the first line of ``input_file`` is processed. It is treated as a
    repository link of the form ``...<host>.com/<owner>/<repo>``. The result
    is written to ``output_file`` as indented JSON: ``{link: [package names]}``
    when the repository has dependencies, otherwise an empty object. An empty
    input file or a blank first line also yields an empty object.

    Args:
        input_file: Path of the text file whose first line is the link.
        output_file: Path of the JSON file to (over)write.
        git_hub_key: Key handed to ``xkcd2347.GitHub``.
        hop: Passed as ``depth`` to ``get_dependencies``.
    """
    dependency_data = {}

    with open(input_file, 'r') as fp:
        # Only the first line is used; readline() returns '' for an empty
        # file, so there is no index lookup that could fail.
        link = fp.readline().rstrip()

    if link:
        gh = xkcd2347.GitHub(key=git_hub_key)
        owner, repo = link.split(".com/")[1].split("/")
        result = [
            dep['packageName'] for dep in gh.get_dependencies(
                repo_owner=owner, repo_name=repo, depth=hop)
        ]
        if result:
            dependency_data[link] = result

    with open(output_file, 'w') as fp:
        json.dump(dependency_data, fp, indent=4)
def test_get_dependencies():
    """Check that ``docnow/twarc`` reports at least one dependency.

    The first dependency returned must also carry a truthy
    ``repository.owner.login`` value.
    """
    client = xkcd2347.GitHub(key=key)
    found = [dep for dep in client.get_dependencies('docnow', 'twarc')]

    assert len(found) > 0
    first = found[0]
    assert first['repository']['owner']['login']