from scraper.github import queryManager as qm
import os

yearDict = {}

# Gather all file name data
print("Checking GitHub data file names with year stamps...")
for file in os.listdir("../github-data"):
    if file.endswith(".json"):
        nameSplit = file.split(".")
        # Must have format "somePrefix.0000.json"
        if not nameSplit[0] == "YEARS" and nameSplit[1].isdigit():
            prefix = nameSplit[0]
            yearX = int(nameSplit[1])
            if prefix not in yearDict:
                yearDict[prefix] = []
            yearDict[prefix].append(yearX)

print("Sorting year data...")
# Remove duplicate years (though shouldn't be possible) and sort list
for prefix in yearDict.keys():
    yearList = yearDict[prefix]
    yearList = list(set(yearList))
    yearList.sort()
    yearDict[prefix] = yearList

yearData = qm.DataManager("../github-data/YEARS.json", False)
yearData.data = yearDict  # Store the gathered mapping before saving
yearData.fileSave()
print("Done!\n")
from scraper.github import queryManager as qm
from os import environ as env

ghDataDir = env.get("GITHUB_DATA", "../github-data")
datfilepath = "%s/labUsers.json" % ghDataDir
queryPath = "../queries/org-Members.gql"

# Only looking at NIST org members
orglist = ["usnistgov"]

# Initialize data collector
dataCollector = qm.DataManager(datfilepath, False)
dataCollector.data = {"data": {}}

# Initialize query manager
queryMan = qm.GitHubQueryManager()

# Iterate through orgs of interest
print("Gathering data across multiple paginated queries...")
for org in orglist:
    print("\n'%s'" % (org))

    try:
        outObj = queryMan.queryGitHubFromFile(
            queryPath,
            {"orgName": org, "numUsers": 50, "pgCursor": None},
            paginate=True,
            cursorVar="pgCursor",
            keysToList=["data", "organization", "membersWithRole", "nodes"],
        )
    except Exception as error:
        # Log the failure and skip this org
        print("Warning: Could not complete '%s'" % (org))
        print(error)
        continue
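# For reference, a sketch of what ../queries/org-Members.gql presumably
# contains, inferred from the variables and keysToList path used above.
# The exact field selections are an assumption, not taken from the repo;
# the variable name below is also hypothetical.
ORG_MEMBERS_QUERY_SKETCH = """
query ($orgName: String!, $numUsers: Int!, $pgCursor: String) {
  organization(login: $orgName) {
    membersWithRole(first: $numUsers, after: $pgCursor) {
      totalCount
      nodes {
        login
      }
      pageInfo {
        endCursor
        hasNextPage
      }
    }
  }
}
"""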
from scraper.github import queryManager as qm
from os import environ as env

ghDataDir = env.get("GITHUB_DATA", "../github-data")
datfilepath = "%s/extRepos.json" % ghDataDir
queryPath = "../queries/user-Repos.gql"

# Read repo info data file (to use as repo list)
inputLists = qm.DataManager("%s/labReposInfo.json" % ghDataDir, True)

# Populate repo list
repolist = []
print("Getting internal repos ...")
repolist = sorted(inputLists.data["data"].keys())
print("Repo list complete. Found %d repos." % (len(repolist)))

# Read lab user data file (to use as member list)
inputLists = qm.DataManager("%s/labUsers.json" % ghDataDir, True)

# Populate member list
memberlist = []
print("Getting LLNL members ...")
memberlist = sorted(inputLists.data["data"].keys())
print("Member list complete. Found %d users." % (len(memberlist)))

# Initialize data collector
dataCollector = qm.DataManager(datfilepath, False)
dataCollector.data = {"data": {}}

# Initialize query manager
queryMan = qm.GitHubQueryManager()

# Iterate through lab members
from scraper.github import queryManager as qm
from os import environ as env

ghDataDir = env.get("GITHUB_DATA", "../github-data")
datfilepathExt = "%s/extUsers.json" % ghDataDir
datfilepathInt = "%s/labUsers.json" % ghDataDir
queryPath = "../queries/repo-Users.gql"

# Read repo info data file (to use as repo list)
inputLists = qm.DataManager("%s/labReposInfo.json" % ghDataDir, True)

# Populate repo list
repolist = []
print("Getting internal repos ...")
repolist = sorted(inputLists.data["data"].keys())
print("Repo list complete. Found %d repos." % (len(repolist)))

# Initialize internal user data collector
# and read lab user data file (to use as member list)
dataCollectorInt = qm.DataManager(datfilepathInt, True)

# Populate member list
memberlist = []
print("Getting LLNL members ...")
memberlist = sorted(dataCollectorInt.data["data"].keys())
print("Member list complete. Found %d users." % (len(memberlist)))

# Initialize external user data collector
dataCollectorExt = qm.DataManager(datfilepathExt, False)
dataCollectorExt.data = {"data": {}}

# Initialize query manager
queryMan = qm.GitHubQueryManager()
from scraper.github import queryManager as qm

ghDataDir = "../../explore/github-data"
genDatafile = "%s/labReposInfo.json" % ghDataDir
topicsDatafile = "%s/labRepos_Topics.json" % ghDataDir
writeFile = "%s/labRepo_Metadata.json" % ghDataDir

# initialize data manager and load repo info
genDataCollector = qm.DataManager(genDatafile, True)

# initialize data manager and load repo topics
topicsCollector = qm.DataManager(topicsDatafile, True)

# initialize data manager to write collected info
infoWriter = qm.DataManager(writeFile, False)

print("\nGathering repo metadata...\n")

# iterate through repos
for repo in genDataCollector.data["data"]:
    repoData = {}
    repoObj = genDataCollector.data["data"][repo]

    repoData["name"] = repo
    repoData["description"] = repoObj["description"]
    repoData["website"] = repoObj["homepageUrl"]

    # gather any repo topics
    if repoObj["repositoryTopics"]["totalCount"] > 0:
        # (assumed continuation: read this repo's topic names from the
        # separately paginated topics file; the nested key shape mirrors
        # GitHub's repositoryTopics GraphQL connection and is illustrative)
        topicNodes = topicsCollector.data["data"][repo]["repositoryTopics"]["nodes"]
        repoData["topics"] = [n["topic"]["name"] for n in topicNodes]
from scraper.github import queryManager as qm
from os import environ as env
import os

ghDataDir = env.get("GITHUB_DATA", "../github-data")

yearDict = {}

# Gather all file name data
print("Checking GitHub data file names with year stamps...")
if not os.path.exists(ghDataDir):
    raise FileNotFoundError("Directory path '%s' does not exist." % (ghDataDir))
for file in os.listdir(ghDataDir):
    if file.endswith(".json"):
        nameSplit = file.split(".")
        # Must have format "somePrefix.0000.json"
        if not nameSplit[0] == "YEARS" and nameSplit[1].isdigit():
            prefix = nameSplit[0]
            yearX = int(nameSplit[1])
            if prefix not in yearDict:
                yearDict[prefix] = []
            yearDict[prefix].append(yearX)

print("Sorting year data...")
# Remove duplicate years (though shouldn't be possible) and sort list
for prefix in yearDict.keys():
    yearList = yearDict[prefix]
    yearList = list(set(yearList))
    yearList.sort()
    yearDict[prefix] = yearList

yearData = qm.DataManager("%s/YEARS.json" % ghDataDir, False)
yearData.data = yearDict  # Store the gathered mapping before saving
yearData.fileSave()
print("Done!\n")
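# Illustrative example (file names hypothetical): if the data directory
# contains labRepos_Activity.2019.json and labRepos_Activity.2020.json,
# the loops above produce
#   yearDict == {"labRepos_Activity": [2019, 2020]}
# and that mapping is what gets written out as YEARS.json.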
from scraper.github import queryManager as qm

# Take all input lists, process, and write back to file
fileIn = "../input_lists.json"
inputLists = qm.DataManager(fileIn, True)

print("Cleaning input lists...")
for aList in inputLists.data.keys():
    print("\t%s" % aList)
    # Standardize as all lowercase
    listWIP = [x.lower() for x in inputLists.data[aList]]
    listWIP = list(set(listWIP))  # Remove duplicates
    listWIP.sort()  # List in alphabetical order
    inputLists.data[aList] = listWIP

inputLists.fileSave()
print("Input lists cleaned!")
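# Illustrative shape of ../input_lists.json: each key maps to a list of
# strings that the script above lowercases, dedupes, and sorts. The
# "memberOrgs" key is read by the org-member script below; the values and
# the variable name here are hypothetical.
EXAMPLE_INPUT_LISTS = {
    "memberOrgs": ["example-org-a", "example-org-b"],
}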
from scraper.github import queryManager as qm
import re

datfilepath = "../github-data/labRepos_CreationHistory.json"
queryPath = "../queries/repo-CreationDate.gql"
query_commits_in = "/repos/OWNNAME/REPONAME/commits?until=CREATETIME&per_page=100"
query_commits_in2 = "/repos/OWNNAME/REPONAME/commits?per_page=100"

# Read repo info data file (to use as repo list)
inputLists = qm.DataManager("../github-data/labReposInfo.json", True)

# Populate repo list
repolist = []
print("Getting internal repos ...")
repolist = sorted(inputLists.data["data"].keys())
print("Repo list complete. Found %d repos." % (len(repolist)))

# Initialize data collector
dataCollector = qm.DataManager(datfilepath, False)
try:
    # Load existing data
    dataCollector.fileLoad()
except FileNotFoundError as error:
    # If no existing data, initialize the data object
    dataCollector.data = {"data": {}}

# Initialize query manager
queryMan = qm.GitHubQueryManager()

# Iterate through internal repos
print("Gathering data across multiple paginated queries...")
for repo in repolist:
    print("\n'%s'" % (repo))
    # Query the creation date and earliest commits for this repo
    # (continues beyond this excerpt)
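# Illustrative substitution for the query_commits_in templates defined in the
# script above (owner, repo, and timestamp are hypothetical):
#   query_commits_in.replace("OWNNAME", "example-org") \
#       .replace("REPONAME", "example-repo") \
#       .replace("CREATETIME", "2015-01-01T00:00:00Z")
# yields
#   "/repos/example-org/example-repo/commits?until=2015-01-01T00:00:00Z&per_page=100"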
from scraper.github import queryManager as qm
from os import environ as env

ghDataDir = env.get("GITHUB_DATA", "../github-data")
datfilepath = "%s/intUsers.json" % ghDataDir
queryPath = "../queries/org-Members.gql"

# Read input list of member organizations
inputLists = qm.DataManager("../input_lists.json", True)
orglist = inputLists.data["memberOrgs"]

# Initialize data collector
dataCollector = qm.DataManager(datfilepath, False)
dataCollector.data = {"data": {}}

# Initialize query manager
queryMan = qm.GitHubQueryManager()

# Iterate through orgs of interest
print("Gathering data across multiple paginated queries...")
for org in orglist:
    print("\n'%s'" % (org))

    try:
        outObj = queryMan.queryGitHubFromFile(
            queryPath,
            {"orgName": org, "numUsers": 50, "pgCursor": None},
            paginate=True,
            cursorVar="pgCursor",
            keysToList=["data", "organization", "membersWithRole", "nodes"],
        )
    except Exception as error:
        # Log the failure and skip this org
        print("Warning: Could not complete '%s'" % (org))
        print(error)
        continue
from scraper.github import queryManager as qm
from os import environ as env

ghDataDir = env.get("GITHUB_DATA", "../github-data")
datfilepath = "%s/labReposSubsets.json" % ghDataDir

# Read repo info data file (to use as repo list)
inputLists = qm.DataManager("%s/labReposInfo.json" % ghDataDir, True)

# Populate repo list
repolist = []
print("Getting internal repos...")
repolist = sorted(inputLists.data["data"].keys())
print("Repo list complete. Found %d repos." % (len(repolist)))

# Read subset input lists of orgs and repos
subsetLists = qm.DataManager("../input_lists_subsets.json", True)
myTags = sorted(subsetLists.data.keys())

# Initialize data collector
dataCollector = qm.DataManager(datfilepath, False)
dataCollector.data = {"data": {}}

# Helper function: test whether a repo name matches the org only
def matchOrg(orgName, repoName):  # TODO
    orgName = orgName.lower()
    repoName = repoName.lower()
    if orgName == repoName.split("/")[0]:
        return True
    else:
        return False
from scraper.github import queryManager as qm
from os import environ as env

ghDataDir = env.get("GITHUB_DATA", "../github-data")
datfilepath = "%s/dependencyInfo.json" % ghDataDir
queryPath = "../queries/dependency-Info.gql"

# Read repo info data file (to use as repo list)
inputLists = qm.DataManager("%s/intRepos_Dependencies.json" % ghDataDir, True)

# Populate repo list
repolist = []
print("Getting dependency repos ...")
for repoName in inputLists.data["data"]:
    for node in inputLists.data["data"][repoName]["dependencyGraphManifests"]["nodes"]:
        for repo in node["dependencies"]["nodes"]:
            if (
                repo["repository"] is not None
                and repo["repository"]["nameWithOwner"] is not None
            ):
                repolist.append(repo["repository"]["nameWithOwner"])
# Deduplicate (preserving order), then sort
repolist = list(dict.fromkeys(repolist))
repolist = sorted(repolist)
print("Repo list complete. Found %d repos." % (len(repolist)))

# Initialize data collector
dataCollector = qm.DataManager(datfilepath, False)
dataCollector.data = {"data": {}}

# Initialize query manager
queryMan = qm.GitHubQueryManager()
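# Illustrative shape of intRepos_Dependencies.json, matching the triple-nested
# traversal in the script above (all names and values hypothetical):
# {
#     "data": {
#         "example-org/example-repo": {
#             "dependencyGraphManifests": {
#                 "nodes": [
#                     {"dependencies": {"nodes": [
#                         {"repository": {"nameWithOwner": "someorg/somedep"}}
#                     ]}}
#                 ]
#             }
#         }
#     }
# }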