def get(self, uid=None):
    """Get a repository from the relational database based on a uid.

    An exact match on the parsed uid is tried first, then a case-insensitive
    partial match. If a uid is not provided, the repository that sorts first
    by descending timestamp is returned.

    Arguments:
      uid (str) : the repository identifier to look up (optional).

    Returns:
      the matching SoftwareRepository, with the parser attached as
      ``repo.parser``.

    Raises:
      NoReposError: no uid was given and the database holds no repository.
      MultipleReposExistError: more than one partial match was found.
      RepoNotFoundError: neither an exact nor a partial match was found.
    """
    from rse.main.database.models import SoftwareRepository

    # Retrieve either the last repo, or the one with a specific uid
    if not uid:
        repo = (
            self.session.query(SoftwareRepository)
            .order_by(desc("timestamp"))
            .first()
        )
        # Check for an empty result before reading repo.uid, otherwise an
        # AttributeError on None hides the intended NoReposError
        if not repo:
            raise NoReposError
        parser = get_parser(repo.uid, config=self.config)
    else:
        parser = get_parser(uid, config=self.config)
        repo = SoftwareRepository.query.filter(
            SoftwareRepository.uid == parser.uid
        ).first()

    # If an exact match isn't there, look for partial match
    if not repo:
        pattern = "%" + parser.uid + "%"
        matches = (
            self.session.query(SoftwareRepository)
            .filter(SoftwareRepository.uid.ilike(pattern))
            .all()
        )
        if len(matches) == 1:
            # Read the uid attribute instead of relying on the position of
            # the uid column in a raw result row
            return self.get(matches[0].uid)
        if len(matches) > 1:
            raise MultipleReposExistError(parser.uid)
        raise RepoNotFoundError(parser.uid)

    repo.parser = parser
    return repo
def _import_annotation(self, input_file, username, stop_line="## Criteria"):
    """A general helper (private) function to import an annotation, meaning
    we parse a repository and return additional lines for parsing.

    Lines are consumed from the top of the file until one matches the
    repository pattern or contains the stop line.

    Arguments:
      input_file (str) : path of the annotation file to read.
      username (str) : the annotating user (only checked for presence here).
      stop_line (str) : text marking where the search for a repository ends.

    Returns:
      a tuple of (repo, lines), where lines are those left after the line
      that held the repository name.

    Raises:
      RuntimeError: username or input_file is missing, or no repository
        pattern was found before the stop line or the end of the file.
      FileNotFoundError: input_file does not exist.
    """
    if not username or not input_file:
        raise RuntimeError(
            "A username and input file are required to import annotation criteria."
        )

    if not os.path.exists(input_file):
        raise FileNotFoundError(input_file)

    lines = read_file(input_file)

    # Find the repository name. The match starts out as None so that an empty
    # file, or a stop line on the very first line, is reported as "not found"
    # instead of failing on an unbound name or an empty pop
    match = None
    while lines:
        line = lines.pop(0)
        if stop_line in line:
            break
        match = re.search(repository_regex, line)
        if match:
            break

    # Retrieve the match
    if not match:
        raise RuntimeError(f"repository pattern not found in {input_file}")

    reponame = match.group()
    parser = get_parser(reponame, config=self.config)
    repo = self.get(parser.uid)
    return repo, lines
def exists(self, uid):
    """Report whether a repository with the parsed uid is in the database.

    Arguments:
      uid (str) : the repository identifier to check.

    Returns:
      True if a row with the parsed uid is found, False otherwise.
    """
    from rse.main.database.models import SoftwareRepository

    # The uid is normalized through its parser before the lookup
    wanted = get_parser(uid, config=self.config).uid
    found = SoftwareRepository.query.filter(
        SoftwareRepository.uid == wanted
    ).first()
    return found is not None
def get_or_create(self, uid):
    """Return the repository stored under a uid, adding it when missing.

    Arguments:
      uid (str) : the repository identifier to look up or add.

    Returns:
      the stored SoftwareRepository if one matches the parsed uid,
      otherwise whatever ``self.add(uid)`` returns.
    """
    from rse.main.database.models import SoftwareRepository

    wanted = get_parser(uid, config=self.config).uid
    found = SoftwareRepository.query.filter(
        SoftwareRepository.uid == wanted
    ).first()

    # Nothing is stored under the parsed uid, so add from the given uid
    return found if found else self.add(uid)
def create(self, database=None, config_file=None):
    """Add every scraped result that the encyclopedia does not hold yet.

    Intended to run after a scrape has populated ``self.results`` with
    repository identifiers.

    Arguments:
      database (str) : passed through to the Encyclopedia client.
      config_file (str) : passed through to the Encyclopedia client.
    """
    from rse.main import Encyclopedia

    client = Encyclopedia(config_file=config_file, database=database)
    for repo_id in self.results:
        parsed = get_parser(repo_id)

        # Results that already exist are left alone
        if client.exists(parsed.uid):
            continue
        client.add(parsed.uid)
def analyze(self, repo, cthresh=0.5, tthresh=1, taxonomy_uids=None, criteria_uids=None):
    """analyze takes a repository and calculates a "final answer" based on
    user provided thresholds.

    Arguments:
      repo (str) : the repository identifier to analyze.
      cthresh (float) : minimum fraction of "yes" votes for a criterion to
        be answered "yes".
      tthresh (int) : minimum number of annotations for a taxonomy category
        to be included.
      taxonomy_uids (list) : restrict the taxonomy tally to these categories.
        If not defined, every annotated category is counted.
      criteria_uids (list) : restrict the answers to these criteria. If not
        defined, all uids from ``self.list_criteria()`` are used.

    Returns:
      a dict with the keys "repo" (the parsed uid), "criteria" (criterion
      name -> "yes" or "no") and "taxonomy" (category -> annotation count).
    """
    # If the criteria list isn't defined, use all
    if not criteria_uids:
        criteria_uids = [x["uid"] for x in self.list_criteria()]

    parser = get_parser(repo, config=self.config)
    record = self.get(parser.uid)
    metrics = {"repo": parser.uid, "criteria": {}, "taxonomy": {}}

    # Calculate "final" answers for each criteria based on votes and threshold
    tallies = {}
    for name, votes in record.get_criteria().items():

        # Skip criteria if not important
        if name not in criteria_uids:
            continue
        tally = tallies.setdefault(name, {"yes": 0, "no": 0, "total": 0})
        for response in votes.values():
            tally[response] += 1
            tally["total"] += 1

    for name, tally in tallies.items():
        # A criterion without any vote has no answer to report, and would
        # otherwise divide by zero
        if not tally["total"]:
            continue
        answer = "yes" if tally["yes"] / tally["total"] >= cthresh else "no"
        metrics["criteria"][name] = answer

    # Tally taxonomy categories across users. The filter only applies when
    # the caller asked for a subset, so the default keeps counting every
    # annotated category
    counts = {}
    for categories in record.get_taxonomy().values():
        for category in categories:
            if taxonomy_uids and category not in taxonomy_uids:
                continue
            counts[category] = counts.get(category, 0) + 1

    # Include those above the requested threshold
    for name, count in counts.items():
        if count >= tthresh:
            metrics["taxonomy"][name] = count

    return metrics
def create(self, database=None, config_file=None):
    """Add scraped results that are new to the encyclopedia, with their doi.

    Intended to run after a scrape has populated ``self.results``; each
    result is read for its "url" and (optionally) its "doi".

    Arguments:
      database (str) : passed through to the Encyclopedia client.
      config_file (str) : passed through to the Encyclopedia client.
    """
    from rse.main import Encyclopedia

    client = Encyclopedia(config_file=config_file, database=database)
    for result in self.results:
        # Keep what follows the last "//" of the url as the identifier
        stripped_url = result["url"].split("//")[-1]
        parsed = get_parser(stripped_url)

        # Results that already exist are left alone
        if client.exists(parsed.uid):
            continue

        client.add(parsed.uid)
        if result.get("doi"):
            client.label(parsed.uid, key="doi", value=result.get("doi"))
def add(self, uid):
    """Add a new software repository to the database.

    Arguments:
      uid (str) : the unique identifier of the repository to add.

    Returns:
      the new SoftwareRepository, or None if no uid was given or the parser
      did not provide any metadata.
    """
    if not uid:
        bot.error("Please define a unique identifier to add.")
        return None

    parser = get_parser(uid, config=self.config)
    data = parser.get_metadata()

    # If it's a parser handoff, continue with the parser that was returned
    if isinstance(data, ParserBase):
        parser = data
        data = parser.data

    if not data:
        return None

    # Create the repository before reporting success, so the message is
    # never logged for a repository that failed to be created
    repo = SoftwareRepository(parser, data_base=self.data_base)
    bot.info(f"{parser.uid} was added to the database.")
    return repo
def yield_taxonomy_annotation_repos(self, username, unseen_only=True, repo=None):
    """Yield the repositories a user should annotate with taxonomy categories.

    Arguments:
      username (str) : the user whose existing annotations are checked.
      unseen_only (bool) : only yield repositories the user has not
        annotated yet. Ignored (treated as False) when a repo is given.
      repo (str) : a single repository to yield instead of the full listing.
    """
    if repo is not None:
        # A specific repository was requested, so it is yielded regardless
        # of whether the user has annotated it before
        candidates = [[get_parser(repo, config=self.config).uid]]
        unseen_only = False
    else:
        candidates = self.list()

    for entry in candidates:
        found = self.get(entry[0])
        if not unseen_only or not found.has_taxonomy_annotation(username):
            yield found
def get(self, uid=None, exact=False):
    """Load a software repository from the filesystem database.

    When no uid is given and exact is False, the uid is derived from the
    path of the most recently modified metadata file under the database.

    Arguments:
      uid (str) : the repository identifier to load (optional).
      exact (bool) : when True, the latest repository is never looked up.

    Raises:
      NoReposError: the latest repository was requested but no metadata
        file (or no uid derived from it) was found.
    """
    wants_latest = not (uid or exact)
    if wants_latest:
        latest = get_latest_modified(self.data_base, pattern="metadata*.json")
        if latest:
            # Reduce the metadata path to the uid: drop the file name and
            # the database root, then the surrounding slashes
            trimmed = latest.replace("metadata.json", "")
            trimmed = trimmed.replace(self.data_base, "")
            uid = trimmed.strip("/")
        if not (uid and latest):
            raise NoReposError

    parser = get_parser(uid, config=self.config)
    return SoftwareRepository(parser, exists=True, data_base=self.data_base)
def create(self, database=None, config_file=None):
    """After a scrape (whether we obtain latest or a search query) we
    run create to create software repositories based on results.

    Each result is read for its "url" and (optionally) its "doi". Results
    whose url has no matching parser are skipped with a warning.

    Arguments:
      database (str) : passed through to the Encyclopedia client.
      config_file (str) : passed through to the Encyclopedia client.
    """
    from rse.main import Encyclopedia

    client = Encyclopedia(config_file=config_file, database=database)
    for result in self.results:
        # Keep what follows the last "//" of the url as the identifier
        uid = result["url"].split("//")[-1]

        # If a repository is added that isn't represented
        try:
            repo = get_parser(uid)
        except NotImplementedError as exc:
            bot.warning(exc)
            continue

        # Add results that don't exist
        if client.exists(repo.uid):
            continue
        client.add(repo.uid)

        # Only label when the result actually carries a doi, so a missing
        # value is not stored as a label
        doi = result.get("doi")
        if doi:
            client.label(repo.uid, key="doi", value=doi)
def add(self, uid):
    """Create a new repo based on a uid that matches to a parser.

    Arguments:
      uid (str) : the identifier of the repository to add.

    Returns:
      the new SoftwareRepository with the parser attached as
      ``repo.parser``, or None if the repository already exists or the
      parser did not provide any metadata.
    """
    from rse.main.database.models import SoftwareRepository

    parser = get_parser(uid, config=self.config)
    if self.exists(parser.uid):
        return None

    data = parser.get_metadata()

    # If it's a parser handoff
    if isinstance(data, ParserBase):
        parser = data
        data = parser.data

        # The parser handed to us may carry a different uid than the one
        # checked above, so check again to avoid adding a duplicate
        if self.exists(parser.uid):
            return None

    if not data:
        return None

    repo = SoftwareRepository(
        uid=parser.uid, parser=parser.name, data=json.dumps(parser.export())
    )
    self.session.add(repo)
    self.session.commit()
    bot.info(f"{parser.uid} was added to the database.")
    repo.parser = parser
    return repo
def get_metadata(self, uri=None, require_repo=True):
    """Retrieve repository metadata. The common metadata (timestamp) is
    added by the software repository parser, and here we need to ensure
    that the url field is populated with a correct url.

    The Zenodo record is queried, and its related identifiers are searched
    for a repository url. When one is found, a parser for that url takes
    over and is returned in place of the metadata.

    Arguments:
      uri (str) : a repository uri string to override one currently set
      require_repo (bool) : require a repository to parse.

    Returns:
      the parser for the related repository (with the doi added to its
      data) when a repository url is found, the Zenodo record data when
      none is found and require_repo is not True, and None when a required
      repository is missing or the query fails.
    """
    from rse.main.parsers import get_parser
    from rse.utils.urls import repository_regex

    # Drop any trailing "$" so the pattern can match within an identifier
    repository_regex = repository_regex.rstrip("$")

    if uri:
        self.set_uri(uri)
    self.load_secrets()

    # Get the record number from the doi
    record = self.uid.split("/")[-1].replace("zenodo.", "")

    # Token isn't required for public entries. When there is one, send it
    # as a query parameter: a JSON body on a GET request is not where the
    # API looks for it
    params = {"access_token": self.token} if self.token else None
    response = requests.get(
        "https://zenodo.org/api/records/%s" % record, params=params
    )

    if response.status_code != 200:
        if response.status_code == 404:
            bot.error(f"Cannot find doi {self.uid}.")
        elif response.status_code in [400, 401, 403]:
            bot.error(f"Permission denied to query {self.uid}")
        else:
            bot.error(
                f"Cannot get doi {self.uid}: {response.status_code}, {response.reason}"
            )
        return None

    # Successful query!
    self.data = response.json()

    # For Zenodo, we require a GitHub or GitLab related identifier to add
    repo_url = None
    for identifier in self.data["metadata"].get("related_identifiers", []):
        match = re.search(repository_regex, identifier["identifier"])
        if match:
            repo_url = "https://%s" % match.group()
            break

    # Hand off to the parser of the repository, keeping the doi with it
    if repo_url is not None:
        doi = self.uid
        handoff = get_parser(repo_url)
        handoff.get_metadata()
        handoff.data["doi"] = doi
        return handoff

    # If we return None, the entry is not added
    if require_repo is True:
        bot.warning("Repository url not found with Zenodo record, skipping add.")
        return None

    return self.data
def exists(self, uid):
    """Report whether the database holds software for a unique identifier.

    Arguments:
      uid (str) : the identifier to check, normalized through its parser.

    Returns:
      the answer of the database's own ``exists`` for the parsed uid.
    """
    resolved = get_parser(uid, config=self.config).uid
    return self.db.exists(resolved)
def summary(self, repo=None):
    """Summarize metrics for the entire database if a repo is not defined,
    or one specific repository.

    Arguments:
      repo (str) : the identifier of a single repository to summarize.

    Returns:
      a dict of metrics with the keys:
        "repos" or "repo" : number of repositories, or the parsed uid.
        "taxonomy-count", "criteria-count" : sizes of the taxonomy and
          criteria listings.
        "criteria" : repo uid -> {"yes": n, "no": n}, the votes summed over
          all criteria of the repository.
        "taxonomy" : repo uid -> {category: annotation count}.
        "users" : username -> number of repositories the user annotated,
          separately for criteria and taxonomy.
        "users-count" : number of distinct users.
    """
    if repo is None:
        repos = self.list()
        metrics = {"repos": len(repos)}
    else:
        parser = get_parser(repo, config=self.config)
        repos = [[parser.uid]]
        metrics = {"repo": parser.uid}

    # Add taxonomy and criteria items
    metrics["taxonomy-count"] = len(self.list_taxonomy())
    metrics["criteria-count"] = len(self.list_criteria())
    metrics["users"] = {}
    metrics["taxonomy"] = {}
    metrics["criteria"] = {}

    # Count annotations for each repository
    for entry in repos:
        parser = get_parser(entry[0], config=self.config)
        record = self.get(parser.uid)
        if not record.criteria and not record.taxonomy:
            continue

        # Add repository to summary metrics
        criteria_votes = {}
        taxonomy_counts = {}
        metrics["criteria"][record.uid] = criteria_votes
        metrics["taxonomy"][record.uid] = taxonomy_counts

        # Sum the votes over all criteria. The tally is created once and
        # then kept, so the votes of earlier criteria are not wiped when
        # the next criterion is processed
        users = set()
        for votes in record.get_criteria().values():
            users.update(votes)
            criteria_votes.setdefault("yes", 0)
            criteria_votes.setdefault("no", 0)
            for vote in votes.values():
                criteria_votes[vote] += 1

        # Update criteria annotations
        for user in users:
            user_counts = metrics["users"].setdefault(
                user, {"criteria-annotations": 0, "taxonomy-annotations": 0}
            )
            user_counts["criteria-annotations"] += 1

        # Derive all users that have annotated the taxonomy
        users = set()
        for username, categories in record.get_taxonomy().items():
            users.add(username)
            for category in categories:
                taxonomy_counts[category] = taxonomy_counts.get(category, 0) + 1

        # Update taxonomy annotations, which were collected but never
        # counted before
        for user in users:
            user_counts = metrics["users"].setdefault(
                user, {"criteria-annotations": 0, "taxonomy-annotations": 0}
            )
            user_counts["taxonomy-annotations"] += 1

        # Don't add empty entries
        if not record.taxonomy and record.uid in metrics["taxonomy"]:
            del metrics["taxonomy"][record.uid]
        if not record.criteria and record.uid in metrics["criteria"]:
            del metrics["criteria"][record.uid]

    # Add unique users
    metrics["users-count"] = len(metrics["users"])
    return metrics