def scrape(self, url, paginate=False, delay=0.0):
    """A shared function to scrape a set of repositories. Since the JoSS
    pages for a search and the base are the same, we can use a shared
    function.
    """
    # Api doesn't appear to have pagination
    response = requests.get(url, headers={"User-Agent": get_user_agent()})
    data = check_response(response)

    for entry in data.get("response", {}).get("docs", []):
        page_url = entry["uri_s"]
        response = requests.get(page_url, headers={"User-Agent": get_user_agent()})

        repo_url = None
        if response.status_code == 200:
            match = re.search(repository_regex, response.text, re.IGNORECASE)
            if match:
                repo_url = match.group()

        if repo_url:
            bot.info("Found repository: %s" % repo_url)
            self.results.append(repo_url)

        time.sleep(delay)

    return self.results
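
# Sketch of the page-matching step above (illustrative only): `repository_regex`
# is assumed to match GitHub/GitLab urls, so a page body containing one yields
# the repository link that gets appended to results. The pattern below is a
# stand-in for demonstration, not the module's actual regex.
import re

example_regex = r"https://(www\.)?(github|gitlab)\.com/[^/\s\"')]+/[^/\s\"')]+"
example_page = '<a href="https://github.com/owner/project">Source code</a>'
match = re.search(example_regex, example_page, re.IGNORECASE)
if match:
    print("Found repository: %s" % match.group())  # https://github.com/owner/project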
def scrape(self, url, paginate=False, delay=None):
    """A shared function to scrape a set of repositories. Since the JoSS
    pages for a search and the base are the same, we can use a shared
    function.
    """
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        sys.exit("BeautifulSoup is required. pip install rse[scraper].")

    # Handle pagination
    while url is not None:
        response = requests.get(url, headers={"User-Agent": get_user_agent()})
        soup = BeautifulSoup(response.text, "html.parser")
        url = None

        for link in soup.find_all("link", href=True):

            # Sleep for a random amount of time to give a rest!
            sleep(delay or random.choice(range(1, 10)) * 0.1)
            paper_url = link.attrs.get("href", "")

            # If we don't have the next page yet
            if link.attrs.get("rel") is not None and url is None and paginate:
                if link.attrs.get("rel")[0] == "next":
                    url = link.attrs.get("href")

            # Retrieve page with paper metadata that we need
            if re.search(
                "https://joss.theoj.org/papers/10.[0-9]{5}/joss.[0-9]{5}",
                paper_url,
            ):
                response = requests.get(
                    paper_url, headers={"User-Agent": get_user_agent()}
                )
                paper_soup = BeautifulSoup(response.text, "html5lib")

                # Find links that we need
                repo = {}
                for link in paper_soup.find_all("a", href=True):
                    if "Software repository" in link.text:
                        repo["url"] = link.attrs.get("href", "")
                    elif "Software archive" in link.text:
                        repo["doi"] = link.attrs.get("href", "")

                if repo.get("url") and repo.get("doi"):
                    bot.info("Found repository: %s" % repo["url"])
                    self.results.append(repo)

    return self.results
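
# Sketch of the pagination step above (illustrative only): JoSS search pages are
# assumed to advertise the next page via a <link rel="next" href=...> tag, which
# is what the loop inspects to keep paginating. BeautifulSoup is the optional
# dependency installed with `pip install rse[scraper]`.
from bs4 import BeautifulSoup

example_html = '<head><link rel="next" href="https://joss.theoj.org/papers?page=2"/></head>'
soup = BeautifulSoup(example_html, "html.parser")
for link in soup.find_all("link", href=True):
    # rel is a multi-valued attribute, so BeautifulSoup returns it as a list
    if link.attrs.get("rel") and link.attrs.get("rel")[0] == "next":
        print("Next page: %s" % link.attrs.get("href"))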
def scrape(self, url, paginate=False, delay=None):
    """A shared function to scrape a set of repositories. Since the JoSS
    pages for a search and the base are the same, we can use a shared
    function.
    """
    # Handle pagination
    original_url = url
    while url is not None:
        response = requests.get(url, headers={"User-Agent": get_user_agent()})
        data = check_response(response)

        # Reset the url to be None
        url = None
        if data.get("next") and paginate:
            url = original_url + "&page=%s" % data.get("next", "").replace(
                "?page=", ""
            )

        for entry in data.get("list", []):

            # Look for GitHub / GitLab URL
            repo = {}
            for link in entry.get("link", []):
                if "Repository" in link["type"] and re.search(
                    repository_regex, link["url"], re.IGNORECASE
                ):
                    repo["url"] = link["url"]

            # If we don't have a repository, search the homepage
            if not repo.get("url") and re.search(
                repository_regex, entry["homepage"]
            ):
                repo["url"] = entry["homepage"]

            # We must have a repository url to parse
            if not repo.get("url"):
                continue

            # Look for a doi
            for pub in entry["publication"]:
                if pub.get("doi"):
                    repo["doi"] = pub.get("doi")

            bot.info("Found repository: %s" % repo["url"])
            self.results.append(repo)

        # Sleep for a random amount of time to give a rest!
        sleep(delay or random.choice(range(1, 10)) * 0.1)

    return self.results
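
# Sketch of the pagination handling above (illustrative only): the API is
# assumed to return a "next" field like "?page=2", which is appended to the
# original query url as "&page=2" for the following request. The url and mock
# response below are placeholders.
original_url = "https://api.example.org/tools/?format=json"  # placeholder API url
data = {"next": "?page=2", "list": []}                       # mock API response
url = None
if data.get("next"):
    url = original_url + "&page=%s" % data.get("next", "").replace("?page=", "")
print(url)  # https://api.example.org/tools/?format=json&page=2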
def scrape(self, url, paginate=False, delay=0.0):
    """A shared function to scrape a set of repositories. Since the JoSS
    pages for a search and the base are the same, we can use a shared
    function.
    """
    response = requests.get(url, headers={"User-Agent": get_user_agent()})
    data = check_response(response) or []

    for entry in data:

        # I only see GitHub urls
        repo_url = entry.get("repositoryURLs", {}).get("github")
        repo_url = repo_url[0] if repo_url else None
        doi = entry.get("conceptDOI")
        doi = doi if doi and "FIXME" not in doi else None

        if repo_url and doi:
            bot.info("Found repository: %s" % repo_url)
            self.results.append({"url": repo_url, "doi": doi})
        elif repo_url:
            bot.info("Found repository: %s" % repo_url)
            self.results.append({"url": repo_url})

        time.sleep(delay)

    return self.results
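
# Sketch of the per-entry parsing above (illustrative only): entries are assumed
# to look roughly like the mock below. A missing or "FIXME" conceptDOI is
# dropped, while the first GitHub url (if any) is still kept.
entry = {
    "repositoryURLs": {"github": ["https://github.com/owner/project"]},
    "conceptDOI": "10.5281/zenodo.0000000",  # placeholder doi
}
repo_url = entry.get("repositoryURLs", {}).get("github")
repo_url = repo_url[0] if repo_url else None
doi = entry.get("conceptDOI")
doi = doi if doi and "FIXME" not in doi else None
print(repo_url, doi)  # https://github.com/owner/project 10.5281/zenodo.0000000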
def export_web_static(export_dir, base_url, client, force=False):
    """Export a running web interface to a folder. If the folder exists,
    the user must use force. This should be run via:

        rse export --type static-web [export_dir]

    If the user manually starts the server, the user needs to do:

        export RSE_DISABLE_ANNOTATE=True

    before the server is started to disable the annotation interface button.
    This will be fixed in a future PR to have an interface that submits an
    issue to do an annotation, but this needs to be developed first.

    Arguments:
     - export_dir (str)      : the path to an export directory
     - base_url (str)        : the base url of the server, including port
     - client (Encyclopedia) : the encyclopedia to use
     - force (bool)          : if directory exists, overwrite
    """
    print(f"Starting export for {base_url}")
    time.sleep(2)

    # Ensure that the server is running
    try:
        if requests.head(base_url).status_code != 200:
            raise requests.exceptions.ConnectionError("Server is not responding.")
    except requests.exceptions.RequestException:
        bot.info(
            "Please export after the server is running: export --type static-web [export_dir]"
        )
        return

    # Output directory cannot already exist unless force is set
    if os.path.exists(export_dir) and not force:
        sys.exit(f"{export_dir} exists, use --force to overwrite.")

    # Create export directory if it doesn't exist
    if not os.path.exists(export_dir):
        os.mkdir(export_dir)

    # Copy static files
    static_files = os.path.join(export_dir, "static")
    if not os.path.exists(static_files):
        shutil.copytree(os.path.join(here, "static"), static_files)

    # Prepare urls (and filepath relative to export_dir) for export
    urls = {base_url: "index.html"}

    # Create static data export
    data = []

    # Add repos and static annotation
    for repo in client.list():
        repo = client.get(repo[0])
        repo_path = os.path.join("repository", repo.uid)
        data.append(
            {
                "uid": repo.uid,
                "url": repo.url,
                "rel": "%s%s" % (RSE_URL_PREFIX, repo_path),
                "avatar": repo.avatar,
                "description": repo.description,
            }
        )

        # Currently don't link to repository page
        # urls["%s%s%s" % (base_url, RSE_URL_PREFIX, repo_path)] = os.path.join(
        #     repo_path, "index.html"
        # )

        # Static annotation endpoints
        for annotation_type in ["criteria", "taxonomy"]:
            urls[
                "%s%s%s/annotate-%s"
                % (base_url, RSE_URL_PREFIX, repo_path, annotation_type)
            ] = os.path.join(
                repo_path, "annotate-%s" % annotation_type, "index.html"
            )

        # Repository API endpoints
        urls["%s%sapi/repos/%s" % (base_url, RSE_URL_PREFIX, repo.uid)] = os.path.join(
            "api", "repos", repo.uid, "index.json"
        )

    # Add API endpoints
    urls["%s%sapi" % (base_url, RSE_URL_PREFIX)] = os.path.join("api", "index.json")
    urls["%s%sapi/repos" % (base_url, RSE_URL_PREFIX)] = os.path.join(
        "api", "repos", "index.json"
    )
    for parser in ["github", "gitlab"]:
        urls["%s%sapi/repos/parser/%s" % (base_url, RSE_URL_PREFIX, parser)] = os.path.join(
            "api", "repos", "parser", parser, "index.json"
        )
    urls["%s%sapi/taxonomy" % (base_url, RSE_URL_PREFIX)] = os.path.join(
        "api", "taxonomy", "index.json"
    )
    urls["%s%sapi/criteria" % (base_url, RSE_URL_PREFIX)] = os.path.join(
        "api", "criteria", "index.json"
    )

    # Add search, criteria, and taxonomy
    for term in ["search", "criteria", "taxonomy"]:
        urls["%s%s%s" % (base_url, RSE_URL_PREFIX, term)] = os.path.join(
            term, "index.html"
        )

    for url, outfile in urls.items():

        # Update the output file to live under the export directory
        outfile = os.path.join(export_dir, outfile)

        # Skip if we've already created it
        if os.path.exists(outfile):
            continue

        # Create nested output folder, if doesn't exist
        out_dir = os.path.dirname(outfile)
        if not os.path.exists(out_dir):
            mkdir_p(out_dir)

        # Url might have a prefix
        response = requests.get(url, headers={"User-Agent": get_user_agent()})
        if response.status_code == 200:
            write_file(outfile, response.text)
        else:
            print(f"Issue parsing {url}")

    print("Generating data export")
    write_json(data, os.path.join(export_dir, "data.json"))
    print("Export is complete!")
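
# Usage sketch (illustrative only): with the interface running locally and the
# annotation buttons disabled (export RSE_DISABLE_ANNOTATE=True before starting
# the server), the export can also be driven from Python rather than the CLI
# command shown in the docstring. The import path and base url below are
# assumptions, not verified here.
if __name__ == "__main__":
    from rse.main import Encyclopedia  # assumed import path for the client

    client = Encyclopedia()
    export_web_static("./docs", "http://127.0.0.1:5000", client, force=True)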