Example #1
    def scrape(self, url, paginate=False, delay=0.0):
        """A shared function to scrape a set of repositories. Since the JoSS
        pages for a search and the base are the same, we can use a shared
        function.
        """
        # The API doesn't appear to have pagination
        response = requests.get(url, headers={"User-Agent": get_user_agent()})
        data = check_response(response)

        for entry in data.get("response", {}).get("docs", []):
            page_url = entry["uri_s"]
            response = requests.get(page_url,
                                    headers={"User-Agent": get_user_agent()})
            repo_url = None
            if response.status_code == 200:
                match = re.search(repository_regex, response.text,
                                  re.IGNORECASE)
                if match:
                    repo_url = match.group()

            if repo_url:
                bot.info("Found repository: %s" % repo_url)
                self.results.append(repo_url)
            time.sleep(delay)

        return self.results
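
The snippet above relies on helpers defined elsewhere in the project: get_user_agent, check_response, repository_regex, and the bot logger. Below is a minimal sketch of what the first three might look like; these are illustrative stand-ins under assumed behavior, not the project's actual definitions.

# Illustrative stand-ins for the helpers the snippet assumes; the real rse
# definitions may differ.
import requests

# Assumption: a regex that roughly matches GitHub / GitLab repository URLs
repository_regex = r"https://(github|gitlab)\.com/[\w.-]+/[\w.-]+"


def get_user_agent():
    """Return a browser-like User-Agent string (illustrative only)."""
    return "Mozilla/5.0 (X11; Linux x86_64) rse-scraper-sketch"


def check_response(response):
    """Return parsed JSON for a 200 response, otherwise None (assumed behavior)."""
    if response.status_code == 200:
        return response.json()
    return None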
Example #2
File: joss.py  Project: rseng/rse
    def scrape(self, url, paginate=False, delay=None):
        """A shared function to scrape a set of repositories. Since the JoSS
        pages for a search and the base are the same, we can use a shared
        function.
        """
        try:
            from bs4 import BeautifulSoup
        except ImportError:
            sys.exit("BeautifulSoup is required. pip install rse[scraper].")

        # Handle pagination
        while url is not None:

            response = requests.get(url,
                                    headers={"User-Agent": get_user_agent()})
            soup = BeautifulSoup(response.text, "html.parser")
            url = None
            for link in soup.find_all("link", href=True):

                # Sleep for a random amount of time to give a rest!
                sleep(delay or random.choice(range(1, 10)) * 0.1)
                paper_url = link.attrs.get("href", "")

                # If we don't have the next page yet
                if link.attrs.get("rel") is not None and url is None and paginate:
                    if link.attrs.get("rel")[0] == "next":
                        url = link.attrs.get("href")

                # Retrieve page with paper metadata that we need
                if re.search(
                        "https://joss.theoj.org/papers/10.[0-9]{5}/joss.[0-9]{5}",
                        paper_url):
                    response = requests.get(
                        paper_url, headers={"User-Agent": get_user_agent()})
                    paper_soup = BeautifulSoup(response.text, "html5lib")

                    # Find links that we need
                    repo = {}
                    for link in paper_soup.find_all("a", href=True):
                        if "Software repository" in link.text:
                            repo["url"] = link.attrs.get("href", "")
                        elif "Software archive" in link.text:
                            repo["doi"] = link.attrs.get("href", "")

                    if repo.get("url") and repo.get("doi"):
                        bot.info("Found repository: %s" % repo["url"])
                        self.results.append(repo)

        return self.results
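
The pagination above depends on BeautifulSoup returning the rel attribute as a list, which is why the code checks link.attrs.get("rel")[0] == "next". A small self-contained sketch of that mechanism, using a hypothetical HTML fragment:

# Demonstration of the <link rel="next"> pagination check; the HTML fragment
# is hypothetical.
from bs4 import BeautifulSoup

html = '<link rel="next" href="https://joss.theoj.org/papers?page=2">'
soup = BeautifulSoup(html, "html.parser")

for link in soup.find_all("link", href=True):
    # BeautifulSoup exposes multi-valued attributes such as rel as lists
    if link.attrs.get("rel") is not None and link.attrs.get("rel")[0] == "next":
        print("Next page:", link.attrs.get("href"))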
Example #3
File: biotools.py  Project: rseng/rse
    def scrape(self, url, paginate=False, delay=None):
        """A shared function to scrape a set of repositories. Since the JoSS
        pages for a search and the base are the same, we can use a shared
        function.
        """
        # Handle pagination
        original_url = url
        while url is not None:

            response = requests.get(url, headers={"User-Agent": get_user_agent()})
            data = check_response(response)

            # Reset the url to be None
            url = None
            if data.get("next") and paginate:
                url = original_url + "&page=%s" % data.get("next", "").replace(
                    "?page=", ""
                )

            for entry in data.get("list", []):

                # Look for GitHub / GitLab URL
                repo = {}
                for link in entry.get("link", []):
                    if "Repository" in link["type"] and re.search(
                        repository_regex, link["url"], re.IGNORECASE
                    ):
                        repo["url"] = link["url"]

                # If we don't have a repository, search the homepage
                if not repo.get("url") and re.search(
                    repository_regex, entry["homepage"]
                ):
                    repo["url"] = entry["homepage"]

                # We must have a repository url to parse
                if not repo.get("url"):
                    continue

                # Look for a doi
                for pub in entry["publication"]:
                    if pub.get("doi"):
                        repo["doi"] = pub.get("doi")

                bot.info("Found repository: %s" % repo["url"])
                self.results.append(repo)

                # Sleep for a random amount of time to give a rest!
                sleep(delay or random.choice(range(1, 10)) * 0.1)

        return self.results
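
The pagination here works differently: the API response carries a "next" field such as "?page=2", and the scraper rewrites it onto the original query URL as "&page=2". A worked example of that string manipulation, with a hypothetical URL and response:

# Worked example of the next-page URL construction above; the URL and the
# "next" value are hypothetical.
original_url = "https://bio.tools/api/tool/?format=json"
data = {"next": "?page=2"}

url = None
if data.get("next"):
    url = original_url + "&page=%s" % data.get("next", "").replace("?page=", "")

print(url)  # https://bio.tools/api/tool/?format=json&page=2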
Example #4
File: rsnl.py  Project: untzag/rse
    def scrape(self, url, paginate=False, delay=0.0):
        """A shared function to scrape a set of repositories. Since the JoSS
           pages for a search and the base are the same, we can use a shared
           function.
        """
        response = requests.get(url, headers={"User-Agent": get_user_agent()})
        data = check_response(response) or []

        for entry in data:

            # I only see GitHub urls
            repo_url = entry.get("repositoryURLs", {}).get("github")
            repo_url = repo_url[0] if repo_url else None
            doi = entry.get("conceptDOI")
            doi = doi if doi and "FIXME" not in doi else None
            if repo_url and doi:
                bot.info("Found repository: %s" % repo_url)
                self.results.append({"url": repo_url, "doi": doi})
            elif repo_url:
                bot.info("Found repository: %s" % repo_url)
                self.results.append({"url": repo_url})
            time.sleep(delay)

        return self.results
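
The fields read above (repositoryURLs with a github list, and conceptDOI) suggest entries shaped roughly like the record below. A sketch of the extraction on such a hypothetical record:

# Hypothetical entry illustrating the fields the snippet reads; the values
# are made up for demonstration.
entry = {
    "repositoryURLs": {"github": ["https://github.com/example/project"]},
    "conceptDOI": "10.5281/zenodo.0000000",
}

repo_url = entry.get("repositoryURLs", {}).get("github")
repo_url = repo_url[0] if repo_url else None
doi = entry.get("conceptDOI")
doi = doi if doi and "FIXME" not in doi else None
print(repo_url, doi)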
Example #5
def export_web_static(export_dir, base_url, client, force=False):
    """Export a running web interface to a folder. If the folder exists, the
    user must use force. This should be run via:
        rse export --type static-web [export_dir]
    If the user manually starts the server, they should run:
        export RSE_DISABLE_ANNOTATE=True
    before starting the server to disable the annotation interface button.
    A future PR will replace this with an interface that submits an issue
    to request an annotation, but that still needs to be developed.

    Arguments:
     - export_dir (str)      : the path to an export directory
     - base_url (str)        : the base url of the server, including port
     - client (Encyclopedia) : the encyclopedia to use
     - force (bool)          : if directory exists, overwrite
    """
    print(f"Starting export for {base_url}")
    time.sleep(2)

    # Ensure that the server is running
    try:
        running = requests.head(base_url).status_code == 200
    except requests.exceptions.RequestException:
        running = False
    if not running:
        bot.info(
            "Please export after the server is running: export --type static-web [export_dir]"
        )
        return

    # Output directory cannot exist if force
    if os.path.exists(export_dir) and not force:
        sys.exit(f"{export_dir} exists, use --force to overwrite.")

    # Create export directory if it doesn't exist
    if not os.path.exists(export_dir):
        os.mkdir(export_dir)

    # Copy static files
    static_files = os.path.join(export_dir, "static")
    if not os.path.exists(static_files):
        shutil.copytree(os.path.join(here, "static"), static_files)

    # Prepare urls (and filepath relative to export_dir) for export
    urls = {base_url: "index.html"}

    # Create static data export
    data = []

    # Add repos and static annotation
    for repo in client.list():
        repo = client.get(repo[0])
        repo_path = os.path.join("repository", repo.uid)
        data.append({
            "uid": repo.uid,
            "url": repo.url,
            "rel": "%s%s" % (RSE_URL_PREFIX, repo_path),
            "avatar": repo.avatar,
            "description": repo.description,
        })
        # Currently don't link to repository page
        # urls["%s%s%s" % (base_url, RSE_URL_PREFIX, repo_path)] = os.path.join(
        #    repo_path, "index.html"
        # )

        # Static annotation endpoints
        for annotation_type in ["criteria", "taxonomy"]:
            urls["%s%s%s/annotate-%s" % (base_url, RSE_URL_PREFIX, repo_path,
                                         annotation_type)] = os.path.join(
                                             repo_path,
                                             "annotate-%s" % annotation_type,
                                             "index.html")

        # Repository API endpoints
        urls["%s%sapi/repos/%s" %
             (base_url, RSE_URL_PREFIX, repo.uid)] = os.path.join(
                 "api", "repos", repo.uid, "index.json")

    # Add API endpoints
    urls["%s%sapi" % (base_url, RSE_URL_PREFIX)] = os.path.join(
        "api", "index.json")
    urls["%s%sapi/repos" % (base_url, RSE_URL_PREFIX)] = os.path.join(
        "api", "repos", "index.json")

    for parser in ["github", "gitlab"]:
        urls["%s%sapi/repos/parser/%s" %
             (base_url, RSE_URL_PREFIX, parser)] = os.path.join(
                 "api", "repos", "parser", parser, "index.json")

    urls["%s%sapi/taxonomy" % (base_url, RSE_URL_PREFIX)] = os.path.join(
        "api", "taxonomy", "index.json")
    urls["%s%sapi/criteria" % (base_url, RSE_URL_PREFIX)] = os.path.join(
        "api", "criteria", "index.json")

    # Add search, criteria, and taxonomy
    for term in ["search", "criteria", "taxonomy"]:
        urls["%s%s%s" % (base_url, RSE_URL_PREFIX, term)] = os.path.join(
            term, "index.html")

    for url, outfile in urls.items():

        # Resolve the output file under the export directory
        outfile = os.path.join(export_dir, outfile)

        # Skip if we've already created it
        if os.path.exists(outfile):
            continue

        # Create nested output folder, if doesn't exist
        out_dir = os.path.dirname(outfile)
        if not os.path.exists(out_dir):
            mkdir_p(out_dir)

        # The URL might have a prefix
        response = requests.get(url, headers={"User-Agent": get_user_agent()})
        if response.status_code == 200:
            write_file(outfile, response.text)
        else:
            print(f"Issue parsing {url}")

    print("Generating data export")
    write_json(data, os.path.join(export_dir, "data.json"))
    print("Export is complete!")