Пример #1
0
    def set_content(self, text):
        """Build ``self.content`` from the labelled lines found in ``text``.

        The ``Package:``, ``Title:``, ``Version:`` and ``Date/Publication:``
        lines of ``text`` are read, together with whatever
        ``self.find_authors(text)`` returns.  Source previews for the title,
        the authors and the publication date are stored in
        ``self.source_preview`` as they are discovered.

        :param text: raw text to scan for the labelled lines.
        """
        pkg_name = find_or_empty_string(r"Package: (.*)", text)
        pkg_title = find_or_empty_string(r"Title: (.*)", text)
        self.source_preview["title"] = build_source_preview(
            self.content_url, text, 'title', pkg_title)

        # The displayed title is "<package>: <title>".
        citation = {"title": "{}: {}".format(pkg_name, pkg_title)}

        authors = self.find_authors(text)
        citation["author"] = authors
        if authors != "":
            self.source_preview["author"] = build_author_source_preview(
                self.content_url, text, 'author', authors)

        # The version note is stored under both keys.
        version_note = "R package version {}".format(
            find_or_empty_string(r"Version: (.*)", text))
        citation["note"] = version_note
        citation["container-title"] = version_note

        release_date = find_or_empty_string(r"Date/Publication: (.*)", text)
        if release_date:
            # Only the first four characters (the year) are kept.
            release_year = release_date[:4]
            citation["year"] = release_year
            citation["issued"] = {"date-parts": [[release_year]]}
            self.source_preview["year"] = build_source_preview(
                self.content_url, text, 'year', release_date)

        citation["URL"] = "https://CRAN.R-project.org/package={}".format(
            pkg_name)
        citation["type"] = "Manual"
        self.content = citation
Пример #2
0
 def set_content_url(self, input):
     """Normalise a CRAN package link and store it in ``self.content_url``.

     Two link shapes are recognised:

     * ``cran.r-project.org/web/packages/<name>``
     * ``cran.r-project.org/package=<name>`` (host compared lower-cased)

     In both cases ``self.content_url`` becomes
     ``https://cran.r-project.org/web/packages/<name>``.  When ``input`` is
     empty or matches neither shape, ``self.content_url`` is left untouched.

     :param input: candidate URL or text containing one (may be falsy).
     """
     if not input:
         return

     if "cran.r-project.org/web/packages" in input:
         package_name = find_or_empty_string(
             r"cran.r-project.org/web/packages/(\w+\.?\w+)/?", input)
     elif "cran.r-project.org/package=" in input.lower():
         # NOTE(review): the membership test lower-cases ``input`` but the
         # pattern is applied to the original string; whether a mixed-case
         # host still matches depends on find_or_empty_string -- confirm.
         package_name = find_or_empty_string(
             r"cran.r-project.org/package=(.*)/?", input)
         # The greedy group can swallow a trailing path; keep the first part.
         package_name = package_name.split("/")[0]
     else:
         return

     self.content_url = "https://cran.r-project.org/web/packages/{}".format(
         package_name)
Пример #3
0
 def set_content(self, input):
     """Fill ``self.content`` with a ``misc`` entry titled after the page.

     The title is taken from the first of ``<title>``, ``<h1>`` or ``<h2>``
     that yields a non-empty match in ``input`` (after it has been passed
     through ``strip_new_lines``).  A source preview for the title is stored
     in ``self.source_preview["title"]``.

     :param input: page markup to search.
     """
     self.content = {}
     page = strip_new_lines(input)

     # Try the candidate tags in priority order; stop at the first hit.
     title_patterns = (
         "<title.*?>(.+?)</title>",
         "<h1>(.+?)</h1>",
         "<h2>(.+?)</h2>",
     )
     for pattern in title_patterns:
         title = find_or_empty_string(pattern, page)
         if title:
             break

     self.content["type"] = "misc"
     # Only surrounding spaces are trimmed for the stored title; the preview
     # below receives the untrimmed match.
     self.content["title"] = title.strip(" ")
     self.content["URL"] = self.content_url
     self.source_preview["title"] = build_source_preview(
         self.content_url, page, 'title', title)
Пример #4
0
    def extract_doi(self, text):
        """Return the first usable DOI found in ``text``, or ``None``.

        If ``text`` starts with ``https://zenodo.org/record/`` it is replaced
        by the result of ``get_webpage_text(text)`` before searching.  The
        search then tries, in order:

        1. the value embedded in a ``zenodo.org/badge/doi/....svg`` link,
        2. a ``10.5281/zenodo.<digits>`` identifier,
        3. any generic ``10.<registrant>/<suffix>`` match.

        Every hit is passed through ``self.strip_junk_from_end_of_doi``
        before being returned.

        :param text: a URL or a block of text to scan.
        :returns: the cleaned DOI, or ``None`` when nothing matched.
        """
        if text.startswith('https://zenodo.org/record/'):
            text = get_webpage_text(text)

        badge_doi = find_or_empty_string(
            r"://zenodo.org/badge/doi/(.+?).svg", text)
        if badge_doi:
            return self.strip_junk_from_end_of_doi(badge_doi)

        zenodo_doi = find_or_empty_string(r"10.5281/zenodo\.\d+", text)
        if zenodo_doi:
            return self.strip_junk_from_end_of_doi(zenodo_doi)

        possible_dois = re.findall(r"10.\d{4,9}/[-._;()/:A-Z0-9+]+", text,
                                   re.IGNORECASE | re.MULTILINE)
        for doi in possible_dois:
            # This identifier matches the DOI shape but is deliberately
            # skipped (it looks like a schema reference, not a citable DOI).
            if "10.5063/schema/codemeta-2.0" in doi.lower():
                continue
            return self.strip_junk_from_end_of_doi(doi)

        return None
Пример #5
0
    def set_content(self, input):
        """Locate a GitHub URL in ``input`` and load that page.

        * If ``input`` does not mention ``github.com`` nothing happens.
        * If ``input`` itself starts with ``http`` it is cut down to its
          first five ``/``-separated pieces
          (e.g. ``https://github.com/owner/repo``).
        * Otherwise the first double-quoted ``http(s)://github.com/...`` link
          inside ``input`` is used, with ``/issues`` and ``/new`` removed.
          Links that contain both ``sphinx`` and ``theme``, or that end in
          ``.zip``, are rejected.

        On success ``self.content`` is set to ``get_webpage_text(url)`` and
        ``self.content_url`` to the URL.

        :param input: a URL or a block of text that may contain one.
        """
        if "github.com" not in input:
            return

        if input.startswith("http"):
            url = "/".join(input.split("/", 5)[0:5])
        else:
            url = find_or_empty_string('"(https?://github.com/.+?)"', input)
            url = url.replace("/issues", "").replace("/new", "")
            if not url:
                return
            # Both substrings must be tested explicitly: the former
            # ``'sphinx' and 'theme' in url`` only ever checked 'theme'.
            is_sphinx_theme = 'sphinx' in url and 'theme' in url
            if is_sphinx_theme or url.endswith('.zip'):
                return

        self.content = get_webpage_text(url)
        self.content_url = url
        self.content_url = url
Пример #6
0
    def set_content(self, input):
        """Locate a Bitbucket repository URL in ``input`` and load its page.

        * If ``input`` does not mention ``bitbucket.org`` nothing happens.
        * If ``input`` itself starts with ``http`` it is used directly.
        * Otherwise the first double-quoted
          ``http(s)://bitbucket.org/<word>/<word>`` link inside ``input`` is
          used; if there is none, nothing happens.

        The chosen link is cut down to its first five ``/``-separated pieces
        and ``/src`` is appended.  ``self.content`` is then set to
        ``get_webpage_text(url)`` and ``self.content_url`` to that URL.

        :param input: a URL or a block of text that may contain one.
        """
        if "bitbucket.org" not in input:
            return

        if input.startswith("http"):
            repo_url = "/".join(input.split("/", 5)[0:5])
        else:
            link = find_or_empty_string(
                r'"(https?:\/\/bitbucket.org\/\w+\/\w+/?)"', input)
            if not link:
                return
            # Drops the optional trailing "/" captured by the pattern.
            repo_url = "/".join(link.split("/")[0:5])

        url = repo_url + '/src'
        self.content = get_webpage_text(url)
        self.content_url = url
Пример #7
0
 def get_repo_api_url(github_url):
     """Translate a GitHub (or gist) page URL into an API URL.

     Steps applied to ``github_url``:

     1. remove ``/wiki`` where it ends a path segment,
     2. strip one trailing ``/``,
     3. drop a leading ``www.`` after the scheme,
     4. for ``gist.github.com`` links return
        ``https://api.github.com/gists/<id>``, where ``<id>`` is the second
        path segment after the host; for every other link replace
        ``github.com/`` with ``api.github.com/repos/``.

     :param github_url: URL of a repository, wiki or gist page.
     :returns: the corresponding API URL as a string.
     """
     import re

     # Remove "/wiki" only when it is not the start of a longer name; a plain
     # substring replace would turn ".../foo/wikitools" into ".../footools".
     repo_api_url = re.sub(r"/wiki(?![\w.-])", "", github_url)
     # strip trailing /
     if repo_api_url.endswith("/"):
         repo_api_url = repo_api_url[:-1]
     # remove www
     repo_api_url = repo_api_url.replace("http://www.", "http://")
     repo_api_url = repo_api_url.replace("https://www.", "https://")
     # switch to API URL
     if "gist.github.com" in repo_api_url:
         gist_id = find_or_empty_string(r"gist.github.com\/\w+\/(\w+|\d+)",
                                        repo_api_url)
         return "https://api.github.com/gists/{}".format(gist_id)
     return repo_api_url.replace("github.com/", "api.github.com/repos/")
Пример #8
0
    def find_authors_method_2(text):
        """Parse an ``Author:`` line whose names may carry bracketed roles.

        Expected format::

            Author: Krzysztof Byrski [aut, cre], Przemyslaw Spurek [ctb]

        Bracketed role tags are removed from the line, which is then split on
        commas into names.  When role tags were found, names are paired with
        them positionally and only names whose tag contains ``aut`` or
        ``cre`` are kept; when no tags were found every name is kept.
        Anything from ``<`` onwards (an e-mail address) is dropped from each
        name before it is passed to ``author_name_as_dict``.

        :param text: text containing the ``Author:`` line.
        :returns: list of ``author_name_as_dict`` results.
        """
        raw_authors = find_or_empty_string("Author: (.*)", text)
        roles = re.findall(r"\[\w+,?\s?\w+\,?\s?\w+]", raw_authors)
        for role in roles:
            raw_authors = raw_authors.replace(role, '')

        names = raw_authors.split(',')
        if roles:
            # zip() pairs names and roles by position and stops at the
            # shorter of the two sequences.
            names = [name for name, role in zip(names, roles)
                     if 'aut' in role or 'cre' in role]

        # remove email addresses and surrounding whitespace
        return [author_name_as_dict(name.split('<')[0].strip())
                for name in names]
Пример #9
0
    def set_content(self, input):
        """Populate ``self.content`` from a JSON metadata document.

        ``input`` is parsed with ``json5.loads``; if the result has a
        ``citation`` key, that nested value is used instead.  The following
        keys of ``self.content`` are then filled in:

        * ``doi`` -- from ``id`` (the number after ``zenodo.org/record/``),
          else from ``identifier`` via ``clean_doi``, else ``None``;
        * ``URL`` / ``repo`` -- ``https://doi.org/<doi>`` when a DOI was
          found, otherwise ``codeRepository`` or ``url`` if present;
        * ``title`` -- ``name``, overridden by ``title`` when both exist;
        * ``author`` -- built from ``author`` (a dict or a list of dicts with
          ``givenName`` / ``familyName``) and ``agents`` (a dict or an
          iterable of dicts with ``name``);
        * ``issued`` -- first four characters of ``dateCreated``;
        * ``version`` -- copied as is;
        * ``type`` -- always ``"software"``.

        :param input: JSON/JSON5 text.
        """
        data = json5.loads(input)
        if "citation" in data:
            data = data["citation"]
        # Bound unconditionally so the clean_doi() call below can never see
        # an unassigned local.
        code_meta_exists = bool(data)
        self.content = {}

        if data.get("id"):
            # NOTE(review): this stores only the captured record number, not
            # a full DOI string -- confirm that is intended.
            self.content["doi"] = find_or_empty_string(
                r"zenodo\.org\/record\/(\d+)", data["id"])
        elif data.get("identifier"):
            self.content["doi"] = clean_doi(data["identifier"],
                                            code_meta_exists)
        else:
            self.content["doi"] = None

        if self.content["doi"]:
            self.content["URL"] = "https://doi.org/{}".format(
                self.content["doi"])
        elif "codeRepository" in data:
            self.content["URL"] = data["codeRepository"]
            self.content["repo"] = data["codeRepository"]
        elif "url" in data:
            self.content["URL"] = data["url"]
            self.content["repo"] = data["url"]

        if "name" in data:
            self.content["title"] = data["name"]

        # "title" wins over "name" when both are present.
        if "title" in data:
            self.content["title"] = data["title"]

        self.content["author"] = []
        raw_authors = data.get("author")
        if isinstance(raw_authors, dict):
            # A single author is handled exactly like a one-element list.
            raw_authors = [raw_authors]
        if isinstance(raw_authors, list):
            for author in raw_authors:
                try:
                    self.content["author"].append(
                        author_name_as_dict('{} {}'.format(
                            author["givenName"], author["familyName"])))
                except UnicodeEncodeError:
                    # Skip names that cannot be formatted.
                    continue

        if "agents" in data:
            if isinstance(data["agents"], dict):
                agents = [data["agents"]]
            else:
                agents = data["agents"]
            for agent in agents:
                # Use the loop variable: indexing data["agents"] directly
                # fails for a list and repeats one name otherwise.
                self.content["author"].append(
                    author_name_as_dict(agent["name"]))

        if "dateCreated" in data:
            self.content["issued"] = {
                "date-parts": [[data["dateCreated"][0:4]]]
            }

        if "version" in data:
            self.content["version"] = data["version"]

        self.content["type"] = "software"
Пример #10
0
 def strip_dependencies(readme_text):
     """Return ``readme_text`` without its ``# Dependencies #`` section.

     The text is first passed through ``strip_new_lines``; whatever the
     pattern captures after a ``# Dependencies #`` heading is then removed
     from that processed text.

     :param readme_text: README contents.
     :returns: the processed text with the captured section removed.
     """
     flattened = strip_new_lines(readme_text)
     dependency_section = find_or_empty_string(
         '# Dependencies #(.+)#?.+#?', flattened)
     return flattened.replace(dependency_section, '')
Пример #11
0
    def set_content(self, citentry_content):
        """Fill ``self.content`` from ``name = "value"`` pairs in a citation.

        Each supported field is looked up as ``<field> = "<value>"`` in
        ``citentry_content``; missing fields end up as whatever
        ``find_or_empty_string`` returns for no match.  ``journal`` is stored
        as ``container-title``, ``url`` as ``URL`` and ``entry`` as ``type``.
        When a year is found, ``issued`` is added as well.  ``author`` is a
        list holding at most one entry, built from the first quoted string
        that follows ``author =``.

        :param citentry_content: text of the citation entry.
        """
        self.content = {}

        def quoted_field(field_name):
            # Value of the first ``field_name = "..."`` occurrence.
            return find_or_empty_string(
                r'{}\s*=\s*"(.*?)"'.format(field_name), citentry_content)

        # (key in self.content, field name in the entry), in output order.
        simple_fields = (
            ("title", "title"),
            ("URL", "url"),
            ("volume", "volume"),
            ("number", "number"),
            ("pages", "pages"),
            ("publisher", "publisher"),
            ("isbn", "isbn"),
            ("container-title", "journal"),
            ("year", "year"),
        )
        for content_key, field_name in simple_fields:
            self.content[content_key] = quoted_field(field_name)

        if self.content["year"]:
            self.content["issued"] = {"date-parts": [[self.content["year"]]]}
        self.content["type"] = quoted_field("entry")

        self.content["author"] = []
        # Looser pattern than the other fields: anything may sit between "="
        # and the first quoted string.
        first_author = find_or_empty_string(r'author\s*=.*?"(.*?)"',
                                            citentry_content)
        if first_author:
            self.content["author"].append(author_name_as_dict(first_author))