def set_content(self, text):
    """Build citation metadata from the text of a CRAN package description.

    Reads the ``Package``, ``Title``, ``Version`` and ``Date/Publication``
    fields out of ``text``, stores the assembled metadata dict on
    ``self.content`` and records source previews for the title, the authors
    and the year on ``self.source_preview``.
    """
    package_name = find_or_empty_string(r"Package: (.*)", text)
    package_title = find_or_empty_string(r"Title: (.*)", text)
    self.source_preview["title"] = build_source_preview(
        self.content_url, text, 'title', package_title)

    citation = {"title": "{}: {}".format(package_name, package_title)}

    # Authors come from this object's own find_authors(); an empty string
    # means nothing was found, so no author preview is recorded.
    found_authors = self.find_authors(text)
    citation["author"] = found_authors
    if found_authors != "":
        self.source_preview["author"] = build_author_source_preview(
            self.content_url, text, 'author', found_authors)

    # The same "R package version X" label is stored under both keys.
    release = find_or_empty_string(r"Version: (.*)", text)
    version_label = "R package version {}".format(release)
    citation["note"] = version_label
    citation["container-title"] = version_label

    publication_stamp = find_or_empty_string(r"Date/Publication: (.*)", text)
    if publication_stamp:
        # The year is the first four characters of the publication stamp.
        publication_year = publication_stamp[0:4]
        citation["year"] = publication_year
        citation["issued"] = {"date-parts": [[publication_year]]}
        self.source_preview["year"] = build_source_preview(
            self.content_url, text, 'year', publication_stamp)

    citation["URL"] = "https://CRAN.R-project.org/package={}".format(
        package_name)
    citation["type"] = "Manual"

    self.content = citation
def set_content_url(self, input):
    """Normalise a CRAN link to the package's web page URL.

    Two link shapes are recognised (host matched case-insensitively):

    * ``cran.r-project.org/web/packages/<name>[/...]``
    * ``cran.r-project.org/package=<name>``

    On a match ``self.content_url`` is set to
    ``https://cran.r-project.org/web/packages/<name>``.  For any other input
    (including ``None`` or an empty string) ``self.content_url`` is left
    untouched.

    The patterns are raw strings so that ``\\w`` and ``\\.`` reach the regex
    engine without relying on Python's handling of unknown string escapes.
    They carry an inline ``(?i)`` flag so the regex accepts the same host
    casing as the guards; the package name itself is captured verbatim.
    """
    if not input:
        return

    lowered = input.lower()
    if "cran.r-project.org/web/packages" in lowered:
        # A package name is one or more dot-separated word chunks, which
        # keeps multi-dot names such as "ROI.plugin.glpk" intact.
        package_name = find_or_empty_string(
            r"(?i)cran\.r-project\.org/web/packages/(\w+(?:\.\w+)*)/?", input)
    elif "cran.r-project.org/package=" in lowered:
        package_name = find_or_empty_string(
            r"(?i)cran\.r-project\.org/package=(.*)/?", input)
        # The greedy capture may include a trailing path; keep the name only.
        package_name = package_name.split("/")[0]
    else:
        return

    self.content_url = "https://cran.r-project.org/web/packages/{}".format(
        package_name)
def set_content(self, input):
    """Derive a minimal "misc" citation from a web page's markup.

    The title is taken from the first of ``<title>``, ``<h1>`` or ``<h2>``
    that yields a non-empty match.  ``self.content`` receives the type, the
    space-trimmed title and ``self.content_url``; a source preview for the
    untrimmed title is stored on ``self.source_preview``.
    """
    self.content = {}
    markup = strip_new_lines(input)

    # Try the candidate tags in priority order and stop at the first hit.
    page_title = ""
    for tag_pattern in ("<title.*?>(.+?)</title>",
                        "<h1>(.+?)</h1>",
                        "<h2>(.+?)</h2>"):
        page_title = find_or_empty_string(tag_pattern, markup)
        if page_title:
            break

    self.content["type"] = "misc"
    # Only plain spaces are trimmed, not other whitespace.
    self.content["title"] = page_title.strip(" ")
    self.content["URL"] = self.content_url

    self.source_preview["title"] = build_source_preview(
        self.content_url, markup, 'title', page_title)
def extract_doi(self, text):
    """Return the first usable DOI found in ``text``, or ``None``.

    If ``text`` is itself a ``https://zenodo.org/record/`` URL, the page is
    fetched with ``get_webpage_text`` and searched instead.  Candidates are
    tried in this order:

    1. the DOI embedded in a Zenodo badge URL (``.../badge/doi/<doi>.svg``),
    2. a Zenodo DOI (``10.5281/zenodo.<digits>``),
    3. any DOI-shaped token, skipping the CodeMeta schema identifier
       ``10.5063/schema/codemeta-2.0``.

    Every hit is passed through ``self.strip_junk_from_end_of_doi`` before
    being returned.
    """
    if text.startswith('https://zenodo.org/record/'):
        text = get_webpage_text(text)

    # Raw strings with escaped dots: "10." must be a literal dot, otherwise
    # digit runs such as "1012345/x" are mistaken for DOIs.
    badge_doi = find_or_empty_string(
        r"://zenodo\.org/badge/doi/(.+?)\.svg", text)
    if badge_doi:
        return self.strip_junk_from_end_of_doi(badge_doi)

    zenodo_doi = find_or_empty_string(r"10\.5281/zenodo\.\d+", text)
    if zenodo_doi:
        return self.strip_junk_from_end_of_doi(zenodo_doi)

    possible_dois = re.findall(r"10\.\d{4,9}/[-._;()/:A-Z0-9+]+", text,
                               re.IGNORECASE | re.MULTILINE)
    for doi in possible_dois:
        # The CodeMeta schema URL looks like a DOI but is not a citation.
        if "10.5063/schema/codemeta-2.0" in doi.lower():
            continue
        return self.strip_junk_from_end_of_doi(doi)

    return None
def set_content(self, input):
    """Fetch the GitHub repository page referenced by ``input``.

    ``input`` is either a URL (reduced to ``scheme://host/owner/repo``) or a
    blob of text from which the first double-quoted GitHub link is taken.
    ``/issues`` and ``/new`` path segments are removed.  Sphinx theme
    repositories and ``.zip`` links are ignored.  On success the page text
    is stored on ``self.content`` and the URL on ``self.content_url``;
    otherwise both are left untouched.
    """
    import re

    if "github.com" not in input:
        return

    if input.startswith("http"):
        # Keep only the first five "/"-separated parts: scheme, empty,
        # host, owner and repository.
        url = "/".join(input.split("/", 5)[0:5])
    else:
        url = find_or_empty_string(r'"(https?://github\.com/.+?)"', input)

    # Remove "issues"/"new" only as whole path segments.  A plain substring
    # replace also corrupts names that merely start with those words
    # (e.g. ".../newrelic/..." would lose "/new").
    url = re.sub(r"/(?:issues|new)(?=[/?#]|$)", "", url)

    # Both words must be present to count as a Sphinx theme.  The previous
    # "'sphinx' and 'theme' in url" only ever tested for "theme".
    is_sphinx_theme = 'sphinx' in url and 'theme' in url
    if is_sphinx_theme or url.endswith('.zip'):
        url = None

    if not url:
        return

    self.content = get_webpage_text(url)
    self.content_url = url
def set_content(self, input):
    """Fetch the Bitbucket source page referenced by ``input``.

    ``input`` is either a URL or a blob of text from which the first
    double-quoted ``bitbucket.org/<workspace>/<repo>`` link is taken.  The
    link is reduced to ``scheme://host/workspace/repo`` and ``/src`` is
    appended.  The fetched page text is stored on ``self.content`` and the
    URL on ``self.content_url``.  Nothing is set when ``input`` does not
    mention bitbucket.org or no link can be extracted.
    """
    if "bitbucket.org" not in input:
        return

    if input.startswith("http"):
        url = input
    else:
        # Raw string: no reliance on unknown "\/" escapes.  Hyphens and dots
        # are accepted so names like "my-repo" are recognised.
        url = find_or_empty_string(
            r'"(https?://bitbucket\.org/[\w.-]+/[\w.-]+/?)"', input)
        if not url:
            return

    # Keep scheme, empty part, host, workspace and repo, then point at the
    # repository's source listing.
    url = "/".join(url.split("/")[0:5]) + '/src'

    self.content = get_webpage_text(url)
    self.content_url = url
def get_repo_api_url(github_url):
    """Translate a GitHub (or gist) web URL into its REST API URL.

    Steps, in order: drop ``wiki`` path segments, strip one trailing slash,
    drop a leading ``www.`` host prefix, then

    * for ``gist.github.com`` URLs return
      ``https://api.github.com/gists/<gist_id>``;
    * otherwise rewrite ``github.com/`` to ``api.github.com/repos/``.
    """
    import re

    # Remove "wiki" only as a whole path segment.  A plain substring replace
    # of "/wiki" also corrupts owners or repositories whose name merely
    # starts with "wiki" (e.g. "github.com/wikimedia/...").
    repo_api_url = re.sub(r"/wiki(?=[/?#]|$)", "", github_url)

    # strip trailing /
    if repo_api_url.endswith("/"):
        repo_api_url = repo_api_url[:-1]

    # remove www
    repo_api_url = repo_api_url.replace("http://www.", "http://")
    repo_api_url = repo_api_url.replace("https://www.", "https://")

    # switch to API URL
    if "gist.github.com" in repo_api_url:
        # Raw string; "\d" is already covered by "\w".
        gist_id = find_or_empty_string(r"gist\.github\.com/\w+/(\w+)",
                                       repo_api_url)
        repo_api_url = "https://api.github.com/gists/{}".format(gist_id)
    else:
        repo_api_url = repo_api_url.replace("github.com/",
                                            "api.github.com/repos/")
    return repo_api_url
def find_authors_method_2(text):
    """Parse authors from an ``Author:`` line that may carry role tags.

    Expected format:
    'Author: Krzysztof Byrski [aut, cre], Przemyslaw Spurek [ctb]'

    Bracketed role tags are removed and the rest is split on commas.  When
    role tags are present, names and tags are paired by position and only
    names whose tag contains ``aut`` or ``cre`` are kept.  Without tags
    every name is kept.  E-mail addresses (anything from ``<`` onward) are
    dropped, and names that end up empty are skipped, so a missing
    ``Author:`` line yields an empty list.

    Returns a list of ``author_name_as_dict(name)`` results.
    """
    raw_authors = find_or_empty_string("Author: (.*)", text)

    # Raw string so the regex escapes are not parsed as string escapes.
    roles = re.findall(r"\[\w+,?\s?\w+,?\s?\w+]", raw_authors)
    for role in roles:
        raw_authors = raw_authors.replace(role, '')
    names = raw_authors.split(',')

    if roles:
        # Pair by position first so the name/role alignment is unchanged.
        credited = [name for name, role in zip(names, roles)
                    if 'aut' in role or 'cre' in role]
    else:
        credited = names

    authors = []
    for name in credited:
        name = name.split('<')[0].strip()  # remove email addresses
        if name:
            authors.append(author_name_as_dict(name))
    return authors
def set_content(self, input):
    """Populate ``self.content`` from a CodeMeta-style JSON document.

    ``input`` is parsed with ``json5``; when the document has a
    ``citation`` key, that value is used as the data.  Fields written to
    ``self.content``:

    * ``doi``: from ``id`` (digits after ``zenodo.org/record/``), else from
      ``identifier`` via ``clean_doi``, else ``None``;
    * ``URL`` / ``repo``: the doi.org link when a DOI was found, otherwise
      ``codeRepository`` or ``url``;
    * ``title``: ``name``, overridden by ``title`` when both exist;
    * ``author``: built from ``author`` (a dict or a list of dicts with
      ``givenName``/``familyName``) and ``agents`` (a dict or a list of
      dicts with ``name``);
    * ``issued``: the first four characters of ``dateCreated``;
    * ``version`` and a fixed ``type`` of ``"software"``.
    """
    data = json5.loads(input)
    if "citation" in data:
        data = data["citation"]

    # Always bound, so the name cannot be undefined further down.
    code_meta_exists = bool(data)
    self.content = {}

    if data.get("id"):
        self.content["doi"] = find_or_empty_string(
            r"zenodo\.org/record/(\d+)", data["id"])
    elif data.get("identifier"):
        self.content["doi"] = clean_doi(data["identifier"], code_meta_exists)
    else:
        self.content["doi"] = None

    if self.content["doi"]:
        self.content["URL"] = "https://doi.org/{}".format(self.content["doi"])
    elif "codeRepository" in data:
        self.content["URL"] = data["codeRepository"]
        self.content["repo"] = data["codeRepository"]
    elif "url" in data:
        self.content["URL"] = data["url"]
        self.content["repo"] = data["url"]

    if "name" in data:
        self.content["title"] = data["name"]
    if "title" in data:
        self.content["title"] = data["title"]

    self.content["author"] = []
    author_field = data.get("author")
    if isinstance(author_field, dict):
        self.content["author"].append(
            author_name_as_dict('{} {}'.format(author_field["givenName"],
                                               author_field["familyName"])))
    elif isinstance(author_field, list):
        for author in author_field:
            try:
                self.content["author"].append(
                    author_name_as_dict('{} {}'.format(
                        author["givenName"], author["familyName"])))
            except UnicodeEncodeError:
                # Best effort: skip an author whose name cannot be encoded.
                continue

    if "agents" in data:
        agents = data["agents"]
        if isinstance(agents, dict):
            agents = [agents]
        for agent in agents:
            # Read the name from the current agent.  Indexing
            # data["agents"]["name"] fails when "agents" is a list.
            self.content["author"].append(author_name_as_dict(agent["name"]))

    if "dateCreated" in data:
        self.content["issued"] = {
            "date-parts": [[data["dateCreated"][0:4]]]
        }

    if "version" in data:
        self.content["version"] = data["version"]

    self.content["type"] = "software"
def strip_dependencies(readme_text):
    """Return the README text without its '# Dependencies #' section body.

    The text is first passed through ``strip_new_lines``.  Whatever the
    dependencies pattern captures after the heading is then removed from it;
    when nothing is captured the text comes back otherwise unchanged.
    """
    flattened = strip_new_lines(readme_text)
    dependency_body = find_or_empty_string('# Dependencies #(.+)#?.+#?',
                                           flattened)
    return flattened.replace(dependency_body, '')
def set_content(self, citentry_content):
    """Populate ``self.content`` from the text of an R ``citEntry`` call.

    Each ``name = "value"`` argument listed below is looked up with
    ``find_or_empty_string`` and stored under its citation key.  ``issued``
    is added only when a year was found, ``type`` comes from the ``entry``
    argument, and ``author`` holds at most one entry built from the first
    double-quoted string after ``author =``.
    """
    # (citation key, citEntry argument name), in the order the keys are set.
    quoted_fields = (
        ("title", "title"),
        ("URL", "url"),
        ("volume", "volume"),
        ("number", "number"),
        ("pages", "pages"),
        ("publisher", "publisher"),
        ("isbn", "isbn"),
        ("container-title", "journal"),
        ("year", "year"),
    )

    def quoted_value(argument):
        # Raw pattern, so "\s" is not parsed as a string escape.  The
        # leading \b stops "title" from matching inside "booktitle".
        pattern = r'\b{}\s*=\s*"(.*?)"'.format(argument)
        return find_or_empty_string(pattern, citentry_content)

    self.content = {}
    for citation_key, argument in quoted_fields:
        self.content[citation_key] = quoted_value(argument)

    if self.content["year"]:
        self.content["issued"] = {"date-parts": [[self.content["year"]]]}

    self.content["type"] = quoted_value("entry")

    self.content["author"] = []
    # The author argument is usually wrapped in a call, so anything may sit
    # between "=" and the first quoted name.
    first_author = find_or_empty_string(r'\bauthor\s*=.*?"(.*?)"',
                                        citentry_content)
    if first_author:
        self.content["author"].append(author_name_as_dict(first_author))