def set_content(self, github_main_page_text):
    # look for a link to a CITATION file in the repo root
    matches = re.findall("href=\"(.*blob/.*/citation.*?)\"", github_main_page_text, re.IGNORECASE)
    if not matches:
        # R packages keep their CITATION file under /inst, so check that directory too
        matches = re.findall("href=\"(.*/inst)\"", github_main_page_text, re.IGNORECASE)
        if matches:
            inst_url = "http://github.com{}".format(matches[0])
            r = requests.get(inst_url)
            inst_page_text = r.text
            matches = re.findall("href=\"(.*blob/.*/citation.*?)\"", inst_page_text, re.IGNORECASE)

    if matches:
        filename_part = matches[0]
        filename_part = filename_part.replace("/blob", "")
        filename_part = filename_part.replace("https://github.com", "")
        filename_part = filename_part.replace("http://github.com", "")
        filename = "https://raw.githubusercontent.com{}".format(filename_part)

        # check if symlink
        decoded_content = self.get_symlink_content(matches)
        if decoded_content:
            self.content = decoded_content
        else:
            self.content = get_webpage_text(filename)
        self.content_url = filename
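# Illustrative sketch, separate from the method above: the href regex pulls the blob
# path of a CITATION file out of the repo page HTML, and the string replacements map
# it onto raw.githubusercontent.com. The sample HTML and repo name are made up for
# demonstration only.
import re

sample_html = '<a href="/exampleorg/examplerepo/blob/master/CITATION">CITATION</a>'
blob_path = re.findall("href=\"(.*blob/.*/citation.*?)\"", sample_html, re.IGNORECASE)[0]
raw_url = "https://raw.githubusercontent.com{}".format(blob_path.replace("/blob", ""))
# raw_url == "https://raw.githubusercontent.com/exampleorg/examplerepo/master/CITATION"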
def set_content(self, input):
    if self.content_url.startswith(("http://", "https://")):
        relation_link = self.check_for_rel_cite_as_header(self.content_url)
        if relation_link:
            self.content_url = relation_link
            if 'doi.org' in relation_link:
                self.content = 'found'
            else:
                return get_webpage_text(relation_link)
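# Hedged sketch of what check_for_rel_cite_as_header (defined elsewhere in this
# project) is presumed to do: fetch the URL and look for a Link header with
# rel="cite-as". The helper name and return shape below are assumptions for
# illustration, not the project's actual implementation.
import requests

def sketch_check_for_rel_cite_as_header(url):
    links = requests.get(url).links  # requests parses the Link header into a dict keyed by rel
    return links.get("cite-as", {}).get("url")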
def set_content(self, bitbucket_main_page_text):
    matches = re.findall('href=\"(.*\/readme.*?\?.*)\"', bitbucket_main_page_text, re.IGNORECASE)
    if matches:
        filename_part = matches[0]
        filename = get_raw_bitbucket_url(filename_part)
        self.content = get_webpage_text(filename)
        self.content_url = filename
def set_content(self, github_main_page_text):
    matches = re.findall("href=\"(.*blob/.*/description.*?)\"", github_main_page_text, re.IGNORECASE)
    if matches:
        filename_part = matches[0]
        filename_part = filename_part.replace("/blob", "")
        filename = "https://raw.githubusercontent.com{}".format(filename_part)
        self.content = get_webpage_text(filename)
        self.content_url = filename
def set_content(self, input):
    self.set_content_url(input)
    if self.content_url:
        page = get_webpage_text(self.content_url)
        # strip the page header: it contains PyPI-specific markup rather than content about
        # the library itself, and it makes it harder to pull GitHub links for the library.
        # See for example https://pypi.python.org/pypi/executor
        if '<div id="content-body">' in page:
            page = page.split('<div id="content-body">')[1]
        self.content = page
def set_content(self, bitbucket_main_page_text):
    found_match = False
    matches = re.findall('href=\"(.*\/description.*?)\"', bitbucket_main_page_text, re.IGNORECASE)
    if matches:
        filename_part = matches[0]
        filename = get_raw_bitbucket_url(filename_part)
        self.content = get_webpage_text(filename)
        self.content_url = filename
def set_content(self, input):
    if "github.com" not in input:
        return
    if input.startswith("http"):
        url = "/".join(input.split("/", 5)[0:5])
    else:
        url = find_or_empty_string('\"(https?://github.com/.+?)\"', input)
        url = url.replace("/issues", "")
        url = url.replace("/new", "")
        # skip sphinx theme repos and direct zip downloads
        if ('sphinx' in url and 'theme' in url) or url.endswith('.zip'):
            url = None
    if not url:
        return
    self.content = get_webpage_text(url)
    self.content_url = url
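# Illustrative sketch (made-up URL): "/".join(input.split("/", 5)[0:5]) trims a deep
# GitHub link down to its owner/repo root before the page is fetched.
deep_link = "https://github.com/exampleorg/examplerepo/blob/master/setup.py"
repo_root = "/".join(deep_link.split("/", 5)[0:5])
# repo_root == "https://github.com/exampleorg/examplerepo"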
def set_content(self, input):
    if "bitbucket.org" not in input:
        return
    if input.startswith("http"):
        url = "/".join(input.split("/", 5)[0:5])
        url = url + '/src'
    else:
        url = find_or_empty_string('"(https?:\/\/bitbucket.org\/\w+\/\w+/?)"', input)
        if not url:
            return
        url = "/".join(url.split("/")[0:5])
        url = url + '/src'
    self.content = get_webpage_text(url)
    self.content_url = url
def extract_doi(self, text):
    if text.startswith('https://zenodo.org/record/'):
        text = get_webpage_text(text)

    # a Zenodo badge DOI takes priority
    badge_doi = find_or_empty_string("://zenodo.org/badge/doi/(.+?).svg", text)
    if badge_doi:
        return self.strip_junk_from_end_of_doi(badge_doi)

    zenodo_doi = find_or_empty_string("10.5281/zenodo\.\d+", text)
    if zenodo_doi:
        return self.strip_junk_from_end_of_doi(zenodo_doi)

    # fall back to the first DOI-shaped string that isn't the codemeta schema DOI
    possible_dois = re.findall("10.\d{4,9}/[-._;()/:A-Z0-9+]+", text, re.IGNORECASE | re.MULTILINE)
    for doi in possible_dois:
        if "10.5063/schema/codemeta-2.0" in doi.lower():
            continue
        return self.strip_junk_from_end_of_doi(doi)
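# Illustrative sketch (made-up text): the fallback pattern above picks DOI-shaped
# strings out of free text; strip_junk_from_end_of_doi (defined elsewhere in the
# class) is what trims trailing punctuation such as the final period here.
import re

sample = "Please cite this software as doi:10.5281/zenodo.1234567."
candidates = re.findall("10.\d{4,9}/[-._;()/:A-Z0-9+]+", sample, re.IGNORECASE | re.MULTILINE)
# candidates == ['10.5281/zenodo.1234567.']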
def set_content(self, input):
    filename = self.parent_content_url + '/DESCRIPTION'
    page = get_webpage_text(filename)
    self.content = page
    self.content_url = filename
def set_content(self, input):
    if self.content_url:
        self.content = get_webpage_text(self.content_url)