def get_latest_from_rss():
    """
        Return list of (name, version, released_on) of the latest
        versions of packages added to the index.
    """
    # NB: limit=50 is hardcoded upstream
    rss = fetch_page("http://registry.npmjs.org/-/rss?descending=true&limit=50", decode=False)
    soup = BeautifulSoup(rss)
    result = []
    for item in soup.findAll("item"):
        try:
            pub_date = item.pubdate.text
            (name, version) = item.title.text.split("@")
            # NB: <pubDate>2012-06-27T20:29:22.550Z</pubDate>
            # Python doesn't have a format string for milliseconds so strip them out
            released_on = datetime.strptime(pub_date.split('.')[0], '%Y-%m-%dT%H:%M:%S')
            result.append((name, version, released_on))
        except:
            continue
    return result

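# A minimal sketch of the millisecond stripping above, using the pubDate
# value from the comment (pure string manipulation, no network access):
def _example_parse_npm_pubdate():
    from datetime import datetime
    pub_date = '2012-06-27T20:29:22.550Z'
    print datetime.strptime(pub_date.split('.')[0], '%Y-%m-%dT%H:%M:%S')
    # prints: 2012-06-27 20:29:22
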
def get_tags(url, user_repo=None, extended=False, last_modified=None):
    """
        Return all GitHub tags for this package.

        @url - string - either git checkout url or http url for the GitHub page
    """
    if user_repo is None:
        user_repo = _get_user_repo(url)

    if not user_repo:
        raise Exception("GitHub - get_tags - can't find repository for %s" % url)

    api_url = 'https://api.github.com/repos/%s/tags'
    if settings_imported:
        api_url += '?client_id=%s&client_secret=%s' % (GITHUB_APP_ID, GITHUB_API_SECRET)

    json_data = fetch_page(api_url % user_repo, last_modified=last_modified)
    if json_data is None:
        return None

    data = json.loads(json_data)

    if extended:
        return data
    else:
        try:
            result = {}
            for tag in data:
                result[tag['name']] = tag['commit']['sha']
            return result
        except:
            # in case the GitHub API limit was reached the indexing above will fail
            return {}

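# A minimal usage sketch for get_tags(); the repository URL is only an
# example, and network access plus a working _get_user_repo() are assumed:
def _example_get_tags():
    tags = get_tags('https://github.com/boto/boto')
    # tags maps tag names to commit SHAs; None means not modified,
    # {} means the GitHub API limit was reached
    if tags:
        for name, sha in tags.items():
            print "%s -> %s" % (name, sha)
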
def search_tvshow(self, search_query):
    logging.info('Searching tvshow for query: {}'.format(search_query))
    search_url = "/search/tv-shows/" + quote_plus(search_query)
    search_page = fetch_page(self.site_url, extra_path=search_url)
    pq = PyQuery(search_page)
    dom_search_list = pq(u".list_item")
    tvshow_list = []
    for dom_item in dom_search_list:
        title = pq(dom_item).find('img[border="0"]').show().attr('alt')
        href = pq(dom_item).find('a.panel').attr('href')
        url = urljoin(self.site_url, href)
        category = Media.TVSHOW
        # Since it is a tvshow we need to fetch the children episodes
        tvshow = Media(title=title, url=url, category=category, has_children=True)
        # set description
        desc = pq(dom_item).find('.plot').text()
        tvshow.description = re.sub(r'\s', ' ', desc)  # remove newlines from description
        # set rating
        tvshow.rating = pq(dom_item).find('span.rank_value').text()
        # set thumbnail url
        href_thumbnail = pq(dom_item).find('img[border="0"]').show().attr('src')
        tvshow.thumbnail = urljoin(self.site_url, href_thumbnail)
        tvshow_list.append(tvshow)
    return tvshow_list

def get_url(package, version=None):
    """
        Return homepage, repo, bugtracker URLs for a package
    """
    urls = {
        'homepage'   : '',
        'repository' : '',
        'bugtracker' : '',
    }

    json_data = fetch_page('https://rubygems.org/api/v1/gems/%s.json' % package)
    # json_data = json_data.decode('UTF-8')
    data = json.loads(json_data)

    if data.has_key('homepage_uri'):
        urls['homepage'] = data['homepage_uri']
    else:
        urls['homepage'] = data['project_uri']

    if data.has_key('bug_tracker_uri') and data['bug_tracker_uri']:
        urls['bugtracker'] = data['bug_tracker_uri']

    if data.has_key('source_code_uri') and data['source_code_uri']:
        urls['repository'] = data['source_code_uri']

    return urls

def get_latest_from_rss():
    """
        @return - list of (name, version, released_on)
    """
    data = fetch_page("https://rubygems.org/api/v1/activity/just_updated.json")
    latest = json.loads(data)

    result = []
    for gem in latest:
        # NB: not implemented upstream,
        # see https://github.com/rubygems/rubygems.org/issues/536
        # if gem['prerelease']:
        #     continue  # don't add prerelease software

        (latest_ver, released_on) = get_latest(gem['name'])
        # todo: this JSON gives more info like GitHub URLs - import from here
        # and kill some messages
        if latest_ver == gem['version']:
            # RubyGems.org doesn't provide the date of release
            result.append((gem['name'], gem['version'], released_on))
    return result

def get_latest(package, last_checked=None):
    """
        Get the latest version of a package
    """
    package = _other_name(package)
    package = package.replace('::', '-')

    json_data = fetch_page('http://api.metacpan.org/v0/release/%s' % package, last_modified=last_checked)
    if json_data is None:  # NB: empty string is not None but will fail the check
        return 304, 304

    data = json.loads(json_data)

    # don't process DEV/TRIAL packages -
    # devel versions for some packages like Moose
    # mix up with regular versions
    if data['maturity'] != "released":
        return None, None

    version = None
    if data.has_key('version'):
        version = data['version']
    elif data.has_key('version_numified'):
        version = data['version_numified']

    released_on = get_release_date(package, version, data)
    return version, released_on

def get_latest_from_rss():
    """
        @return - list of (name, version, released_on)
    """
    rss = fetch_page("https://metacpan.org/feed/recent?f=l")  # filter=latest
    dom = parseString(rss)
    result = []
    for item in dom.getElementsByTagName("item"):
        try:
            # titles are in the form Dist-Zilla-Plugin-Test-PodSpelling-2.002005
            title_parts = item.getElementsByTagName("title")[0].firstChild.wholeText.split("-")
            version = title_parts[-1]  # version is always the last component

            # skip DEV versions
            # todo: better match, this is of the form MAJOR.MINOR_DEV
            if version.find('_') > -1:
                continue

            name = '::'.join(title_parts[:-1])
            pub_date = item.getElementsByTagName("dc:date")[0]
            released_on = datetime.strptime(pub_date.firstChild.wholeText, '%Y-%m-%dT%H:%M:%SZ')
            result.append((name, version, released_on))
        except:
            continue
    return result

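# A sketch of the title splitting above, using the example title from the
# comment (pure string manipulation, no network access):
def _example_split_cpan_title():
    title_parts = 'Dist-Zilla-Plugin-Test-PodSpelling-2.002005'.split('-')
    print title_parts[-1]              # 2.002005
    print '::'.join(title_parts[:-1])  # Dist::Zilla::Plugin::Test::PodSpelling
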
def get_download_url(package, version, data=None):
    """
        Return URL to download the package.
    """
    # Twisted packages are many and follow a pattern but are not hosted on PyPI
    if package in _twisted_mappings.keys():
        sub_package = package.split('-')[1]
        main_ver = '.'.join(version.split('.')[:2])
        return 'http://twistedmatrix.com/Releases/%s/%s/Twisted%s-%s.tar.bz2' % (sub_package, main_ver, sub_package, version)

    pkg_name = _other_name(package)

    if not data:
        data = fetch_page("https://pypi.python.org/pypi/%s/%s/json" % (pkg_name, version))
        data = json.loads(data)

    if data.has_key('urls'):
        for file in data['urls']:
            # consider only source packages
            if file['packagetype'] == 'sdist':
                return file['url']

    if data.has_key('info') and data['info'].has_key('download_url'):
        url = data['info']['download_url']
        for ext in SUPPORTED_ARCHIVES:
            if url.endswith(ext):
                return url

    return None

def get_latest(package, last_checked=None):
    """
        Get the latest version of a package

        @return - version, released_on
    """
    pkg_name = _other_name(package)

    # fetch JSON using only the package name. Will fetch the latest version
    versions = fetch_page("https://pypi.python.org/pypi/%s/json" % pkg_name, last_modified=last_checked)
    if versions is None:  # NB: empty string is not None but will fail the check
        return 304, 304

    versions = json.loads(versions)

    if not versions:
        logger.error("Can't find latest version - %s" % package)
        return "", None

    latest_ver = versions['info']['version']
    release_date = get_release_date(pkg_name, latest_ver, versions)
    return latest_ver, release_date

def get_latest_from_rss():
    """
        Return list of (name, version, released_on) of the latest
        versions of packages added to the index.

        NB: See https://getsatisfaction.com/sonatype/topics/rss_feeds_for_artifact_group_updates -
        looks like this feature is not well maintained or publicly advertised.
    """
    # 404
    # rss = fetch_page("http://search.maven.org/remotecontent?filepath=rss.xml")
    rss = fetch_page("http://repo1.maven.org/maven2/rss.xml")
    dom = parseString(rss)
    result = []
    for item in dom.getElementsByTagName("item"):
        try:
            title = item.getElementsByTagName("title")[0]
            pub_date = item.getElementsByTagName("pubDate")[0]
            (gid, aid, version) = title.firstChild.wholeText.split(":")
            released_on = datetime.strptime(pub_date.firstChild.wholeText, '%a, %d %b %Y %H:%M:%S -0500')
            result.append(("%s:%s" % (gid, aid), version, released_on))
        except:
            continue
    return result

def get_latest(package, last_checked=None):
    """
        Get the latest version of a package

        @return - version, released_on
    """
    [groupid, artifactid] = _groupid_artifactid(package)

    data = fetch_page('http://search.maven.org/solrsearch/select?q=g:"%s"+AND+a:"%s"&wt=json&core=gav' % (groupid, artifactid), last_modified=last_checked)
    if data is None:  # NB: empty string is not None but will fail the check
        return 304, 304

    data = json.loads(data)

    latest_ver = None
    latest_timestamp = 0
    for release in data['response']['docs']:
        if release['timestamp'] > latest_timestamp:
            latest_timestamp = release['timestamp']
            latest_ver = release['v']

    released_on = datetime.fromtimestamp(latest_timestamp/1000)
    return latest_ver, released_on

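# Maven Central reports timestamps in milliseconds since the epoch, hence
# the division by 1000 above; a sketch with a made-up value:
def _example_maven_timestamp():
    from datetime import datetime
    ts = 1374840000000  # as found in release['timestamp']
    print datetime.fromtimestamp(ts / 1000)  # 2013-07-26, in local time
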
def get_streams(self, media):
    logging.info("Extracting streams for: {}".format(media))
    if not media.url:
        logging.warn("{} has no url".format(media))
        return None

    pq = PyQuery(fetch_page(media.url))
    href_rgx = re.compile(r'\((?P<vid>.*?),(?:.*?),(?P<host>.*?)\);')
    stream_list = []
    for href in (a.attrib.get('href') for a in pq('#links_list .link a:not([class])')):
        # clean whitespace and quotes from the string
        href = href.replace(" ", "").replace("'", "").replace('"', "")
        try:
            match = href_rgx.search(href)
            video_host = match.group('host')
            video_id = match.group('vid')
            video_url = None
            stream = Stream(video_id, video_host, video_url)
            logging.info("Retrieved: {}".format(stream))
            stream_list.append(stream)
        except AttributeError:
            # if an exception occurred then href_rgx couldn't match anything in href
            logging.error("Couldn't get video info from: {}".format(href))
    return stream_list

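# A sketch of what href_rgx extracts, run against a made-up onclick-style
# string (the real values come from the scraped page):
def _example_stream_regex():
    href = "(abc123,1,somehost);"
    match = re.search(r'\((?P<vid>.*?),(?:.*?),(?P<host>.*?)\);', href)
    logging.info("{} {}".format(match.group('vid'), match.group('host')))
    # logs: abc123 somehost
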
def search_film(self, search_query):
    logging.info('Searching film for query: {}'.format(search_query))
    search_url = "/search/movies/" + quote_plus(search_query)
    search_page = fetch_page(self.site_url, extra_path=search_url)
    pq = PyQuery(search_page)
    dom_search_list = pq(".list_item")
    film_list = []
    for dom_item in dom_search_list:
        title = pq(dom_item).find('img[border="0"]').show().attr('alt')
        href = pq(dom_item).find('a.panel').attr('href')
        url = urljoin(self.site_url, href)
        category = Media.FILM
        film = Media(title=title, url=url, category=category)
        # set description
        desc = pq(dom_item).find('.plot').text()
        film.description = re.sub(r'\s', ' ', desc)  # remove newlines from description
        # set rating
        film.rating = pq(dom_item).find('span.rank_value').text()
        # set thumbnail url
        href_thumbnail = pq(dom_item).find('img[border="0"]').show().attr('src')
        film.thumbnail = urljoin(self.site_url, href_thumbnail)
        film_list.append(film)
    return film_list

def get_release_date(package, version, data=None):
    """
        Return the released_on date for this version.
    """
    xml = fetch_page("http://pear.php.net/rest/r/%s/%s.xml" % (package.lower(), version), False).strip()
    dom = parseString(xml)
    released_on = dom.getElementsByTagName("da")[0]
    return datetime.strptime(released_on.firstChild.wholeText, '%Y-%m-%d %H:%M:%S')

def get_pkg_descr(package, version=None, last_modified=None):
    """
        Get package description from the registry
    """
    json_data = fetch_page('http://registry.npmjs.org/%s' % package, last_modified=last_modified)
    if json_data is None:  # NB: empty string is not None but will fail the check
        return None
    else:
        return json.loads(json_data)

def get_commits_around_date(repo, released_on, delta=1):
    """
        Return commits which may contain a version change,
        for further analysis.

        @repo - string - :user/:repo combination
        @released_on - timestamp
        @return - { 'sha' : ['patch1', 'patch2'] }
    """
    since = released_on - timedelta(days=delta)
    since = since.strftime('%Y-%m-%dT%H:%M:%S')
    until = released_on + timedelta(days=delta)
    until = until.strftime('%Y-%m-%dT%H:%M:%S')

    api_url = 'https://api.github.com/repos/%s/commits?since=%s&until=%s'
    if settings_imported:
        api_url += '&client_id=%s&client_secret=%s' % (GITHUB_APP_ID, GITHUB_API_SECRET)

    json_data = fetch_page(api_url % (repo, since, until))
    if json_data is None:
        return None

    data = json.loads(json_data)

    result = {}
    for c in data:
        for p in c['parents']:
            result[p['sha']] = []

            api_url = p['url']
            if settings_imported:
                api_url += '?client_id=%s&client_secret=%s' % (GITHUB_APP_ID, GITHUB_API_SECRET)

            commit_json = fetch_page(api_url)
            if commit_json:
                commit_data = json.loads(commit_json)
                for f in commit_data['files']:
                    if f.has_key('patch'):
                        result[p['sha']].append(f['patch'])
    return result

def get_latest(package, last_checked=None, data=None):
    """
        Get the latest version of a package

        @return - version, released_on
    """
    # fetch JSON using only the package name. Will fetch the latest version
    if not data:
        data = fetch_page("https://packagist.org/packages/%s.json" % package, last_modified=last_checked)
        if data is None:  # NB: empty string is not None but will fail the check
            return 304, 304
        data = json.loads(data)

    if not data:
        logger.error("Can't find latest version - %s" % package)
        return "", None

    # sort versions by date and take the most recent one
    versions = {}
    for ver in data["package"]["versions"].keys():
        # skip development versions
        if ver.find("dev-") > -1:
            continue
        if ver.find("-dev") > -1:
            continue
        if ver.find("master-") > -1:
            continue

        # skip Alpha and Beta versions
        # if ver.lower().find('beta') > -1:
        #     continue
        # if ver.lower().find('alpha') > -1:
        #     continue

        released_on = data["package"]["versions"][ver]["time"][:19]  # remove the timezone part
        released_on = datetime.strptime(released_on, "%Y-%m-%dT%H:%M:%S")
        versions[released_on] = ver

    # sort dates, most recent first
    dates = versions.keys()
    dates.sort(reverse=True)

    release_date = dates[0]
    latest_ver = versions[release_date]
    return latest_ver, release_date

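# A sketch of the date-keyed selection above with toy data (not real
# Packagist output); the most recent release date wins:
def _example_latest_by_date():
    from datetime import datetime
    versions = {
        datetime(2013, 1, 1): "1.0.0",
        datetime(2013, 6, 1): "1.1.0",
    }
    dates = versions.keys()
    dates.sort(reverse=True)
    print versions[dates[0]]  # 1.1.0
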
def get_latest(package, last_checked=None):
    """
        Get the latest version of a package
    """
    version = fetch_page("http://pear.php.net/rest/r/%s/latest.txt" % package.lower(), last_modified=last_checked)
    if version is not None:  # NB: empty string is not None but will fail the check
        version = version.strip()
    else:
        return 304, 304

    released_on = get_release_date(package, version)
    return version, released_on

def get_download_url(package, version, data=None):
    """
        Return URL to download the package.
    """
    if not data:
        data = fetch_page("https://packagist.org/packages/%s.json" % package)
        data = json.loads(data)

    try:
        return data["package"]["versions"][version]["dist"]["url"]
    except:
        return None

def get_release_date(package, version, data=None):
    """
        Return the released_on date for this version.
    """
    pkg_name = _other_name(package)

    if not data:
        data = fetch_page("https://pypi.python.org/pypi/%s/%s/json" % (pkg_name, version))
        data = json.loads(data)

    if data.has_key('urls') and (len(data['urls']) >= 1) and data['urls'][0].has_key('upload_time'):
        return datetime.strptime(str(data['urls'][0]['upload_time']), '%Y-%m-%dT%H:%M:%S')
    else:
        return None

def get_release_date(package, version, data=None):
    """
        Return the released_on date for this version.
    """
    if not data:
        json_data = fetch_page('https://rubygems.org/api/v1/versions/%s.json' % package)
        # json_data = json_data.decode('UTF-8')
        data = json.loads(json_data)

    for ver in data:
        if ver['number'] == version:
            return datetime.strptime(ver['built_at'], '%Y-%m-%dT%H:%M:%SZ')

    return None

def get_release_date(package, version, data=None):
    """
        Return the released_on date for this version.
    """
    if not data:
        data = fetch_page("https://packagist.org/packages/%s.json" % package)
        data = json.loads(data)

    try:
        released_on = data["package"]["versions"][version]["time"]
        released_on = released_on[:19]  # remove the timezone part
        return datetime.strptime(released_on, "%Y-%m-%dT%H:%M:%S")
    except:
        raise

    return None

def get_release_date(package, version, data=None):
    """
        Return the released_on date for this version.
    """
    if not data:
        data = fetch_page("https://packagist.org/packages/%s.json" % package)
        data = json.loads(data)

    try:
        released_on = data['package']['versions'][version]['time']
        released_on = released_on[:19]  # remove the timezone part
        return datetime.strptime(released_on, '%Y-%m-%dT%H:%M:%S')
    except:
        raise

    return None

def get_release_date(package, version, data=None):
    """
        Return the released_on date for this version.
    """
    if not data:
        json_data = fetch_page('https://rubygems.org/api/v1/versions/%s.json' % package)
        # json_data = json_data.decode('UTF-8')
        data = json.loads(json_data)

    for ver in data:
        if ver['number'] == version:
            return datetime.strptime(ver['built_at'], '%Y-%m-%dT%H:%M:%SZ')

    return None

def get_release_date(package, version, data=None):
    """
        Return the released_on date for this version.
    """
    [groupid, artifactid] = _groupid_artifactid(package)

    if not data:
        data = fetch_page('http://search.maven.org/solrsearch/select?q=g:"%s"+AND+a:"%s"+AND+v:"%s"&wt=json' % (groupid, artifactid, version))
        data = json.loads(data)

    released_on = data['response']['docs'][0]['timestamp']
    released_on = datetime.fromtimestamp(released_on/1000)
    return released_on

def get_url(package, version=None):
    """
        Return homepage, repo, bugtracker URLs for a package
    """
    urls = {
        'homepage'   : '',
        'repository' : '',
        'bugtracker' : '',
    }

    [groupid, artifactid] = _groupid_artifactid(package)

    if not version:
        version, released_on = get_latest(package)

    pom_xml = fetch_page("http://search.maven.org/remotecontent?filepath=%s/%s/%s/%s-%s.pom" % (groupid.replace('.', '/'), artifactid, version, artifactid, version))
    dom = parseString(pom_xml)

    # search for homepage
    for url in dom.getElementsByTagName('url'):
        if not urls['homepage']:
            urls['homepage'] = url.firstChild.wholeText

        # prefer GitHub URLs
        if url.firstChild.wholeText.find('github.com') > -1:
            urls['homepage'] = url.firstChild.wholeText
            break

    # search for code repository
    for conn in dom.getElementsByTagName('connection'):
        if not urls['repository']:
            urls['repository'] = conn.firstChild.wholeText
            # format is scm:type:URL or scm:type:scm:type:URL so remove the prefix
            while urls['repository'].find('scm:') > -1:
                urls['repository'] = ":".join(urls['repository'].split(':')[2:])
            break

    # search for bugtracker URL
    for tracker in dom.getElementsByTagName('issueManagement'):
        for url in tracker.getElementsByTagName('url'):
            if not urls['bugtracker']:
                urls['bugtracker'] = url.firstChild.wholeText
                break

    return urls

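# A sketch of the scm: prefix stripping above; Maven <connection> values
# look like scm:type:URL or scm:type:scm:type:URL (the value below is made up):
def _example_strip_scm_prefix():
    repo = 'scm:git:scm:git:git://github.com/user/repo.git'
    while repo.find('scm:') > -1:
        repo = ":".join(repo.split(':')[2:])
    print repo  # git://github.com/user/repo.git
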
def get_release_date_from_commit(package, commit):
    """
        Return the release date for this commit

        @package - :user/:repo combination
        @commit - commit hash
    """
    api_url = 'https://api.github.com/repos/%s/commits/%s'
    if settings_imported:
        api_url += '?client_id=%s&client_secret=%s' % (GITHUB_APP_ID, GITHUB_API_SECRET)

    json_data = fetch_page(api_url % (package, commit))
    if json_data is None:
        return None

    data = json.loads(json_data)
    released_on = data['commit']['committer']['date']
    return datetime.strptime(released_on, '%Y-%m-%dT%H:%M:%SZ')

def get_url(package, version=None):
    """
        Return homepage, repo, bugtracker URLs for a package
    """
    urls = {
        'homepage'   : '',
        'repository' : '',
        'bugtracker' : '',
    }

    data = fetch_page("https://packagist.org/packages/%s.json" % package)
    data = json.loads(data)

    if not data:
        logger.error("Can't find URL for %s-%s" % (package, version))
        return urls

    if not version:
        version, released_on = get_latest(package, None, data)

    # home page
    try:
        urls['homepage'] = data['package']['versions'][version]['homepage']
        if not data['package']['repository'].endswith('.git'):
            urls['homepage'] = data['package']['repository']
    except:
        pass

    # repository
    try:
        if data['package']['repository'].endswith('.git'):
            urls['repository'] = data['package']['repository']
        else:
            urls['repository'] = data['package']['versions'][version]['source']['url']
    except:
        pass

    # NB: bugtracker will automatically be filled by helpers
    # b/c the format is not what we expect and
    # b/c every packagist package uses GitHub or BitBucket
    return urls

def get_files(url):
    """
        Return a list of all files in the tree.
        Used to search for changelog.
    """
    user_repo = _get_user_repo(url)
    if not user_repo:
        raise Exception("GitHub - get_files - can't find repository for %s" % url)

    api_url = 'https://api.github.com/repos/%s/git/trees/master?recursive=1'
    if settings_imported:
        api_url += '&client_id=%s&client_secret=%s' % (GITHUB_APP_ID, GITHUB_API_SECRET)

    data = fetch_page(api_url % user_repo)
    data = json.loads(data)
    return [f['path'] for f in data['tree'] if f['type'] == 'blob']

def scrape_sfsc(start=SFSC_START_CASE_NUM, end=None, skip=[]):
    """Main loop for scraping a batch of pages"""
    _continue = True
    _invalid_streak = 0
    case_num = start
    records = []
    while _continue:
        _case_num_str = SFSC_CASE_NUM_FORMAT(case_num)
        # to-do: Check for "invalid" page result
        print >> sys.stderr, _case_num_str
        if case_num in skip:
            print >> sys.stderr, 'Skipping...'
        else:
            page = scrape_page(_case_num_str)
            if page.status() == 'multiple_results':
                # yield a page for each result
                for f in page.soup.findAll('font'):
                    for a in f.findAll('a'):
                        _url = urlparse.urljoin(SFSC_POST_URL, a.get('href'))
                        _html = utils.fetch_page(_url)
                        _page = SFSCPage(_html, _case_num_str)
                        yield _page
            else:
                yield page

            # Did we hit too many invalid pages in a row? If so, skip ahead
            # to the next block and try again. Keep increasing by block size
            # until a valid page is found.
            if page.status() == 'invalid':
                _invalid_streak += 1
                print >> sys.stderr, 'Invalid page for case num %d, number %d in a row' % (case_num, _invalid_streak)
                if _invalid_streak >= INVALID_PAGES_LIMIT:
                    # subtract 1 since it will be added back at the end of the loop
                    case_num = (case_num/SFSC_BLOCK_SIZE+1)*SFSC_BLOCK_SIZE - 1
            else:
                _invalid_streak = 0

        # check to see if we should break: did we pass the end value?
        case_num += 1
        if end is not None:
            if case_num > end:
                _continue = False

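# A sketch of the block-skip arithmetic above; SFSC_BLOCK_SIZE is assumed
# to be 1000 here only for illustration (the real constant lives elsewhere):
def _example_block_skip():
    SFSC_BLOCK_SIZE = 1000
    case_num = 12345
    # jump to the last number before the next block boundary; the main
    # loop's case_num += 1 then lands exactly on 13000
    case_num = (case_num / SFSC_BLOCK_SIZE + 1) * SFSC_BLOCK_SIZE - 1
    print case_num  # 12999
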
def get_latest(package, last_checked=None):
    """
        Get the latest version of a package
    """
    json_data = fetch_page('http://rubygems.org/api/v1/versions/%s.json' % package, last_modified=last_checked)
    if json_data is None:  # NB: empty string is not None but will fail the check
        return 304, 304

    data = json.loads(json_data)

    for version in data:
        if version['prerelease']:
            continue
        else:
            return version['number'], get_release_date(package, version['number'], data)

    # in case there are only pre-release versions
    return None, None

def get_latest_from_rss():
    """
        Return list of (name, version, released_on) of the latest
        versions of packages added to the index.
    """
    rss = fetch_page("https://pypi.python.org/pypi?:action=rss", False)
    dom = parseString(rss)
    result = []
    for item in dom.getElementsByTagName("item"):
        try:
            title = item.getElementsByTagName("title")[0]
            pub_date = item.getElementsByTagName("pubDate")[0]
            (name, version) = title.firstChild.wholeText.split(" ")
            released_on = datetime.strptime(pub_date.firstChild.wholeText, '%d %b %Y %H:%M:%S GMT')
            result.append((name, version, released_on))
        except:
            continue
    return result

def get_page_count():
    page_count = 0
    try:
        soup = utils.fetch_page('http://www.yousuu.com/category/all')
        page_indicator = soup.find(attrs={'class': 'books'}).ul.children
        for indicator_item in page_indicator:
            try:
                # the page number is the second argument of the onclick handler
                onclick = indicator_item.a['onclick']
                page_count = max(
                    page_count,
                    int(str(onclick).split(',')[1].replace(')', '').replace('\'', '')))
            except Exception as e:
                print(e)
                print(indicator_item)
    except Exception as e:
        print(e)
    return max(1, page_count)

def get_url(package, version=None):
    """
        Return homepage, repo, bugtracker URLs for a package
    """
    urls = {
        'homepage'   : '',
        'repository' : '',
        'bugtracker' : '',
    }

    # Twisted packages are many and follow a pattern but are not hosted on PyPI
    if package in _twisted_mappings.keys():
        return {
            'homepage'   : 'http://twistedmatrix.com/trac/wiki/%s' % package.replace('-', ''),
            'repository' : 'svn://svn.twistedmatrix.com/svn/Twisted/trunk',
            'bugtracker' : 'http://twistedmatrix.com/trac/ticket/%d',
        }

    pkg_name = _other_name(package)

    if not version:
        version, released_on = get_latest(pkg_name)

    release_data = fetch_page("https://pypi.python.org/pypi/%s/%s/json" % (pkg_name, version))
    release_data = json.loads(release_data)

    if not release_data:
        logger.error("Can't find URL for %s-%s" % (package, version))
        return urls

    # home page
    if release_data['info'].has_key('home_page') and release_data['info']['home_page']:
        urls['homepage'] = release_data['info']['home_page']
    elif release_data['info'].has_key('package_url'):
        urls['homepage'] = release_data['info']['package_url']

    # bugtracker
    if release_data['info'].has_key('bugtrack_url') and release_data['info']['bugtrack_url']:
        urls['bugtracker'] = release_data['info']['bugtrack_url']

    return urls

def get_latest_from_rss():
    """
        Return list of (name, version, released_on) of the latest
        versions of packages added to the index.
    """
    rss = fetch_page("http://pear.php.net/feeds/latest.rss")
    dom = parseString(rss)
    result = []
    for item in dom.getElementsByTagName("item"):
        try:
            title = item.getElementsByTagName("title")[0]
            pub_date = item.getElementsByTagName("dc:date")[0]
            (name, version) = title.firstChild.wholeText.split(" ")
            # NB: PEAR provides a timezone offset but we consider all dates in UTC
            released_on = datetime.strptime(pub_date.firstChild.wholeText, '%Y-%m-%dT%H:%M:%S-05:00')
            result.append((name, version, released_on))
        except:
            continue
    return result

def get_latest_from_rss():
    """
        Return list of (name, version, released_on) of the latest
        versions of packages added to the index.
    """
    rss = fetch_page("http://pear2.php.net/?view=latest&format=rss")
    dom = parseString(rss)
    result = []
    for item in dom.getElementsByTagName("item"):
        try:
            title = item.getElementsByTagName("title")[0]
            pub_date = item.getElementsByTagName("dc:date")[0]
            (name, version) = title.firstChild.wholeText.split(" ")
            # NB: PEAR provides a timezone offset but we consider all dates in UTC
            released_on = datetime.strptime(pub_date.firstChild.wholeText, '%Y-%m-%dT%H:%M:%S+00:00')
            result.append((name, version, released_on))
        except:
            continue
    return result

def get_books_from(books, page):
    try:
        soup = utils.fetch_page(
            'http://www.yousuu.com/category/all?page={}'.format(page))
        for book_subject in soup.find_all(
                attrs={'class': 'bd booklist-subject'}):
            book = {}
            for div in book_subject.children:
                if div['class'] == 'pull-right hidden-xs btn-group initshelf'.split(' '):
                    book['id'] = div['data-bid']
                elif div['class'] == ['post']:
                    book['cover'] = div.a.img['src']
                elif div['class'] == ['title']:
                    book['name'] = div.a.string
                elif div['class'] == ['rating']:
                    # number of people who rated the book
                    book['rating_num'] = int(
                        re.findall('\d+',
                                   re.findall(r'(\d+人评价)', str(div))[0])[0])
                elif div['class'] == ['abstract']:
                    book['rating'] = float(div.span.string)
                    # '*' and '+' are greedy and match the longest string;
                    # adding '?' makes the match non-greedy (shortest string)
                    author_name_line = re.findall(r'作者:\s[\s\S]*?<br/>', str(div))[0]
                    # strip the label and the trailing tag with re.sub
                    book['author_name'] = re.sub(r'作者:\s|<br/>', '', author_name_line)
                    word_num_line = re.findall(r'字数:\s[\s\S]*?<br/>', str(div))[0]
                    book['word_num'] = re.sub(r'字数:\s|<br/>', '', word_num_line).strip()
                    updated_time_line = re.findall(r'最后更新:\s[\s\S]*?<br/>', str(div))[0]
                    book['updated_time'] = re.sub(r'最后更新:\s|<br/>', '', updated_time_line).strip()
            books.append(book)
    except Exception as e:
        print(e)

def get_author_from_html(package, version):
    """
        Try to parse the HTML page and get the author for an older version.
    """
    package = _other_name(package)
    package = package.replace('::', '-')

    html = fetch_page('https://metacpan.org/release/%s' % package).split('\n')
    options = []
    for line in html:
        if line.find('<option value=') > -1:
            options.append(line.strip())

    # format is
    # <option value="ADAMK/PPI-HTML-0.01/">0.01 (2005-01-15)</option>
    for opt in options:
        author = opt.split('"')[1].split('/')[0]
        ver = opt.split('>')[1].split(' ')[0]
        if ver == version:
            return author

    return None

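# A sketch of the <option> parsing above, using the example line from the
# comment (pure string splitting, no network access):
def _example_parse_option():
    opt = '<option value="ADAMK/PPI-HTML-0.01/">0.01 (2005-01-15)</option>'
    print opt.split('"')[1].split('/')[0]  # ADAMK
    print opt.split('>')[1].split(' ')[0]  # 0.01
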
    rx = "#\d+"

    # repeat so we can get a clean expression
    for e in exprs:
        rx = rx + "|" + e

    # print rx
    return re.compile(rx)


if __name__ == "__main__":
    from utils import fetch_page

    # NB: order is the same as in extract_title_and_dates_from_html()
    print extract_title_and_dates_from_html(fetch_page('https://github.com/cowboyd/therubyracer/pull/240'), BUG_TYPE_GITHUB)
    print extract_title_and_dates_from_html(fetch_page('https://github.com/boto/boto/issues/1757'), BUG_TYPE_GITHUB)
    print extract_title_and_dates_from_html(fetch_page('https://github.com/marcandre/backports/pull/63'), BUG_TYPE_GITHUB)
    print extract_title_and_dates_from_html(fetch_page('https://github.com/boto/boto/pull/407'), BUG_TYPE_GITHUB)
    print extract_title_and_dates_from_html(fetch_page('https://github.com/hmarr/django-ses/pull/40'), BUG_TYPE_GITHUB)
    print extract_title_and_dates_from_html(fetch_page('https://github.com/schacon/hg-git/issues/31'), BUG_TYPE_GITHUB)
    # print extract_title_and_dates_from_html(fetch_page('https://github.com/hmarr/django-ses/pull/40'), BUG_TYPE_GITHUB)
    # print extract_title_and_dates_from_html(fetch_page('https://github.com/schacon/hg-git/issues/31'), BUG_TYPE_GITHUB)
    # print extract_title_and_dates_from_html(fetch_page('https://bugzilla.redhat.com/show_bug.cgi?id=800754'), BUG_TYPE_BUGZILLA)
    # print extract_title_and_dates_from_html(fetch_page('https://bitbucket.org/birkenfeld/pygments-main/issue/763'), BUG_TYPE_BITBUCKET)
    # print extract_title_and_dates_from_html(fetch_page('https://bitbucket.org/birkenfeld/pygments-main/issue/861'), BUG_TYPE_BITBUCKET)
    # print extract_title_and_dates_from_html(fetch_page('https://bugs.launchpad.net/pytz/+bug/207604'), BUG_TYPE_LAUNCHPAD)
    # print extract_title_and_dates_from_html(fetch_page('http://code.google.com/p/pysqlite/issues/detail?id=11'), BUG_TYPE_GOOGLE)
    # print extract_title_and_dates_from_html(fetch_page('http://code.google.com/p/pysqlite/issues/detail?id=23'), BUG_TYPE_GOOGLE)
    # print extract_title_and_dates_from_html(fetch_page('http://code.google.com/p/geopy/issues/detail?id=2'), BUG_TYPE_GOOGLE)

    print extract_title_and_dates_from_html(fetch_page('http://foolscap.lothar.com/trac/ticket/204'), BUG_TYPE_TRAC)
    print extract_title_and_dates_from_html(fetch_page('https://code.djangoproject.com/ticket/18436'), BUG_TYPE_TRAC)

    # print extract_title_and_dates_from_html(fetch_page('http://bugs.repoze.org/issue4'), BUG_TYPE_ROUNDUP)
    # print extract_title_and_dates_from_html(fetch_page('http://bugs.repoze.org/issue85'), BUG_TYPE_ROUNDUP)
    # print extract_title_and_dates_from_html(fetch_page('http://bugs.repoze.org/issue43'), BUG_TYPE_ROUNDUP)
    # print extract_title_and_dates_from_html(fetch_page('http://sourceforge.net/tracker/?func=detail&aid=3552403&group_id=38414&atid=422030'), BUG_TYPE_SOURCEFORGE)
    # print extract_title_and_dates_from_html(fetch_page('http://sourceforge.net/p/pydev/bugs/1558/'), BUG_TYPE_SOURCEFORGE)
    # print extract_title_and_dates_from_html(fetch_page('http://psycopg.lighthouseapp.com/projects/62710/tickets/83'), BUG_TYPE_LIGHTHOUSE)
    # print extract_title_and_dates_from_html(fetch_page('http://psycopg.lighthouseapp.com/projects/62710/tickets/78'), BUG_TYPE_LIGHTHOUSE)
    # print extract_title_and_dates_from_html(fetch_page('http://psycopg.lighthouseapp.com/projects/62710/tickets/146'), BUG_TYPE_LIGHTHOUSE)
    # print extract_title_and_dates_from_html(fetch_page('http://psycopg.lighthouseapp.com/projects/62710/tickets/112'), BUG_TYPE_LIGHTHOUSE)