Example #1
def get_latest_from_rss():
    """
        Return list of (name, version, released_on) of the
        latest versions of packages added to the index.
    """
    # NB: limit=50 is hardcoded upstream
    rss = fetch_page("http://registry.npmjs.org/-/rss?descending=true&limit=50", decode=False)
    soup = BeautifulSoup(rss)
    result = []

    for item in soup.findAll("item"):
        try:
            pub_date = item.pubdate.text

            (name, version) = item.title.text.split("@")

            # NB: <pubDate>2012-06-27T20:29:22.550Z</pubDate>
            # the format string below has no directive for the fractional seconds, so strip them out
            released_on = datetime.strptime(pub_date.split('.')[0], '%Y-%m-%dT%H:%M:%S')

            result.append((name, version, released_on))
        except:
            continue

    return result
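
All of the examples on this page call a shared fetch_page() helper defined elsewhere in the project. A minimal sketch of what such a helper might look like, inferred from the call sites (the decode flag, the last_modified/304 behaviour and the extra_path argument are assumptions based on how the callers use them):

import requests

def fetch_page(url, decode=True, last_modified=None, extra_path=None):
    """
        Hypothetical stand-in for the project's real helper.
        Returns the page body, or None on HTTP 304 Not Modified.
    """
    if extra_path:
        url = url.rstrip('/') + extra_path
    headers = {}
    if last_modified:
        headers['If-Modified-Since'] = last_modified
    response = requests.get(url, headers=headers)
    if response.status_code == 304:
        return None
    return response.text if decode else response.content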
Example #2
def get_tags(url, user_repo=None, extended=False, last_modified=None):
    """
        Return all GitHub tags for this package.
        @url - string - either git checkout url or http url for the GitHub page
    """

    if user_repo is None:
        user_repo = _get_user_repo(url)

    if not user_repo:
        raise Exception("GitHub - get_tags - can't find repository for %s" % url)

    api_url = 'https://api.github.com/repos/%s/tags'
    if settings_imported:
        api_url += '?client_id=%s&client_secret=%s' % (GITHUB_APP_ID, GITHUB_API_SECRET)
    json_data = fetch_page(api_url % user_repo, last_modified=last_modified)

    if json_data is None:
        return None

    data = json.loads(json_data)

    if extended:
        return data
    else:
        try:
            result = {}
            for tag in data:
                result[tag['name']] = tag['commit']['sha']
            return result
        except:
            # in case GitHub API limit was reached then above indexing will fail
            return {}
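
A hypothetical call, showing the default (non-extended) return shape built by the loop above; the repository URL is illustrative only:

tags = get_tags('https://github.com/user/repo')
# -> {'v1.0.0': '<commit sha>', 'v1.1.0': '<commit sha>', ...}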
Example #3
    def search_tvshow(self, search_query):
        logging.info('Searching tvshow for query: {}'.format(search_query))

        search_url = "/search/tv-shows/" + quote_plus(search_query)
        search_page = fetch_page(self.site_url, extra_path=search_url)
        pq = PyQuery(search_page)

        dom_search_list = pq(u".list_item")
        tvshow_list = []

        for dom_item in dom_search_list:
            title = pq(dom_item).find('img[border="0"]').show().attr('alt')
            href = pq(dom_item).find('a.panel').attr('href')
            url = urljoin(self.site_url, href)
            category = Media.TVSHOW

            # Since it is a tvshow we need to fetch the children episodes
            tvshow = Media(title=title, url=url, category=category, has_children=True)

            # set description
            desc = pq(dom_item).find('.plot').text()
            tvshow.description = re.sub(r'\s', ' ', desc)  # remove newlines from description

            # set rating
            tvshow.rating = pq(dom_item).find('span.rank_value').text()

            # set thumbnail url
            href_thumbnail = pq(dom_item).find('img[border="0"]').show().attr('src')
            tvshow.thumbnail = urljoin(self.site_url, href_thumbnail)

            tvshow_list.append(tvshow)

        return tvshow_list
Example #4
def get_url(package, version=None):
    """
        Return homepage, repo, bugtracker URLs for a package
    """
    urls = {
            'homepage'   : '',
            'repository' : '',
            'bugtracker' : '',
        }

    json_data = fetch_page('https://rubygems.org/api/v1/gems/%s.json' % package)
#    json_data = json_data.decode('UTF-8')
    data = json.loads(json_data)

    if data.has_key('homepage_uri'):
        urls['homepage'] = data['homepage_uri']
    else:
        urls['homepage'] = data['project_uri']

    if data.has_key('bug_tracker_uri') and data['bug_tracker_uri']:
        urls['bugtracker'] = data['bug_tracker_uri']

    if data.has_key('source_code_uri') and data['source_code_uri']:
        urls['repository'] = data['source_code_uri']

    return urls
Example #5
def get_latest_from_rss():
    """
        @return - list of (name, version, released_on)
    """
    data = fetch_page("https://rubygems.org/api/v1/activity/just_updated.json")
    latest = json.loads(data)
    result = []

    for gem in latest:
        # NB: not implemented,
        # see https://github.com/rubygems/rubygems.org/issues/536
        #        if gem['prerelease']:
        #            continue

        # don't add prerelease software
        (latest_ver, released_on) = get_latest(gem['name'])

        # TODO: this JSON gives more info (e.g. GitHub URLs); import it from
        # here and drop some of these messages

        if latest_ver == gem['version']:
            # RubyGems.org doesn't provide date of release
            result.append((gem['name'], gem['version'], released_on))

    return result
Example #6
def get_latest(package, last_checked=None):
    """
        Get the latest version of a package
    """
    package = _other_name(package)
    package = package.replace('::', '-')

    json_data = fetch_page('http://api.metacpan.org/v0/release/%s' % package, last_modified=last_checked)

    if json_data is None: # NB: empty string is not None but will fail the check
        return 304, 304

    data = json.loads(json_data)

    # don't process DEV/TRIAL packages
    # devel versions for some packages like Moose
    # mix up with regular versions
    if data['maturity'] != "released":
        return None, None

    version = None
    if data.has_key('version'):
        version = data['version']
    elif data.has_key('version_numified'):
        version = data['version_numified']

    released_on = get_release_date(package, version, data)

    return version, released_on
Example #7
def get_latest_from_rss():
    """
        @return - list of (name, version, released_on)
    """
    rss = fetch_page("https://metacpan.org/feed/recent?f=l") # filter=latest
    dom = parseString(rss)
    result = []
    for item in dom.getElementsByTagName("item"):
        try:
            # titles are in the form Dist-Zilla-Plugin-Test-PodSpelling-2.002005
            title_parts = item.getElementsByTagName("title")[0].firstChild.wholeText.split("-")

            version = title_parts[-1] # version is always the last component

            # skip DEV versions
            # TODO: better match; dev versions are of the form MAJOR.MINOR_DEV
            if version.find('_') > -1:
                continue

            name = '::'.join(title_parts[:-1])

            pub_date = item.getElementsByTagName("dc:date")[0]
            released_on = datetime.strptime(pub_date.firstChild.wholeText, '%Y-%m-%dT%H:%M:%SZ')

            result.append((name, version, released_on))
        except:
            continue

    return result
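
A worked example of the title parsing above, using the distribution name from the comment:

title = "Dist-Zilla-Plugin-Test-PodSpelling-2.002005"
parts = title.split("-")
version = parts[-1]             # '2.002005'
name = '::'.join(parts[:-1])    # 'Dist::Zilla::Plugin::Test::PodSpelling'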
Example #8
def get_download_url(package, version, data = None):
    """
        Return URL to download the package.
    """

    # Twisted packages are many and follow a pattern but not hosted on PyPI
    if package in _twisted_mappings.keys():
        sub_package = package.split('-')[1]
        main_ver = '.'.join(version.split('.')[:2])
        return 'http://twistedmatrix.com/Releases/%s/%s/Twisted%s-%s.tar.bz2' % (sub_package, main_ver, sub_package, version)

    pkg_name = _other_name(package)

    if not data:
        data = fetch_page("https://pypi.python.org/pypi/%s/%s/json" % (pkg_name, version))
        data = json.loads(data)

    if data.has_key('urls'):
        for file in data['urls']:
            # consider only source packages
            if file['packagetype'] == 'sdist':
                return file['url']

    if data.has_key('info') and data['info'].has_key('download_url'):
        url = data['info']['download_url']
        for ext in SUPPORTED_ARCHIVES:
            if url.endswith(ext):
                return url

    return None
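
A worked example of the Twisted special case above, assuming 'Twisted-Web' is one of the keys of _twisted_mappings:

package, version = 'Twisted-Web', '12.1.0'
sub_package = package.split('-')[1]            # 'Web'
main_ver = '.'.join(version.split('.')[:2])    # '12.1'
# -> http://twistedmatrix.com/Releases/Web/12.1/TwistedWeb-12.1.0.tar.bz2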
Example #9
def get_latest(package, last_checked=None):
    """
        Get the latest version of a package

        @return - version, released_on
    """

    pkg_name = _other_name(package)

    # fetch JSON only using package name. Will fetch latest version
    versions = fetch_page("https://pypi.python.org/pypi/%s/json" % pkg_name, last_modified=last_checked)

    if versions is None: # NB: empty string is not None but will fail the check
        return 304, 304

    versions = json.loads(versions)

    if (not versions):
        logger.error("Can't find latest version - %s" % package)
        return "", None

    latest_ver = versions['info']['version']
    release_date = get_release_date(pkg_name, latest_ver, versions)

    return latest_ver, release_date
Example #10
def get_latest_from_rss():
    """
        Return list of (name, version, released_on) of the
        latest versions of packages added to the index.

    NB: See https://getsatisfaction.com/sonatype/topics/rss_feeds_for_artifact_group_updates
        looks like this feature is not well maintained or publicly advertised.

    """
    # the old feed URL now returns 404:
    #    rss = fetch_page("http://search.maven.org/remotecontent?filepath=rss.xml")
    rss = fetch_page("http://repo1.maven.org/maven2/rss.xml")

    dom = parseString(rss)
    result = []
    for item in dom.getElementsByTagName("item"):
        try:
            title = item.getElementsByTagName("title")[0]
            pub_date = item.getElementsByTagName("pubDate")[0]

            (gid, aid, version) = title.firstChild.wholeText.split(":")
            released_on = datetime.strptime(pub_date.firstChild.wholeText, '%a, %d %b %Y %H:%M:%S -0500')
            result.append(("%s:%s" % (gid, aid), version, released_on))
        except:
            continue

    return result
Example #11
def get_latest(package, last_checked=None):
    """
        Get the latest version of a package

        @return - version, released_on
    """
    [groupid, artifactid] = _groupid_artifactid(package)

    data = fetch_page('http://search.maven.org/solrsearch/select?q=g:"%s"+AND+a:"%s"&wt=json&core=gav' % (groupid, artifactid), last_modified=last_checked)

    if data is None: # NB: empty string is not None but will fail the check
        return 304, 304

    data = json.loads(data)

    latest_ver = None
    latest_timestamp = 0

    for release in data['response']['docs']:
        if release['timestamp'] > latest_timestamp:
            latest_timestamp = release['timestamp']
            latest_ver = release['v']

    released_on = datetime.fromtimestamp(latest_timestamp/1000)
    return latest_ver, released_on
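
Maven's search API reports timestamps in milliseconds since the epoch, hence the division by 1000 before calling datetime.fromtimestamp(). A quick illustration with a made-up value:

from datetime import datetime
datetime.fromtimestamp(1371318000000 // 1000)
# -> a datetime in mid-June 2013 (exact value depends on the local timezone)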
Example #12
    def get_streams(self, media):
        logging.info("Extracting streams for: {}".format(media))

        if not media.url:
            logging.warn("{} has no url".format(media))
            return None

        pq = PyQuery(fetch_page(media.url))

        stream_list = []
        for href in (a.attrib.get('href') for a in pq('#links_list .link a:not([class])')):
            # clean whitespace and quotes from string
            href = href.replace(" ", "").replace("'", "").replace('"', "")
            try:
                video_host = re.search(r'\((?P<vid>.*?),(?:.*?),(?P<host>.*?)\);', href).group('host')
                video_id = re.search(r'\((?P<vid>.*?),(?:.*?),(?P<host>.*?)\);', href).group('vid')
                video_url = None

                stream = Stream(video_id, video_host, video_url)
                logging.info("Retrieved: {}".format(stream))

                stream_list.append(stream)
            except AttributeError:
                # the regex didn't match anything in this href
                logging.error("Couldn't get video info from: {}".format(href))

        return stream_list
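
A worked example of the href regex above; the cleaned-up href format is assumed for illustration:

import re
href = "javascript:show(12345,0,somehost.net);"  # hypothetical, spaces/quotes already stripped
m = re.search(r'\((?P<vid>.*?),(?:.*?),(?P<host>.*?)\);', href)
m.group('vid'), m.group('host')  # -> ('12345', 'somehost.net')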
Example #13
    def search_film(self, search_query):
        logging.info('Searching film for query: {}'.format(search_query))

        search_url = "/search/movies/" + quote_plus(search_query)
        search_page = fetch_page(self.site_url, extra_path=search_url)
        pq = PyQuery(search_page)

        dom_search_list = pq(".list_item")
        film_list = []
        for dom_item in dom_search_list:
            title = pq(dom_item).find('img[border="0"]').show().attr('alt')
            href = pq(dom_item).find('a.panel').attr('href')
            url = urljoin(self.site_url, href)
            category = Media.FILM

            film = Media(title=title, url=url, category=category)

            # set description
            desc = pq(dom_item).find('.plot').text()
            film.description = re.sub(r'\s', ' ', desc)  # remove newlines from description
            film.rating = pq(dom_item).find('span.rank_value').text()

            # set thumbnail url
            href_thumbnail = pq(dom_item).find('img[border="0"]').show().attr('src')
            film.thumbnail = urljoin(self.site_url, href_thumbnail)

            film_list.append(film)

        return film_list
Example #14
def get_release_date(package, version, data = None):
    """
        Return the released_on date for this version.
    """
    xml = fetch_page("http://pear.php.net/rest/r/%s/%s.xml" % (package.lower(), version), False).strip()
    dom = parseString(xml)
    released_on = dom.getElementsByTagName("da")[0]
    return datetime.strptime(released_on.firstChild.wholeText, '%Y-%m-%d %H:%M:%S')
Example #15
def get_pkg_descr(package, version=None, last_modified=None):
    """
        Get package description from registry
    """
    json_data = fetch_page('http://registry.npmjs.org/%s' % package, last_modified=last_modified)

    if json_data is None: # NB: empty string is not None but will fail the check
        return None
    else:
        return json.loads(json_data)
Example #16
def get_commits_around_date(repo, released_on, delta=1):
    """
        Return commits which are possible to contain a version change
        for further analysis.

        @repo - string - :user/:repo combination
        @released_on - timestamp

        @return - { 'sha' : ['patch1', 'patch2'] }
    """

    since = released_on - timedelta(days=delta)
    since = since.strftime('%Y-%m-%dT%H:%M:%S')

    until = released_on + timedelta(days=delta)
    until = until.strftime('%Y-%m-%dT%H:%M:%S')

    api_url = 'https://api.github.com/repos/%s/commits?since=%s&until=%s'
    if settings_imported:
        api_url += '&client_id=%s&client_secret=%s' % (GITHUB_APP_ID, GITHUB_API_SECRET)

    json_data = fetch_page(api_url % (repo, since, until))

    if json_data is None:
        return None

    data = json.loads(json_data)

    result = {}
    for c in data:
        for p in c['parents']:
            result[p['sha']] = []
            api_url = p['url']
            if settings_imported:
                api_url += '?client_id=%s&client_secret=%s' % (GITHUB_APP_ID, GITHUB_API_SECRET)
            commit_json = fetch_page(api_url)
            if commit_json:
                commit_data = json.loads(commit_json)
                for f in commit_data['files']:
                    if f.has_key('patch'):
                        result[p['sha']].append(f['patch'])
    return result
Example #17
def get_latest(package, last_checked=None, data=None):
    """
        Get the latest version of a package

        @return - version, released_on
    """

    # fetch JSON only using package name. Will fetch latest version
    if not data:
        data = fetch_page("https://packagist.org/packages/%s.json" % package, last_modified=last_checked)

        if data is None:  # NB: empty string is not None but will fail the check
            return 304, 304

        data = json.loads(data)

    if not data:
        logger.error("Can't find latest version - %s" % package)
        return "", None

    # sort versions by date and take the most recent one

    versions = {}
    for ver in data["package"]["versions"].keys():
        # skip development versions
        if ver.find("dev-") > -1:
            continue

        if ver.find("-dev") > -1:
            continue

        if ver.find("master-") > -1:
            continue

        # skip Alpha and Beta versions
        #        if ver.lower().find('beta') > -1:
        #            continue

        #        if ver.lower().find('alpha') > -1:
        #            continue

        released_on = data["package"]["versions"][ver]["time"][:19]  # remove timezone part
        released_on = datetime.strptime(released_on, "%Y-%m-%dT%H:%M:%S")

        versions[released_on] = ver

    # sort
    dates = versions.keys()
    dates.sort(reverse=True)

    release_date = dates[0]
    latest_ver = versions[release_date]

    return latest_ver, release_date
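
Note that dict.keys() only returns a sortable list on Python 2; a version-portable way to pick the newest entry from the versions dict built above would be:

release_date = max(versions)         # the most recent released_on datetime
latest_ver = versions[release_date]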
Example #18
def get_latest(package, last_checked=None):
    """
        Get the latest version of a package
    """
    version = fetch_page("http://pear.php.net/rest/r/%s/latest.txt" % package.lower(), last_modified=last_checked)
    if version is not None:  # NB: empty string is not None but will fail the check
        version = version.strip()
    else:
        return 304, 304

    released_on = get_release_date(package, version)
    return version, released_on
Example #19
def get_download_url(package, version, data=None):
    """
        Return URL to download the package.
    """

    if not data:
        data = fetch_page("https://packagist.org/packages/%s.json" % package)
        data = json.loads(data)

    try:
        return data["package"]["versions"][version]["dist"]["url"]
    except:
        return None
Example #20
def get_release_date(package, version, data = None):
    """
        Return the released_on date for this version.
    """

    pkg_name = _other_name(package)

    if not data:
        data = fetch_page("https://pypi.python.org/pypi/%s/%s/json" % (pkg_name, version))
        data = json.loads(data)

    if data.has_key('urls') and (len(data['urls']) >= 1) and data['urls'][0].has_key('upload_time'):
        return datetime.strptime(str(data['urls'][0]['upload_time']), '%Y-%m-%dT%H:%M:%S')
    else:
        return None
Example #21
def get_release_date(package, version, data = None):
    """
        Return the released_on date for this version.
    """

    if not data:
        json_data = fetch_page('https://rubygems.org/api/v1/versions/%s.json' % package)
#        json_data = json_data.decode('UTF-8')
        data = json.loads(json_data)

    for ver in data:
        if ver['number'] == version:
            return datetime.strptime(ver['built_at'], '%Y-%m-%dT%H:%M:%SZ')

    return None
Example #22
def get_release_date(package, version, data=None):
    """
        Return the released_on date for this version.
    """

    if not data:
        data = fetch_page("https://packagist.org/packages/%s.json" % package)
        data = json.loads(data)

    try:
        released_on = data["package"]["versions"][version]["time"]
        released_on = released_on[:19]  # remove timezone part
        return datetime.strptime(released_on, "%Y-%m-%dT%H:%M:%S")
    except:
        return None
Example #23
def get_release_date(package, version, data = None):
    """
        Return the released_on date for this version.
    """

    if not data:
        data = fetch_page("https://packagist.org/packages/%s.json" % package)
        data = json.loads(data)

    try:
        released_on = data['package']['versions'][version]['time']
        released_on = released_on[:19] # remove timezone part
        return datetime.strptime(released_on, '%Y-%m-%dT%H:%M:%S')
    except:
        return None
Example #24
def get_release_date(package, version, data=None):
    """
        Return the released_on date for this version.
    """

    if not data:
        json_data = fetch_page('https://rubygems.org/api/v1/versions/%s.json' %
                               package)
        #        json_data = json_data.decode('UTF-8')
        data = json.loads(json_data)

    for ver in data:
        if ver['number'] == version:
            return datetime.strptime(ver['built_at'], '%Y-%m-%dT%H:%M:%SZ')

    return None
Example #25
def get_release_date(package, version, data = None):
    """
        Return the released_on date for this version.
    """
    [groupid, artifactid] = _groupid_artifactid(package)

    if not data:
        data = fetch_page('http://search.maven.org/solrsearch/select?q=g:"%s"+AND+a:"%s"+AND+v:"%s"&wt=json' %
                        (groupid, artifactid, version)
                )
        data = json.loads(data)

    released_on = data['response']['docs'][0]['timestamp']
    released_on = datetime.fromtimestamp(released_on/1000)

    return released_on
Example #26
def get_url(package, version=None):
    """
        Return homepage, repo, bugtracker URLs for a package
    """
    urls = {
            'homepage'   : '',
            'repository' : '',
            'bugtracker' : '',
        }
    [groupid, artifactid] = _groupid_artifactid(package)

    if not version:
        version, released_on = get_latest(package)

    pom_xml = fetch_page("http://search.maven.org/remotecontent?filepath=%s/%s/%s/%s-%s.pom" % 
                            (groupid.replace('.', '/'), artifactid, version, artifactid, version)
                )
    dom = parseString(pom_xml)

    # search for homepage
    for url in dom.getElementsByTagName('url'):
        if not urls['homepage']:
            urls['homepage'] = url.firstChild.wholeText

        # prefer github URLs
        if url.firstChild.wholeText.find('github.com') > -1:
            urls['homepage'] = url.firstChild.wholeText
            break

    # search for code repository
    for conn in dom.getElementsByTagName('connection'):
        if not urls['repository']:
            urls['repository'] = conn.firstChild.wholeText
            # format is scm:type:URL or scm:type:scm:type:URL so remove prefix
            while urls['repository'].find('scm:') > -1:
                urls['repository'] = ":".join(urls['repository'].split(':')[2:])
            break

    # search for bugtracker URL
    for tracker in dom.getElementsByTagName('issueManagement'):
        for url in tracker.getElementsByTagName('url'):
            if not urls['bugtracker']:
                urls['bugtracker'] = url.firstChild.wholeText
                break

    return urls
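
A worked example of the scm: prefix stripping above, with a hypothetical <connection> value:

repo = "scm:git:git://github.com/user/repo.git"
while repo.find('scm:') > -1:
    repo = ":".join(repo.split(':')[2:])
# repo is now 'git://github.com/user/repo.git'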
Example #27
def get_release_date_from_commit(package, commit):
    """
        Return the release date for this commit

        @package - :user/:repo combination
        @commit - commit hash
    """
    api_url = 'https://api.github.com/repos/%s/commits/%s'
    if settings_imported:
        api_url += '?client_id=%s&client_secret=%s' % (GITHUB_APP_ID, GITHUB_API_SECRET)
    json_data = fetch_page(api_url % (package, commit))

    if json_data is None:
        return None

    data = json.loads(json_data)
    released_on = data['commit']['committer']['date']
    return datetime.strptime(released_on, '%Y-%m-%dT%H:%M:%SZ')
Example #28
def get_url(package, version=None):
    """
        Return homepage, repo, bugtracker URLs for a package
    """
    urls = {
            'homepage'   : '',
            'repository' : '',
            'bugtracker' : '',
        }

    data = fetch_page("https://packagist.org/packages/%s.json" % package)
    data = json.loads(data)

    if (not data):
        logger.error("Can't find URL for %s-%s" % (package, version))
        return urls

    if not version:
        version, released_on = get_latest(package, None, data)

    # home page
    try:
        urls['homepage'] = data['package']['versions'][version]['homepage']

        if not data['package']['repository'].endswith('.git'):
            urls['homepage'] = data['package']['repository']
    except:
        pass

    # repository
    try:
        if data['package']['repository'].endswith('.git'):
            urls['repository'] = data['package']['repository']
        else:
            urls['repository'] = data['package']['versions'][version]['source']['url']
    except:
        pass

    # NB: bugtracker will automatically be filled in by helpers,
    # because the format here is not what we expect and
    # because every packagist package uses GitHub or Bitbucket

    return urls
Example #29
def get_files(url):
    """
        Return a list of all files in the tree.
        Used to search for changelog.
    """

    user_repo = _get_user_repo(url)

    if not user_repo:
        raise Exception("GitHub - get_files - can't find repository for %s" % url)

    api_url = 'https://api.github.com/repos/%s/git/trees/master?recursive=1'
    if settings_imported:
        api_url += '&client_id=%s&client_secret=%s' % (GITHUB_APP_ID, GITHUB_API_SECRET)

    data = fetch_page(api_url % user_repo)
    data = json.loads(data)

    return [f['path'] for f in data['tree'] if f['type'] == 'blob']
Example #30
def scrape_sfsc(start=SFSC_START_CASE_NUM, end=None, skip=[]):
	"""Main loop for scraping a batch of pages"""
	_continue = True
	_invalid_streak = 0
	
	case_num = start
	records = []
	while _continue:
		_case_num_str = SFSC_CASE_NUM_FORMAT(case_num)
		# to-do: Check for "invalid" page result
		print >> sys.stderr, _case_num_str
		if case_num in skip:
			print >> sys.stderr, 'Skipping...'
		
		else:
			page = scrape_page(_case_num_str)
			if page.status()=='multiple_results':
				# append a page for each result
				for f in page.soup.findAll('font'):
					for a in f.findAll('a'):
						_url = urlparse.urljoin(SFSC_POST_URL, a.get('href'))
						_html = utils.fetch_page(_url)
						_page = SFSCPage(_html, _case_num_str)
						yield _page
			else:
				yield page

			# Did we hit too many invalid pages in a row?
			# if so, skip ahead to next block and try again. Keep increasing by block until valid page found
			if page.status()=='invalid':
				_invalid_streak+=1
				print >> sys.stderr, 'Invalid page for case num %d, number %d in a row' % (case_num, _invalid_streak)
				if _invalid_streak>=INVALID_PAGES_LIMIT:
					case_num = (case_num/SFSC_BLOCK_SIZE+1)*SFSC_BLOCK_SIZE-1
					# subtract 1 since it will be added back at end of loop
			else:
				_invalid_streak=0
			# check to see if we should break
			# Did we pass end value?
		case_num+=1
		if end is not None:
			if case_num>end:
				_continue=False
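
A worked example of the skip-ahead arithmetic above, assuming a hypothetical SFSC_BLOCK_SIZE of 1000 (// makes the Python 2 integer division explicit):

case_num, SFSC_BLOCK_SIZE = 12345, 1000
case_num = (case_num // SFSC_BLOCK_SIZE + 1) * SFSC_BLOCK_SIZE - 1  # 12999
# the `case_num += 1` at the bottom of the loop then lands on 13000,
# the first case number of the next block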
Example #31
def get_latest(package, last_checked=None):
    """
        Get the latest version of a package
    """
    json_data = fetch_page('http://rubygems.org/api/v1/versions/%s.json' % package, last_modified=last_checked)

    if json_data is None: # NB: empty string is not None but will fail the check
        return 304, 304

    data = json.loads(json_data)

    for version in data:
        if version['prerelease']:
            continue
        else:
            return version['number'], get_release_date(package, version['number'], data)

    # in case there are only pre-release versions
    return None, None
Example #32
def get_latest_from_rss():
    """
        Return list of (name, version, released_on) of the
        latest versions of packages added to the index.
    """
    rss = fetch_page("https://pypi.python.org/pypi?:action=rss", False)
    dom = parseString(rss)
    result = []
    for item in dom.getElementsByTagName("item"):
        try:
            title = item.getElementsByTagName("title")[0]
            pub_date = item.getElementsByTagName("pubDate")[0]

            (name, version) = title.firstChild.wholeText.split(" ")
            released_on = datetime.strptime(pub_date.firstChild.wholeText, '%d %b %Y %H:%M:%S GMT')
            result.append((name, version, released_on))
        except:
            continue

    return result
Example #33
def get_page_count():
    page_count = 0
    try:
        soup = utils.fetch_page('http://www.yousuu.com/category/all')
        page_indicator = soup.find(attrs={'class': 'books'}).ul.children
        for indicator_item in page_indicator:
            try:
                onclick = indicator_item.a['onclick']
                page_count = max(
                    page_count,
                    int(
                        str(onclick).split(',')[1].replace(')', '').replace(
                            '\'', '')))
            except Exception as e:
                print(e)

            print(indicator_item)
    except Exception as e:
        print(e)
    return max(1, page_count)
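
A worked example of the onclick parsing above; the attribute format is assumed for illustration:

onclick = "jumpPage('all', 42)"
int(str(onclick).split(',')[1].replace(')', '').replace('\'', ''))  # -> 42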
Example #34
def get_url(package, version=None):
    """
        Return homepage, repo, bugtracker URLs for a package
    """
    urls = {
            'homepage'   : '',
            'repository' : '',
            'bugtracker' : '',
        }

    # Twisted packages are many and follow a pattern but not hosted on PyPI
    if package in _twisted_mappings.keys():
        return {
            'homepage' : 'http://twistedmatrix.com/trac/wiki/%s' % package.replace('-', ''),
            'repository' : 'svn://svn.twistedmatrix.com/svn/Twisted/trunk',
            'bugtracker' : 'http://twistedmatrix.com/trac/ticket/%d',
        }

    pkg_name = _other_name(package)

    if not version:
        version, released_on = get_latest(pkg_name)

    release_data = fetch_page("https://pypi.python.org/pypi/%s/%s/json" % (pkg_name, version))
    release_data = json.loads(release_data)

    if (not release_data):
        logger.error("Can't find URL for %s-%s" % (package, version))
        return urls

    # home page
    if release_data['info'].has_key('home_page') and release_data['info']['home_page']:
        urls['homepage'] = release_data['info']['home_page']
    elif release_data['info'].has_key('package_url'):
        urls['homepage'] = release_data['info']['package_url']

    # bugtracker
    if release_data['info'].has_key('bugtrack_url') and release_data['info']['bugtrack_url']:
        urls['bugtracker'] = release_data['info']['bugtrack_url']

    return urls
Example #35
def get_latest_from_rss():
    """
        Return list of (name, version, released_on) of the
        latest versions of packages added to the index.
    """
    rss = fetch_page("http://pear.php.net/feeds/latest.rss")
    dom = parseString(rss)
    result = []
    for item in dom.getElementsByTagName("item"):
        try:
            title = item.getElementsByTagName("title")[0]
            pub_date = item.getElementsByTagName("dc:date")[0]

            (name, version) = title.firstChild.wholeText.split(" ")
            # NB: PEAR provides a timezone offset but we consider all dates in UTC
            released_on = datetime.strptime(pub_date.firstChild.wholeText, '%Y-%m-%dT%H:%M:%S-05:00')
            result.append((name, version, released_on))
        except:
            continue

    return result
Example #36
def get_latest_from_rss():
    """
        Return list of (name, version, released_on) of the
        latest versions of packages added to the index.
    """
    rss = fetch_page("http://pear2.php.net/?view=latest&format=rss")
    dom = parseString(rss)
    result = []
    for item in dom.getElementsByTagName("item"):
        try:
            title = item.getElementsByTagName("title")[0]
            pub_date = item.getElementsByTagName("dc:date")[0]

            (name, version) = title.firstChild.wholeText.split(" ")
            # NB: PEAR provides a timezone offset but we consider all dates in UTC
            released_on = datetime.strptime(pub_date.firstChild.wholeText, '%Y-%m-%dT%H:%M:%S+00:00')
            result.append((name, version, released_on))
        except:
            continue

    return result
Example #37
def get_books_from(books, page):
    try:
        soup = utils.fetch_page(
            'http://www.yousuu.com/category/all?page={}'.format(page))
        for book_subject in soup.find_all(
                attrs={'class': 'bd booklist-subject'}):
            book = {}
            for div in book_subject.children:
                if div['class'] == 'pull-right hidden-xs btn-group initshelf'.split(
                        ' '):
                    book['id'] = div['data-bid']
                elif div['class'] == ['post']:
                    book['cover'] = div.a.img['src']
                elif div['class'] == ['title']:
                    book['name'] = div.a.string
                elif div['class'] == ['rating']:
                    # number of people who rated the book
                    book['rating_num'] = int(
                        re.findall(r'\d+',
                                   re.findall(r'(\d+人评价)', str(div))[0])[0])
                elif div['class'] == ['abstract']:
                    book['rating'] = float(div.span.string)
                    # greedy matching: * and + match the longest string;
                    # a trailing ? makes the match non-greedy (shortest)
                    author_name_line = re.findall(r'作者:\s[\s\S]*?<br/>',
                                                  str(div))[0]
                    # strip the leading and trailing parts with re.sub
                    book['author_name'] = re.sub(r'作者:\s|<br/>', '',
                                                 author_name_line)
                    word_num_line = re.findall(r'字数:\s[\s\S]*?<br/>',
                                               str(div))[0]
                    book['word_num'] = re.sub(r'字数:\s|<br/>', '',
                                              word_num_line).strip()
                    updated_time_line = re.findall(r'最后更新:\s[\s\S]*?<br/>',
                                                   str(div))[0]
                    book['updated_time'] = re.sub(r'最后更新:\s|<br/>', '',
                                                  updated_time_line).strip()

            books.append(book)
    except Exception as e:
        print(e)
Example #38
def get_latest(package, last_checked=None):
    """
        Get the latest version of a package
    """
    json_data = fetch_page('http://rubygems.org/api/v1/versions/%s.json' %
                           package,
                           last_modified=last_checked)

    if json_data is None:  # NB: empty string is not None but will fail the check
        return 304, 304

    data = json.loads(json_data)

    for version in data:
        if version['prerelease']:
            continue
        else:
            return version['number'], get_release_date(package,
                                                       version['number'], data)

    # in case there are only pre-release versions
    return None, None
Example #39
def get_author_from_html(package, version):
    """
        Try to parse the HTML page and get the author
        for an older version.
    """
    package = _other_name(package)
    package = package.replace('::', '-')

    html = fetch_page('https://metacpan.org/release/%s' % package).split('\n')
    options = []
    for line in html:
        if line.find('<option value=') > -1:
            options.append(line.strip())

    # format is
    # <option value="ADAMK/PPI-HTML-0.01/">0.01 (2005-01-15)</option>
    for opt in options:
        author = opt.split('"')[1].split('/')[0]
        ver = opt.split('>')[1].split(' ')[0]
        if ver == version:
            return author

    return None
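
A worked example using the option format documented in the comment above:

opt = '<option value="ADAMK/PPI-HTML-0.01/">0.01 (2005-01-15)</option>'
author = opt.split('"')[1].split('/')[0]  # 'ADAMK'
ver = opt.split('>')[1].split(' ')[0]     # '0.01'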
Example #40
    rx = "#\d+"  # repeat so we can get a clean expression
    for e in exprs:
        rx = rx + "|" + e


#    print rx

    return re.compile(rx)

if __name__ == "__main__":
    from utils import fetch_page
    # NB: order is the same as in extract_title_and_dates_from_html()

    print extract_title_and_dates_from_html(
        fetch_page('https://github.com/cowboyd/therubyracer/pull/240'),
        BUG_TYPE_GITHUB)
    print extract_title_and_dates_from_html(
        fetch_page('https://github.com/boto/boto/issues/1757'),
        BUG_TYPE_GITHUB)
    print extract_title_and_dates_from_html(
        fetch_page('https://github.com/marcandre/backports/pull/63'),
        BUG_TYPE_GITHUB)
    print extract_title_and_dates_from_html(
        fetch_page('https://github.com/boto/boto/pull/407'), BUG_TYPE_GITHUB)
    print extract_title_and_dates_from_html(
        fetch_page('https://github.com/hmarr/django-ses/pull/40'),
        BUG_TYPE_GITHUB)
    print extract_title_and_dates_from_html(
        fetch_page('https://github.com/schacon/hg-git/issues/31'),
        BUG_TYPE_GITHUB)
Example #41
#    print extract_title_and_dates_from_html(fetch_page('https://github.com/hmarr/django-ses/pull/40'), BUG_TYPE_GITHUB)
#    print extract_title_and_dates_from_html(fetch_page('https://github.com/schacon/hg-git/issues/31'), BUG_TYPE_GITHUB)

#    print extract_title_and_dates_from_html(fetch_page('https://bugzilla.redhat.com/show_bug.cgi?id=800754'), BUG_TYPE_BUGZILLA)

#    print extract_title_and_dates_from_html(fetch_page('https://bitbucket.org/birkenfeld/pygments-main/issue/763'), BUG_TYPE_BITBUCKET)
#    print extract_title_and_dates_from_html(fetch_page('https://bitbucket.org/birkenfeld/pygments-main/issue/861'), BUG_TYPE_BITBUCKET)

#    print extract_title_and_dates_from_html(fetch_page('https://bugs.launchpad.net/pytz/+bug/207604'), BUG_TYPE_LAUNCHPAD)

#    print extract_title_and_dates_from_html(fetch_page('http://code.google.com/p/pysqlite/issues/detail?id=11'), BUG_TYPE_GOOGLE)
#    print extract_title_and_dates_from_html(fetch_page('http://code.google.com/p/pysqlite/issues/detail?id=23'), BUG_TYPE_GOOGLE)
#    print extract_title_and_dates_from_html(fetch_page('http://code.google.com/p/geopy/issues/detail?id=2'), BUG_TYPE_GOOGLE)


    print extract_title_and_dates_from_html(fetch_page('http://foolscap.lothar.com/trac/ticket/204'), BUG_TYPE_TRAC)
    print extract_title_and_dates_from_html(fetch_page('https://code.djangoproject.com/ticket/18436'), BUG_TYPE_TRAC)


#    print extract_title_and_dates_from_html(fetch_page('http://bugs.repoze.org/issue4'),  BUG_TYPE_ROUNDUP)
#    print extract_title_and_dates_from_html(fetch_page('http://bugs.repoze.org/issue85'), BUG_TYPE_ROUNDUP)
#    print extract_title_and_dates_from_html(fetch_page('http://bugs.repoze.org/issue43'), BUG_TYPE_ROUNDUP)

#    print extract_title_and_dates_from_html(fetch_page('http://sourceforge.net/tracker/?func=detail&aid=3552403&group_id=38414&atid=422030'), BUG_TYPE_SOURCEFORGE)
#    print extract_title_and_dates_from_html(fetch_page('http://sourceforge.net/p/pydev/bugs/1558/'), BUG_TYPE_SOURCEFORGE)

#    print extract_title_and_dates_from_html(fetch_page('http://psycopg.lighthouseapp.com/projects/62710/tickets/83'), BUG_TYPE_LIGHTHOUSE)
#    print extract_title_and_dates_from_html(fetch_page('http://psycopg.lighthouseapp.com/projects/62710/tickets/78'), BUG_TYPE_LIGHTHOUSE)
#    print extract_title_and_dates_from_html(fetch_page('http://psycopg.lighthouseapp.com/projects/62710/tickets/146'), BUG_TYPE_LIGHTHOUSE)
#    print extract_title_and_dates_from_html(fetch_page('http://psycopg.lighthouseapp.com/projects/62710/tickets/112'), BUG_TYPE_LIGHTHOUSE)