def main():
    """Query AWIS for the sites given on the command line and print a summary.

    Parses ``--key-id``, ``--secret-key`` and ``--sites``, calls
    ``AwisApi.url_info`` for those sites with ``Rank``, ``LinksInCount`` and
    ``Speed``, prints the raw response, and then prints the tag and text of
    the ``DataUrl`` and ``Rank`` elements found in each ``UrlInfoResult``.

    Raises:
        RuntimeError: if the response has no ``StatusCode`` element or its
            text is not ``Success``.
    """
    argparser = configargparse.ArgumentParser(
        description="AWIS API Proof of Concept")
    argparser.add_argument('--key-id', required=True)
    argparser.add_argument('--secret-key', required=True)
    argparser.add_argument('--sites', required=True, nargs='+')
    args = argparser.parse_args()

    client = AwisApi(args.key_id, args.secret_key)
    tree = client.url_info(args.sites, "Rank", "LinksInCount", "Speed")

    # Every print below receives one pre-formatted string, so the output is
    # the same whether ``print`` is the Python 2 statement or the Python 3
    # function.
    print(etree_tostring(tree))
    print("client ns_prefixes:  %s" % (client.NS_PREFIXES,))

    alexa_prefix = client.NS_PREFIXES['alexa']
    awis_prefix = client.NS_PREFIXES['awis']

    # Explicit check instead of ``assert``: asserts vanish under ``python -O``
    # and a missing element would otherwise surface as an AttributeError.
    elem = tree.find('//{%s}StatusCode' % alexa_prefix)
    if elem is None or elem.text != 'Success':
        raise RuntimeError(
            'AWIS request did not succeed: %s' % etree_tostring(tree))

    for elem_result in tree.findall('//{%s}UrlInfoResult' % awis_prefix):
        print("elem_result tag: %s, text: %s"
              % (elem_result.tag, elem_result.text))
        # Wrap the result so the '//' searches below are scoped to it.
        tree_result = ElementTree(elem_result)

        elem_url = tree_result.find('//{%s}DataUrl' % awis_prefix)
        if elem_url is not None:
            print("elem_url tag: %s, text: %s"
                  % (elem_url.tag, elem_url.text))

        elem_metric = tree_result.find('//{%s}Rank' % awis_prefix)
        if elem_metric is not None:
            print("elem_metric tag: %s, text: %s "
                  % (elem_metric.tag, elem_metric.text))
def get_metrics(cls, domains, metrics, options):
    """Fetch the requested AWIS metrics for each domain.

    Args:
        domains: indexable sequence of domains passed to
            ``AwisApi.url_info``; each result's ``DataUrl`` is compared with
            the domain at the same position.
        metrics: names passed to ``url_info`` and then looked up as elements
            inside every ``UrlInfoResult``.
        options: object providing ``key_id`` and ``secret_key`` attributes.

    Returns:
        A list with one dict per ``UrlInfoResult``, mapping each metric name
        to the text of the matching element.

    Raises:
        UserWarning: if the response status is missing or not ``Success``,
            or if a requested metric is absent from a result.
        AssertionError: if a result's ``DataUrl`` (minus one trailing slash)
            differs from the domain at the same position.
    """
    awis_client = AwisApi(options.key_id, options.secret_key)
    tree = awis_client.url_info(domains, *metrics)
    alexa_prefix = awis_client.NS_PREFIXES['alexa']
    awis_prefix = awis_client.NS_PREFIXES['awis']

    elem = tree.find('//{%s}StatusCode' % alexa_prefix)
    # A response without any StatusCode is reported like a failed one rather
    # than crashing with AttributeError on ``None.text``.
    if elem is None or elem.text != 'Success':
        raise UserWarning('unable to get metrics: %s' % etree_tostring(tree))

    metric_values = []
    elems_results = enumerate(
        tree.findall('//{%s}UrlInfoResult' % awis_prefix))
    for result_count, elem_result in elems_results:
        # Wrap the result so the '//' searches below are scoped to it.
        tree_result = ElementTree(elem_result)

        elem_url = tree_result.find('//{%s}DataUrl' % awis_prefix)
        if elem_url is not None:
            domain = elem_url.text
            # ``endswith`` is safe for empty text, unlike ``domain[-1]``.
            if domain and domain.endswith("/"):
                domain = domain[:-1]
            assert domain == domains[result_count], \
                "sanity check %s == %s" % (domain, domains[result_count])

        domain_metrics = {}
        for metric in metrics:
            elem_metric = tree_result.find('//{%s}%s' % (awis_prefix, metric))
            if elem_metric is None:
                raise UserWarning(
                    'unable to find metric within UrlInfoResult: %s'
                    % etree_tostring(tree_result))
            domain_metrics[metric] = elem_metric.text
        metric_values.append(domain_metrics)

    print("success: %s" % metric_values)
    return metric_values
def start(self, baseUrl):
    """Collect AWIS data for ``baseUrl`` and pass it to the report sender.

    Calls ``AwisApi.url_info`` for ``'http://' + baseUrl``. When the response
    status is ``Success`` the report content is filled with:

    * ``visitorsLocation`` - HTML list built from the first three
      ``aws:Country`` elements that carry a rank,
    * ``relatedLinks`` - HTML list built from the first five complete
      ``aws:RelatedLink`` elements,
    * ``worldRank`` - text of the first ``Rank`` element (``None`` if absent),
    * ``loadTimeMs`` - ``MedianLoadTime`` as an integer, when it has text.

    Text taken from the response is HTML-escaped before it is placed in the
    generated markup.

    Args:
        baseUrl: host name (without scheme) to look up.
    """
    from xml.sax.saxutils import escape

    def html(value):
        """Escape ``value`` for HTML text or a double-quoted attribute."""
        return escape('%s' % (value,), {'"': '&quot;'})

    def ordinal_suffix(rank):
        """Return the English ordinal suffix ('st', 'nd', 'rd', 'th')."""
        try:
            number = int(rank)
        except (TypeError, ValueError):
            return 'th'
        if 10 <= number % 100 <= 20:
            return 'th'
        return {1: 'st', 2: 'nd', 3: 'rd'}.get(number % 10, 'th')

    def first_text(node, tag):
        """Return the first child value of the first ``tag`` under ``node``."""
        matches = node.getElementsByTagName(tag)
        if not matches or matches[0].firstChild is None:
            return None
        return matches[0].firstChild.nodeValue

    def find_text(prefix, tag):
        """Return the text of the first ``tag`` in the response, or None."""
        elem = respXml.find('//{%s}%s' % (api.NS_PREFIXES[prefix], tag))
        return None if elem is None else elem.text

    queryUrl = 'http://' + baseUrl
    content = {}

    api = AwisApi(aws_config['accessKeyId'], aws_config['secretAccessKey'])
    respXml = api.url_info(queryUrl, 'RelatedLinks', 'Categories', 'Rank',
                           'RankByCountry', 'UsageStats', 'ContactInfo',
                           'Speed', 'Language', 'Keywords', 'OwnedDomains',
                           'LinksInCount', 'SiteData')

    if find_text('alexa', 'StatusCode') == 'Success':
        # The per-country and related-link sections are read through minidom
        # using the prefixed tag names of the serialized response.
        dom_doc = parseString(etree.tostring(respXml, encoding='UTF-8'))

        rank_list_items = []
        for country in dom_doc.getElementsByTagName('aws:Country'):
            country_code = country.getAttribute('Code')
            country_name = country_name_by_code(country_code)
            rank = first_text(country, 'aws:Rank')
            if rank is None:
                continue
            try:
                rank_list_items.append(
                    '<li>%(rank)s<sup>%(suffix)s</sup> most visited website in '
                    '<img src="/images/flags/%(countryCode)s.png" '
                    'alt="%(countryName)s flag" /> %(countryName)s</li>' % {
                        'countryCode': html(country_code.lower()),
                        'countryName': html(country_name),
                        'rank': html(rank),
                        'suffix': ordinal_suffix(rank),
                    })
            except Exception:
                # Best effort: one unformattable entry must not lose the
                # whole report, but KeyboardInterrupt/SystemExit still
                # propagate (the previous bare ``except`` swallowed them).
                continue
        content['visitorsLocation'] = (
            '<ul>' + ''.join(rank_list_items[:3]) + '</ul>')

        related_list_items = []
        for related in dom_doc.getElementsByTagName('aws:RelatedLink'):
            related_url = first_text(related, 'aws:NavigableUrl')
            related_title = first_text(related, 'aws:Title')
            if related_url is None or related_title is None:
                # Skip incomplete entries instead of raising IndexError.
                continue
            related_list_items.append(
                '<li><a href="%s" rel="nofollow" class="external" '
                'target="_blank">%s</a></li>'
                % (html(related_url), html(related_title)))
        content['relatedLinks'] = (
            '<ul>' + ''.join(related_list_items[:5]) + '</ul>')

        content['worldRank'] = find_text('awis', 'Rank')

        temp = find_text('awis', 'MedianLoadTime')
        if temp is not None:
            # ``int`` replaces the Python-2-only ``long``; Python 2 promotes
            # to long automatically when the value needs it.
            content['loadTimeMs'] = int(temp)

    # NOTE(review): indentation was lost in this copy of the source; the
    # report is assumed to be sent whether or not the lookup succeeded
    # (``content`` is then empty) - confirm against the original file.
    self.sendAndSaveReport(baseUrl, content)
import os

from awis import AwisApi

# Credentials are read from the environment.
ACCESS_ID = os.environ.get("AWS_ACCESS_ID", None)
SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", None)

# Explicit check instead of ``assert``: asserts are stripped under
# ``python -O``, which would let the client be built with missing credentials.
if not (ACCESS_ID and SECRET_ACCESS_KEY):
    raise RuntimeError("You must set credentials in the environment.")

api = AwisApi(ACCESS_ID, SECRET_ACCESS_KEY)

# Alternative query, kept for reference:
# tree = api.top_info(count=10, offset=0, path='Top', recursive=True, descriptions=True)
tree = api.url_info("www.dailydot.com", "Rank", "LinksInCount")
# AUTHOR: JOHN SKANDALAKIS
# USE: This program queries the Alexa API to get the URLs associated with an
# Alexa category. It needs an Alexa API key file as provided by AWS; the file
# (rootkey.csv) is opened relative to the current working directory.
from awis import AwisApi
import os.path

# Read the API key id and secret from the key file and create the API client.
# Each of the first two lines is expected to look like NAME=VALUE; split only
# on the first "=" so a value that itself contains "=" is not truncated.
ACCESS_ID = None
SECRET_ACCESS_KEY = None
with open("rootkey.csv", "r") as keyfile:
    ACCESS_ID = keyfile.readline().split("=", 1)[1].strip()
    SECRET_ACCESS_KEY = keyfile.readline().split("=", 1)[1].strip()
api = AwisApi(ACCESS_ID, SECRET_ACCESS_KEY)

# Go through the list of categories to fetch from Alexa.
with open('categories.csv', 'r') as c:
    for category in c:
        category = category.strip()
        fname = "categories/" + category.replace('/', '-')
        # Skip categories whose output file already exists so the same
        # (paid) query is not issued twice.
        if os.path.isfile(fname):
            # One pre-formatted argument prints identically on Python 2 and 3.
            print("%s already exists" % fname)
            continue
        i = 1
# USE: This program queries the Alexa API to get the URLs associated with an
# Alexa category. It needs an Alexa API key file as provided by AWS; the file
# (rootkey.csv) is opened relative to the current working directory.
from awis import AwisApi
import os.path

# Read the API key id and secret from the key file and create the API client.
# Each of the first two lines is expected to look like NAME=VALUE; split only
# on the first "=" so a value that itself contains "=" is not truncated.
ACCESS_ID = None
SECRET_ACCESS_KEY = None
with open("rootkey.csv", "r") as keyfile:
    ACCESS_ID = keyfile.readline().split("=", 1)[1].strip()
    SECRET_ACCESS_KEY = keyfile.readline().split("=", 1)[1].strip()
api = AwisApi(ACCESS_ID, SECRET_ACCESS_KEY)

# Go through the list of categories to fetch from Alexa.
with open('categories.csv', 'r') as c:
    for category in c:
        category = category.strip()
        fname = "categories/" + category.replace('/', '-')
        # Skip categories whose output file already exists so the same
        # (paid) query is not issued twice.
        if os.path.isfile(fname):
            # One pre-formatted argument prints identically on Python 2 and 3.
            print("%s already exists" % fname)
            continue
def test_unicode():
    """A category path containing non-ASCII characters yields listings."""
    access_id = os.environ["AWS_ACCESS_ID"]
    secret_key = os.environ["AWS_SECRET_ACCESS_KEY"]
    client = AwisApi(access_id, secret_key)

    response = client.category_listings("Top/World/Dansk/Børn_og_unge/Kultur")
    found = response.findall(".//awis:Listing", AwisApi.NS_PREFIXES)

    assert len(found) > 0
def __enter__(self):
    """Create the API client, start with empty session containers, return self."""
    self.api = AwisApi(*self.auth)
    # Fresh containers for this session: parsed rows and raw responses.
    self.session_result = dict()
    self.session_list_of_raw_result = list()
    return self
class AWISContextManager:
    """Context manager around ``AwisApi`` with a cache and chunked requests.

    Entering the context creates the API client and empty session containers;
    leaving it writes every entry of ``session_result`` to the cache and marks
    the session as closed.
    """

    def __init__(self, access_id, secret_access_key, workers_count=5):
        """Store UTF-8 encoded credentials, the worker count and a cache."""
        self.auth = tuple(
            part.encode('utf-8') for part in (access_id, secret_access_key))
        self.workers_count = workers_count
        self.closed = False
        self.cache = Cache()

    def __enter__(self):
        """Create the API client and start with empty session containers."""
        self.api = AwisApi(*self.auth)
        self.session_result = dict()
        self.session_list_of_raw_result = list()
        return self

    def __exit__(self, *args):
        """Store (or overwrite) the session's results in the cache and close."""
        for domain, row in self.session_result.items():
            self.cache.set(domain, row)
        self.closed = True

    def url_info(self, domains, *categories):
        """
        Call `AwisApi.url_info` for the non-cached domains, in chunks, on a
        thread pool, collecting each raw response as it completes.
        """
        if self.closed:
            raise AttributeError('Session is closed')

        chunk_size = 5  # AWIS's limit

        def fetch(batch, groups):
            logger.info('AWIS request for %s', batch)
            return self.api.url_info(batch, *groups)

        with ThreadPoolExecutor(self.workers_count) as executor:
            pending = []
            for batch in chunks(chunk_size, self.handle_cache(domains)):
                pending.append(executor.submit(fetch, batch, categories))

            for finished in as_completed(pending):
                self.session_list_of_raw_result.append(finished.result())

    def get_value(self, root, path, default=None):
        """
        Return the text of the first node matching `path` under `root`,
        or `default` when nothing matches.
        """
        node = root.find(self.handle_path(path), self.api.NS_PREFIXES)
        return getattr(node, 'text', default)

    def iter_results(self, path):
        """
        Yield `(domain, node, result_row)` for every node matching `path` in
        the collected raw responses; `result_row` is the per-domain entry of
        `session_result`, created on first use.
        """
        query = self.handle_path(path)
        for response in self.session_list_of_raw_result:
            for node in response.findall(query, self.api.NS_PREFIXES):
                domain = self.get_value(node, 'DataUrl')
                fresh_row = defaultdict(lambda: None)
                result_row = self.session_result.setdefault(domain, fresh_row)
                yield domain, node, result_row

    @staticmethod
    def handle_path(path):
        """Turn a slash-separated path into an `awis:`-prefixed search path."""
        # TODO: compile reg-exp?
        trimmed = path.strip('.//').strip('/')
        return './/awis:%s' % '/awis:'.join(trimmed.split('/'))

    def handle_cache(self, domains):
        """
        Yield the domains that have no (truthy) cached value; cached ones are
        copied straight into `session_result` instead.
        """
        for domain in domains:
            hit = self.cache.get(domain)
            if not hit:
                yield domain
            else:
                self.session_result[domain] = hit
def start(self, baseUrl):
    """Collect AWIS data for ``baseUrl`` and pass it to the report sender.

    Calls ``AwisApi.url_info`` for ``'http://' + baseUrl``. When the response
    status is ``Success`` the report content is filled with:

    * ``visitorsLocation`` - HTML list built from the first three
      ``aws:Country`` elements that carry a rank,
    * ``relatedLinks`` - HTML list built from the first five complete
      ``aws:RelatedLink`` elements,
    * ``worldRank`` - text of the first ``Rank`` element (``None`` if absent),
    * ``loadTimeMs`` - ``MedianLoadTime`` as an integer, when it has text.

    Text taken from the response is HTML-escaped before it is placed in the
    generated markup.

    Args:
        baseUrl: host name (without scheme) to look up.
    """
    from xml.sax.saxutils import escape

    def html(value):
        """Escape ``value`` for HTML text or a double-quoted attribute."""
        return escape('%s' % (value,), {'"': '&quot;'})

    def ordinal_suffix(rank):
        """Return the English ordinal suffix ('st', 'nd', 'rd', 'th')."""
        try:
            number = int(rank)
        except (TypeError, ValueError):
            return 'th'
        if 10 <= number % 100 <= 20:
            return 'th'
        return {1: 'st', 2: 'nd', 3: 'rd'}.get(number % 10, 'th')

    def first_text(node, tag):
        """Return the first child value of the first ``tag`` under ``node``."""
        matches = node.getElementsByTagName(tag)
        if not matches or matches[0].firstChild is None:
            return None
        return matches[0].firstChild.nodeValue

    def find_text(prefix, tag):
        """Return the text of the first ``tag`` in the response, or None."""
        elem = respXml.find('//{%s}%s' % (api.NS_PREFIXES[prefix], tag))
        return None if elem is None else elem.text

    queryUrl = 'http://' + baseUrl
    content = {}

    api = AwisApi(aws_config['accessKeyId'], aws_config['secretAccessKey'])
    respXml = api.url_info(queryUrl, 'RelatedLinks', 'Categories', 'Rank',
                           'RankByCountry', 'UsageStats', 'ContactInfo',
                           'Speed', 'Language', 'Keywords', 'OwnedDomains',
                           'LinksInCount', 'SiteData')

    if find_text('alexa', 'StatusCode') == 'Success':
        # The per-country and related-link sections are read through minidom
        # using the prefixed tag names of the serialized response.
        dom_doc = parseString(etree.tostring(respXml, encoding='UTF-8'))

        rank_list_items = []
        for country in dom_doc.getElementsByTagName('aws:Country'):
            country_code = country.getAttribute('Code')
            country_name = country_name_by_code(country_code)
            rank = first_text(country, 'aws:Rank')
            if rank is None:
                continue
            try:
                rank_list_items.append(
                    '<li>%(rank)s<sup>%(suffix)s</sup> most visited website in '
                    '<img src="/images/flags/%(countryCode)s.png" '
                    'alt="%(countryName)s flag" /> %(countryName)s</li>' % {
                        'countryCode': html(country_code.lower()),
                        'countryName': html(country_name),
                        'rank': html(rank),
                        'suffix': ordinal_suffix(rank),
                    })
            except Exception:
                # Best effort: one unformattable entry must not lose the
                # whole report, but KeyboardInterrupt/SystemExit still
                # propagate (the previous bare ``except`` swallowed them).
                continue
        content['visitorsLocation'] = (
            '<ul>' + ''.join(rank_list_items[:3]) + '</ul>')

        related_list_items = []
        for related in dom_doc.getElementsByTagName('aws:RelatedLink'):
            related_url = first_text(related, 'aws:NavigableUrl')
            related_title = first_text(related, 'aws:Title')
            if related_url is None or related_title is None:
                # Skip incomplete entries instead of raising IndexError.
                continue
            related_list_items.append(
                '<li><a href="%s" rel="nofollow" class="external" '
                'target="_blank">%s</a></li>'
                % (html(related_url), html(related_title)))
        content['relatedLinks'] = (
            '<ul>' + ''.join(related_list_items[:5]) + '</ul>')

        content['worldRank'] = find_text('awis', 'Rank')

        temp = find_text('awis', 'MedianLoadTime')
        if temp is not None:
            # ``int`` replaces the Python-2-only ``long``; Python 2 promotes
            # to long automatically when the value needs it.
            content['loadTimeMs'] = int(temp)

    # NOTE(review): indentation was lost in this copy of the source; the
    # report is assumed to be sent whether or not the lookup succeeded
    # (``content`` is then empty) - confirm against the original file.
    self.sendAndSaveReport(baseUrl, content)