import time

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

import cache


def fetch_webpage_text(url, use_cache=True):
    # set up a session that retries failed connections
    session = requests.Session()
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)

    # use the session to retrieve the data
    #site= "https://xtools.wmflabs.org/articleinfo/en.wikipedia.org/Black%20Lives%20Matter"
    hdr = {
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'none',
        'Accept-Language': 'en-US,en;q=0.8',
        'Connection': 'keep-alive'
    }

    url = url.encode('utf-8')
    if use_cache and cache.contains(url):
        return cache.get(url)

    req = session.get(url, headers=hdr)
    content = req.text
    cache.put(url, content)
    time.sleep(1)  # pause so repeated calls don't hammer the site
    return content
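
Every snippet on this page relies on a small cache module that is not shown, and the exact interface varies between examples (put/get here; set, add, or md5_key elsewhere). The following is only a minimal file-backed sketch of the init/contains/get/put calls used above; the directory name and md5 hashing are assumptions, not the original implementation.

# cache.py - minimal sketch, not the original module
import hashlib
import os

CACHE_DIR = "cache"  # assumed location for cached pages


def init():
    # create the cache directory if it does not exist yet
    os.makedirs(CACHE_DIR, exist_ok=True)


def _path(key):
    # hash the key so arbitrary URLs become safe file names
    if isinstance(key, str):
        key = key.encode("utf-8")
    return os.path.join(CACHE_DIR, hashlib.md5(key).hexdigest())


def contains(key):
    return os.path.exists(_path(key))


def get(key):
    with open(_path(key), encoding="utf-8") as f:
        return f.read()


def put(key, value):
    with open(_path(key), "w", encoding="utf-8") as f:
        f.write(value)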
Example No. 2
def fetch_webpage(url, use_cache=True):
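    # md5-hash the URL to use as the cache key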
    key = cache.md5_key(url)
    if use_cache and cache.contains(key):
        return cache.get(key)
    content = requests.get(url).text
    cache.put(key, content)
    return content
Example No. 3
def fetch_webpage_text(url, use_cache=True):
    if use_cache and cache.contains(url):
        return cache.get(url)
    # cache miss, download it
    content = requests.get(url).text
    cache.put(url, content)
    return content
Example No. 5
def fetch_webpage_text(url, use_cache=True):
    if use_cache and cache.contains(url):
        return cache.get(url)
    # if cache miss, download it and sleep one second to prevent too-frequent calls
    content = requests.get(url).text
    cache.put(url, content)
    time.sleep(1)
    return content
Example No. 7
def get_friends(session=None, target=None):
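    # return the cached friend list if we already fetched it, otherwise hit the VK API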
    starget = str(target)
    if cache.contains(starget):
        return cache.get(starget)
    else:
        try:
            friends = session.friends.get(user_id=target)["items"]
        except vk_api.exceptions.ApiError:
            print("[!] ID " + str(target) + " is private. Skipping")
            return -1
        cache.add(starget, friends)
        return friends
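
A rough usage sketch (not part of the original source): get_friends expects an authenticated vk_api API object as session, so a caller might wire things up roughly like this. The credentials, target ID, and the cache.init() call are placeholders and assumptions.

import vk_api

import cache

vk_session = vk_api.VkApi("user@example.com", "password")  # placeholder credentials
vk_session.auth()
vk = vk_session.get_api()

cache.init()  # assumes this cache module needs initialising, like the one in the test script below
friends = get_friends(session=vk, target=1)  # placeholder user ID
if friends != -1:
    print("Fetched {} friends".format(len(friends)))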
Example No. 8
import random
import string
import cache


def random_string(length):
    s = ''
    for i in range(length):
        s = s + random.choice(string.ascii_letters)
    return s


cache.init()

for n in range(1000):
    while True:
        key = random_string(20)
        if cache.contains(key):
            continue
        else:
            break
    value = random_string(20)
    cache.set(key, value)
    print("After {} iterations, cache has {} entries".format(n+1, cache.size()))
    print(key)


import json
from operator import itemgetter
import configparser

import iso3166
import requests

import cache

config = configparser.ConfigParser()
config.read("settings.cfg")

geonames_username = config.get("geonames", "username")
print("Talking to Geonames as " + geonames_username)
GEONAMES_API_URL = "http://api.geonames.org/searchJSON"

print("Finding largest cities in country:")
alpha3_to_city = {}
for country in iso3166.countries:
    print("  " + country.name)
    cache_key = country.alpha3 + "-geocode"
    results_text = None
    if cache.contains(cache_key):
        results_text = cache.get(cache_key)
    else:
        response = requests.get(GEONAMES_API_URL,
                                params={'country': country.alpha2,
                                        'q': country.name.split(",")[0],
                                        'username': geonames_username})
        results_text = response.content
        cache.put(cache_key, results_text)
    results = json.loads(results_text)
    try:
        cities = sorted([place for place in results['geonames'] if "PPL" in place['fcode']],
                        key=itemgetter('population'), reverse=True)
    except KeyError:
        print("Error! Couldn't find any fcodes")
        continue
    if len(cities) > 0:
        # remember the most populous populated place for this country
        alpha3_to_city[country.alpha3] = cities[0]
Example No. 12
        4: 'duration',
        5: 'summary',
        6: 'date_posted'
    }.get(x, 0)


# set up logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# let's scrape
url = BASE_URL + START_PAGE
logger.info("Scraping UFO reports from %s" % url)

# first grab the index page
if not cache.contains(url):
    index_page = requests.get(url)
    logger.debug("\tadded to cache from %s" % url)
    cache.put(url, index_page.text)
content = cache.get(url)

# now pull out all the links to the individual UFO shape pages
dom = BeautifulSoup(content, "html.parser")

#/html/body/p/table/tbody/tr[1]/td[1]/font/a

link_tags = dom.select("td a")
logger.debug("\tfound %d link tags" % len(link_tags))
links = set([tag['href'] for tag in link_tags])  # get all the unique urls
logger.info("\tfound %d links to UFO shapes" % len(links))