from webscraping import download


def get_emails(website, max_depth):
    """Returns a list of emails found at this website.

    max_depth is how deep to follow links.
    """
    D = download.Download()
    return D.get_emails(website, max_depth=max_depth)
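# Minimal usage sketch (hypothetical URL; assumes the webscraping package
# is installed and the site is reachable):
# emails = get_emails('http://example.com/', max_depth=2)
# print(emails)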
def EmailScrape(url):
    """Crawl url to depth 1 and return up to 25 email addresses found there."""
    D = download.Download()
    return D.get_emails(url, max_depth=1, max_urls=None, max_emails=25)
import csv
import zipfile
import StringIO
import urlparse

from webscraping import download, xpath


def download_locations():
    """Download zip code locations for each country from geonames
    and write a CSV of (place, zip code, lat, lng) per country.
    """
    D = download.Download(num_retries=1)
    index_url = 'http://download.geonames.org/export/zip/'
    index_html = D.get(index_url)
    for link in xpath.search(index_html, '//pre/a/@href'):
        if link.endswith('.zip') and '_full' not in link and 'allCountries' not in link:
            download_html = D.get(urlparse.urljoin(index_url, link))
            input_zip = StringIO.StringIO()
            input_zip.write(download_html)
            try:
                tsv_data = zipfile.ZipFile(input_zip).read(link.replace('.zip', '.txt'))
            except zipfile.BadZipfile as e:
                # corrupt download - drop it from the cache so it is retried next run
                print e
                del D.cache[urlparse.urljoin(index_url, link)]
                continue
            output_filename = link.replace('.zip', '_locations.csv')
            writer = csv.writer(open(output_filename, 'w'))
            found = set()
            for row in csv.reader(tsv_data.splitlines(), delimiter='\t'):
                zip_code = row[1] = row[1].split('-')[0]
                try:
                    lat, lng = float(row[9]), float(row[10])
                except ValueError:
                    print 'bad coord:', row[9], row[10]
                else:
                    if lat and lng and zip_code not in found:
                        found.add(zip_code)
                        place = row[2]
                        writer.writerow([place, zip_code, lat, lng])
            print 'Downloaded to', output_filename
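# For reference, each geonames row is tab-separated with columns (per the
# geonames zip-file readme): country code, postal code, place name,
# admin name/code 1-3, latitude, longitude, accuracy; hence the use of
# row[1], row[2], row[9] and row[10] above.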
import spacy
from bs4 import BeautifulSoup
from webscraping import common, download

nlp = spacy.load('en_core_web_sm')  # assumption: any English spaCy model will do


def politician_and_org(politician_name):
    """Fetch the politician's Wikipedia article and return its named entities."""
    wiki_url = 'http://en.wikipedia.org/wiki/%s' % '_'.join(politician_name.split())
    html = download.Download().fetch(wiki_url)
    text = common.remove_tags(BeautifulSoup(html, "lxml").text)
    # lowercasing simplifies later matching but can reduce spaCy's NER accuracy
    text = " ".join(text.split()).lower()
    doc = nlp(text)
    return doc.ents
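# A possible follow-up, splitting the returned entities by spaCy label
# (a sketch; the name is just an example):
# ents = politician_and_org('Barack Obama')
# orgs = [e.text for e in ents if e.label_ == 'ORG']
# people = [e.text for e in ents if e.label_ == 'PERSON']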
import csv
import os
import sys

from webscraping import download, xpath


def extract(url):
    '''Extract product info from websites listed in the csv page.

    It takes the url as an argument.
    '''
    try:
        url = url.encode('utf-8')
        D = download.Download()
        try:
            # join the absolute path so that the function can be called from anywhere
            xpath_input_file = open(os.path.join(os.path.dirname(__file__), 'webpage_xpath.csv'), 'rb')
        except IOError:
            # the csv file is missing or unreadable
            print("An error occurred while reading the csv file, check your directory again")
            sys.exit()
        rows = list(csv.reader(xpath_input_file))
        xpath_input_file.close()
        item_info = {}
        for row in rows:
            # checks that the given url belongs to a known site and is a product url;
            # fails when the url is of a known site but not a product page
            if url.find(row[0]) >= 0 and url.find(row[4]) >= 0:
                xpath1, xpath2, xpath3 = row[1], row[2], row[3]
                html = D.get(url)  # webpage downloads only after validation
                item_info['name'] = xpath.get(html, '%s//text()' % xpath1).strip()
                item_info['price'] = xpath.get(html, '%s//text()' % xpath2)
                item_info['image'] = xpath.get(html, '%s' % xpath3).strip()
                return item_info
        if not item_info:
            raise InvalidurlError("Enter a valid product url")  # custom exception, defined elsewhere
    except KeyboardInterrupt:
        raise
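# Judging from the indexing above, each webpage_xpath.csv row is expected
# to hold:
#   column 0: site domain fragment    column 1: name xpath
#   column 2: price xpath             column 3: image xpath
#   column 4: marker identifying product urls on that site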
def __init__(self, name):
    self.name = name
    self.normalizedName = self.name.lower().replace(' ', '-')
    # baseURL, place and jsonName are module-level settings
    self.url = '%s/%s/bushalte-%s' % (baseURL, place, self.normalizedName)
    self.downloader = download.Download()
    with open(jsonName % self.normalizedName) as jsonFile:
        jsonString = json.load(jsonFile)
    self.id = jsonString['busstop-id']
    self.lines = dict()
    for line in jsonString['lines']:
        self.lines[str(line['number']) + line['destination']] = BusLine(self, line)
    self.update()
def scrape_title(num, typ):
    """Scrape supply, value and holder stats for the MAN or QASH token from etherscan."""
    D = download.Download(read_cache=False)
    # MAN and QASH are module-level token contract addresses
    key = MAN if typ == 'MAN' else QASH
    url = 'https://etherscan.io/token/%s' % key
    html = D.get(url)
    ts = common.regex_get(html, r'Total\sSupply\:[^<]*</td>[^<]*<td>([^<]+)<')
    vt = common.regex_get(html, r'Value\sper\sToken\:[^<]*</td>[^<]*<td>([^<]+)<')
    th = common.regex_get(html, r'Token\sHolders\:[^<]*</td>[^<]*<td>([^<]+)<')
    with open('title_%s.txt' % typ, 'w') as f:
        f.write('Total Supply: %s\n' % ts)
        f.write('Value per Token: %s\n' % vt)
        f.write('Token Holders: %s\n' % th)
        f.write('No.Of.Transfers: %s\n' % num)
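# Example call (a sketch; num is the transfer count obtained elsewhere):
# scrape_title(128000, 'QASH')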
def scrapeBB(gamename):
    BB = download.Download(user_agent=None)
    search = gamename.replace(" ", "+")
    html = BB.fetch(
        "http://www.bestbuy.com/site/searchpage.jsp?_dyncharset=UTF-8&id=pcat17071&type=page&ks=960&st={}&sc=Global&cp=1&sp=&qp=category_facet%3DVideo+Games~abcat0700000&list=y&usc=All+Categories&nrp=15&iht=n&seeAll="
        .format(search))
    if not html:
        print("Couldn't connect to Best Buy's servers.")
        return 1  # sentinel: connection failed
    gametitle = xpath.search(html, '//h3[@itemprop="name"]//a')
    productlinks = xpath.search(html, '//h3[@itemprop="name"]//a/@href')
    gameprice = xpath.search(html, '//span[@itemprop="price"]')
    return (gametitle, productlinks, gameprice)
def scrapeGamestop(gamename):
    GS = download.Download()
    search = gamename.replace(" ", "+")
    html = GS.fetch(
        "http://www.gamestop.com/browse?nav=16k-3-{},28zu0".format(search))
    if not html:
        print("Couldn't connect to Gamestop's servers.")
        return 1  # sentinel: connection failed
    gametitle = xpath.search(html, '//div[@class="product_info grid_12"]//a[1]')
    productlinks = xpath.search(
        html, '//div[@class="product_info grid_12"]//a[1]/@href')
    gameprice = xpath.search(html, '//p[@class="pricing"]')
    return (gametitle, productlinks, gameprice)
def scrapeAmazon(gamename):
    AMA = download.Download(user_agent=None)
    search = gamename.replace(" ", "+")
    html = AMA.fetch(
        "http://www.amazon.com/gp/search/ref=sr_il_ti_videogames?rh=n%3A468642%2Ck%3A{}&keywords={}&ie=UTF8&qid=1407988315&lo=videogames"
        .format(search, search))
    if not html:
        print("Couldn't connect to Amazon's servers.")
        return 1  # sentinel: connection failed
    gametitle = xpath.search(
        html, '//div[@class="ilt3"]//a//span[@class="lrg bold"]')
    productlinks = xpath.search(html, '//div[@class="ilt3"]//a/@href')
    gameprice = xpath.search(html, '//div[@class="ill3"]//span[@class="red bld"]')
    return (gametitle, productlinks, gameprice)
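# The three scrapers share a return convention: a (titles, links, prices)
# tuple on success, or the sentinel 1 when the connection fails. A usage
# sketch (the game title is just an example):
# for scraper in (scrapeBB, scrapeGamestop, scrapeAmazon):
#     result = scraper('Mario Kart 8')
#     if result == 1:
#         continue
#     titles, links, prices = result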
from datetime import datetime

import tushare as ts
from sqlalchemy import create_engine
from webscraping import common, download, xpath


def incr_database(conn):
    """Incrementally update index quotes: CSI indices are scraped from
    csindex.com.cn, the rest are fetched through tushare.
    """
    D = download.Download(delay=0, read_cache=None, write_cache=None)
    src = 'http://www.csindex.com.cn/zh-CN/indices/index-detail/'
    for i in open('stocks.csv'):
        code = i.split('\t')[0]
        if 'CSI' in i or '000985' in i:
            url = src + code
            html = D.get(url)
            # '截止日期' is the as-of date shown on the index detail page
            trddate = common.regex_get(html, r'截止日期:([^<]+)<')
            if trddate:
                trddate = trddate.replace('-', '')
                m = xpath.search(html, r'//table[@class="table\stc"]/tr/td', remove=None)
                close = m[0] if m else None
                change = m[1] if m and len(m) > 1 else None
                sql = """
                    REPLACE INTO quote_csi(code, close, date, chg)
                    VALUES('%s', %s, %s, %s);
                """ % (code, close, trddate, change)
                conn.execute(sql)
        else:
            today = datetime.today().strftime('%Y-%m-%d')
            engine = create_engine('mysql://*****:*****@localhost:3306/dige', echo=False)
            try:
                df = ts.get_k_data(code, ktype='D', index=True, start=today, end=today)
                if not df.empty:
                    # remove any existing row for today before appending
                    sql = """
                        DELETE FROM quote_nocsi
                        WHERE code LIKE '%%%s%%' AND date = '%s'
                    """ % (code, today)
                    conn.execute(sql)
                    df.to_sql('quote_nocsi', engine, if_exists='append')
            except Exception as e:
                print e
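# The %-formatted SQL above breaks if a scraped value contains a quote; a
# parameterized sketch (assumes a DB-API cursor rather than the connection
# object used above) would look like:
# cursor.execute(
#     "REPLACE INTO quote_csi(code, close, date, chg) VALUES(%s, %s, %s, %s)",
#     (code, close, trddate, change))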
from webscraping import download

D = download.Download()
emails = D.get_emails("http://buklijas.info/", max_depth=2, max_urls=None, max_emails=None)
print(emails)
from webscraping import download, alg

Dobj = download.Download()
# get() fetches a single page; crawl limits such as max_depth, max_urls and
# max_emails belong to get_emails(), not get()
html = Dobj.get("http://www.sharing55tories.blogspot.com/")
emails = alg.extract_emails(html)
print emails
import os
import sys
from optparse import OptionParser
import re
import datetime
import webbrowser

from webscraping import common, download, webkit, xpath

DELAY = 5  # delay between downloads
IMAGE_DIR = 'images'  # directory to store screenshots
D = download.Download(delay=DELAY, num_retries=1)


def historical_screenshots(website, days):
    """Download screenshots for website since archive.org started crawling.

    website: the website to generate screenshots for
    days: the number of days difference between archived pages
    Returns a list of the downloaded screenshots
    """
    # the earliest archived time
    t0 = get_earliest_crawl(website)
    print 'Earliest version:', t0
    # the current time
    t1 = datetime.datetime.now()
    delta = datetime.timedelta(days=days)
    wb = webkit.WebkitBrowser(gui=True, enable_plugins=True, load_images=True)
def downloadPage(self, url):
    # bypass the local cache so each request fetches a fresh copy
    D = download.Download(read_cache=False, write_cache=False)
    return D.get(url)
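# Design note: with read_cache/write_cache disabled every call hits the
# network; by default webscraping's Download caches pages on disk, so this
# method suits cases where a fresh copy is required each time.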