from scrapely import Scraper

def auto_generate(sampleurl, data, common_url):
    # Train a scraper on one annotated sample page, then scrape a structurally
    # similar page and flatten each extracted field to a plain string.
    s = Scraper()
    s.train(sampleurl, data)
    res = s.scrape(common_url)[0]
    for k, v in res.items():
        res[k] = v[0].replace('\n', '').strip()
    return res
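# A minimal usage sketch for auto_generate (not from the original project): the
# URLs and field values below are placeholder assumptions; substitute a real
# hand-annotated page and a structurally similar target page.
sample_url = 'http://example.com/products/1'   # page whose fields were annotated by hand
target_url = 'http://example.com/products/2'   # similar page to extract from
fields = auto_generate(sample_url,
                       {'name': 'Example product', 'price': '9.99'},
                       target_url)
print(fields)  # e.g. {'name': '...', 'price': '...'}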
def test_train_store_load_scrape(self):
    url1 = "http://www.icone.co.uk/lighting-suspension/copper-shade-by-tom-dixon/tom-dixon/tom-dixon/MSS45UKC/"
    data = {"name": "Copper Shade by Tom Dixon", "designer": "Tom Dixon", "price": "320"}
    s = Scraper()
    s.train(url1, data, encoding="latin1")

    f = StringIO()
    s.tofile(f)
    f.seek(0)
    s = Scraper.fromfile(f)

    url2 = "http://www.icone.co.uk/lighting-wall-and-ceiling/mesmeri-halo-chrome/artemide/eric-sole/0916024A/"
    data = s.scrape(url2, encoding="latin1")
    self.assertEqual(sorted(data[0].keys()), ["designer", "name", "price"])
from scrapely import Scraper
import sys
import json

try:
    scrape_site = sys.argv[1]
except IndexError:
    print 'Invalid arguments. Usage: python scrape.py <site-name>'
    sys.exit(2)

print 'Training the scraper with existing data-set'
s = Scraper()
result = {}

train_data = json.loads(open(scrape_site + '_train.json', 'r').read())
for data in train_data:
    s.train(data['url'], {'name': data['title']})

test_data = json.loads(open(scrape_site + '_tests.json', 'r').read())
for data in test_data:
    # scrape() returns a list of dicts; keep the first match for each test URL
    result[data['url']] = s.scrape(data['url'])[0]

open(scrape_site + '_result.json', 'w').write(json.dumps(result))
""" @author: 挖掘机小王子 @contact: [email protected] @software: PyCharm @file: test.py @time: 2019/12/6 11:53 @desc: """ from scrapely import Scraper import requests scraper = Scraper() url = 'https://www.ituring.com.cn/article' data = {'name': 'duxingzhe', 'title': '这两天的面试经验总结'} # response = requests.get(url).text scraper.train(url, data) result = scraper.scrape(url, encoding='utf-8') print(result)
import lxml.html
import scraperwiki
from rdflib import Graph, Literal, Namespace, RDF
from scrapely import Scraper

# BUS, busUrlFormat, busUrlFormatWithName, catalogUrlFormat and exampleData are
# defined elsewhere in the original script and are not shown in this excerpt.

OR = Namespace("http://purl.org/wikibus/omnibusrevue/bus/")
FOAF = Namespace("http://xmlns.com/foaf/0.1/")


def CreateGraph(busId, busData):
    graph = Graph()
    busRes = OR[busId]
    graph.add((busRes, RDF.type, BUS["Bus"]))
    graph.add((busRes, FOAF["page"],
               Literal(busUrlFormatWithName.format(busData[0]['model'][0].encode('utf-8'), busId))))
    for key in busData[0]:
        obj = busData[0][key][0].encode('utf-8')
        if obj != "k.A":
            graph.add((busRes, BUS[key], Literal(obj)))
    return graph.serialize(format='turtle')


busScraper = Scraper()
busScraper.train(busUrlFormat % '1120301', exampleData)

offset = 0
while True:
    html = scraperwiki.scrape(catalogUrlFormat % offset)
    root = lxml.html.fromstring(html)
    busIds = root.cssselect('input[type=checkbox]')
    if len(busIds) > 0:
        for busCheckbox in busIds:
            busUrl = busUrlFormat % busCheckbox.attrib['value']
            busGraph = CreateGraph(busCheckbox.attrib['value'], busScraper.scrape(busUrl))
            dataStored = {'url': busUrl, 'graph': busGraph}
            scraperwiki.sqlite.save(unique_keys=['url'], data=dataStored)
        offset += 20
    else:
        break
from scrapely import Scraper

# First run: train on three Douban movie pages, then scrape a fourth.
s = Scraper()

url1 = 'http://movie.douban.com/subject/1292063/'
data1 = {'name': u'美丽人生 La vita è bella', 'author': u'罗伯托·贝尼尼', 'time': '1997-12-20'}
s.train(url1, data1)

url2 = 'http://movie.douban.com/subject/1291560/'
# s.scrape(url2)
data2 = {'name': u'龙猫 となりのトトロ', 'author': u'宫崎骏', 'time': '1988-04-16'}
s.train(url2, data2)

url3 = 'http://movie.douban.com/subject/1293839/'
data3 = {'name': u'罗马假日 Roman Holiday', 'author': u'威廉·惠勒', 'time': '1953-08-27'}
# s.scrape(url3)
s.train(url3, data3)

url4 = 'http://movie.douban.com/subject/1292224/'
s.scrape(url4)

# Second run: retrain from scratch with a single example and scrape the same page.
from scrapely import Scraper

s = Scraper()
url1 = 'http://movie.douban.com/subject/1292063/'
data1 = {'name': u'美丽人生 La vita è bella', 'author': u'罗伯托·贝尼尼', 'time': '1997-12-20'}
s.train(url1, data1)

url4 = 'http://movie.douban.com/subject/1292224/'
s.scrape(url4)

# with open('11.txt', 'wb') as afile:
#     s.tofile(afile)
def create_data(self):
    training_url = "http://www.wholesalegaming.biz/startrek/trekalphastarterbox/"
    data_training = {"product": "Star Trek Alpha Unlimited Starter Box", "price": "$15.00"}

    # train scrapely
    scraper = Scraper()
    scraper.train(training_url, data_training)

    # get the URLs to check
    # format (all strings in unicode): {"urls": [<url1 string>, <url2 string>, ..., <urln string>]}
    page_json = open("pages_to_check.json").read()
    urls_to_check = json.loads(page_json)

    # get data: dictionary with "product name": [price, url]
    price_list = {}
    for each_url in urls_to_check["urls"]:
        scraped_data = scraper.scrape(each_url)
        # example of scraped data:
        # [{u'price': [u' $15.00 '], u'product': [u'Star Trek Alpha Unlimited Starter Box']}]
        # sanitize the price to a float and make this a dictionary entry
        dollar_string = scraped_data[0]["price"][0].replace(" ", "")
        removed_dollar_sign = dollar_string.replace("$", "")
        try:
            price_as_float = float(removed_dollar_sign)
        except ValueError:
            # If the value isn't convertible to a float, it most likely is
            # "Product Unavailable"; N/A will be our tell for that case later on.
            price_as_float = "N/A"
        # get the product name by itself
        product_name = scraped_data[0]["product"][0]
        # add the sanitized price and product name to the price list
        price_list[product_name] = [price_as_float, each_url]

    # Create a json file of the prices
    timestamp = strftime("%Y-%m-%d-%H:%M:%S", gmtime())
    with open("/tmp/prices-%s.json" % timestamp, "w") as fp:
        json.dump(price_list, fp, sort_keys=True, indent=4)

    # Compare this price list to the most recent price list
    recent_price_list = {}
    with open('/tmp/prices-recent.json', 'r') as fp:
        recent_price_list = json.load(fp)

    # This will be the output data of comparing the old data and new data.
    # format: {
    #     "product_one_name": {
    #         "old_price": <float>,
    #         "new_price": <float>,
    #         "new_difference": <float of new price - old price>,
    #         "is_difference": <boolean>,
    #         "is_new_product": <boolean>,
    #         "is_discontinued_product": <boolean>
    #     },
    #     "product_two_name": ...
    # }
    comparison_data = {}
    for old_product, old_price in recent_price_list.iteritems():
        new_difference = 0.0
        is_difference = False
        is_new_product = False
        is_discontinued_product = False
        try:
            new_price = price_list[old_product]
            new_difference = new_price[0] - old_price[0]
        except (KeyError, TypeError):
            # Handles old_product not appearing on price_list, and the case where
            # the old price isn't a float because it was marked as N/A.
            new_price = [0.0]
            is_discontinued_product = True
        if new_difference != 0.0:
            is_difference = True
        comparison_data[old_product] = {
            "old_price": old_price[0],
            "new_price": new_price[0],
            "new_difference": new_difference,
            "is_difference": is_difference,
            "is_new_product": False,
            "is_discontinued_product": is_discontinued_product,
            "product_url": old_price[1]
        }

    # find all items on price_list that are not in recent_price_list
    new_inventory_set = set(price_list.keys()) - set(recent_price_list.keys())
    new_inventory_list = list(new_inventory_set)
    for each_product in new_inventory_list:
        comparison_data[each_product] = {
            "old_price": 0.0,
            "new_price": price_list[each_product][0],
            "new_difference": price_list[each_product][0],
            "is_difference": True,
            "is_new_product": True,
            "is_discontinued_product": False,
            "product_url": price_list[each_product][1]
        }

    # makes it easy to find the always most recent data
    with open("/tmp/price-comparison-recent.json", "w") as fp:
        json.dump(comparison_data, fp, sort_keys=True, indent=4)

    # update the recent prices
    with open("/tmp/prices-recent.json", "w") as fp:
        json.dump(price_list, fp, sort_keys=True, indent=4)

    # Create a timestamped comparison file only if at least one product changed.
    timestamp = strftime("%Y-%m-%d-%H:%M:%S", gmtime())
    if any(entry["is_difference"] for entry in comparison_data.values()):
        filename = "/tmp/price-comparison-%s.json" % timestamp
        with open(filename, "w") as fp:
            json.dump(comparison_data, fp, sort_keys=True, indent=4)
        return filename
    return None
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time   : 2020/3/3 0:27
# @Author : tanxw

# pip install scrapely
from scrapely import Scraper

s = Scraper()
train_url = 'http://example.webscraping.com/places/default/view/Afghanistan-1'
s.train(train_url, {'name': 'Afghanistan', 'population': '29,121,286'})

test_url = 'http://example.webscraping.com/places/default/view/United-Kingdom-239'
s.scrape(test_url)
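# Hedged follow-up sketch (not part of the original script): scrape() returns a
# list of candidate dicts whose values are lists of extracted strings, so a
# single field can be read out like this; the exact strings depend on the live page.
result = s.scrape(test_url)
# e.g. [{'name': ['United Kingdom'], 'population': ['...']}]
print(result[0]['name'][0].strip())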
from scrapely import Scraper

s = Scraper()
url = ""    # page to train on
data = {}   # field name -> example value annotated on that page
s.train(url, data)
def scrapely_test():
    s = Scraper()
    train_url = 'http://example.python-scraping.com/view/Afghanistan-1'
    s.train(train_url, {'name': 'Afghanistan', 'population': '29,121,286'})
    test_url = 'http://example.python-scraping.com/view/United-Kingdom-239'
    print(s.scrape(test_url))
import json
import re
import sys

from scrapely import Scraper


def update_scrapers_file(url):
    # Map the URL's domain to the scraper file name that will be written,
    # and record that mapping in scrapers.json.
    domain = re.search(r'(?<=\/\/)[\w\.-]+(?=\/)', url).group()
    scrapers_json = {}
    with open('scrapers.json', 'r') as scrapers_file:
        scrapers_json = json.load(scrapers_file)
    scraper_file_name = domain + ".json"
    scrapers_json[domain] = scraper_file_name
    with open('scrapers.json', 'w') as scrapers_file:
        json.dump(scrapers_json, scrapers_file)
    return scraper_file_name


# TODO add help and verbose modes
# TODO add arg validation and error feedback
scraper = Scraper()
training_params = open_training_file()
assert training_params, "no training parameters found in {}".format(sys.argv[1])

url = training_params['url']
params = training_params['params']
scraper.train(url, params)

# TODO replace this with database action and maybe do checksum compare
# to avoid writing the same scraper more than once?
scraper_file_name = update_scrapers_file(url)
with open(scraper_file_name, 'w') as scraper_file:
    scraper.tofile(scraper_file)
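# open_training_file() is called above but its definition is not part of this
# snippet. A minimal sketch of what it might do, assuming the training
# parameters live in a JSON file whose path is passed as the first command-line
# argument (this helper is reconstructed as an assumption, not original code):
def open_training_file():
    if len(sys.argv) < 2:
        return None
    with open(sys.argv[1], 'r') as f:
        # expected shape: {"url": "<page to train on>", "params": {"field": "example value", ...}}
        return json.load(f)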
import csv

import html2text
from scrapely import Scraper
from urls import urls

h = html2text.HTML2Text()
s = Scraper()

# train
url1 = 'http://www.coca-colaitalia.it/storie/il-primo-ingrediente-dei-nostri-prodotti-e-lacqua'
data = {
    'title': 'Il primo ingrediente dei nostri prodotti è l’acqua. Ecco come lo preserviamo',
    'text': '<div id="article">',
    'author': 'Redazione Journey',
    'date': '22 mar 2017'
}
s.train(url1, data)

# file opener
file_wht = open('test.csv', "wb")
writer = csv.writer(file_wht, delimiter=';', quotechar='"', quoting=csv.QUOTE_ALL)
writer.writerow(("Titolo", "Testo", "Autore", "Data"))

# get stuff
for item in urls:
    try:
        content = s.scrape(item)[0]
        title = h.handle(content["title"][0]).encode('utf-8')
        parsed_text = h.handle(content["text"][0]).encode('utf-8')
# Excerpt: s, url1 and the earlier keys of the training dict below are defined
# above this fragment and are not shown here.
    'category': '类别:Linux'
}

if len(sys.argv) > 1:
    url2 = sys.argv[1]
else:
    url2 = 'fa2ebd45db2fd724cefca317.html'

# import pprint
# pp = pprint.PrettyPrinter(indent=2)
# pprint.pprint(d)
# print d[0]['title'][0]
# print d[0]['category'][0]
# print d[0]['date'][0]

s.train(url1, data)

import os

data = {}
# for dirname, dirnames, filenames in os.walk('../utf8/'):
#     for filename in filenames:
for fn in os.listdir('../utf8/'):
    print fn
    url2 = '../utf8/' + fn
    d = s.scrape(url2)
    try:
        data[fn] = {
            'title': unicode(d[0]['title'][0]),