Example #1
from scrapely import Scraper


def auto_generate(sampleurl, data, common_url):
    # Train on one annotated sample page, then scrape another page with the same layout.
    s = Scraper()
    s.train(sampleurl, data)
    # scrape() returns a list of dicts mapping each field to a list of matches.
    res = s.scrape(common_url)[0]
    for k, v in res.items():
        res[k] = v[0].replace('\n', '').strip()
    return res
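A minimal way to call this helper; the URLs and field values below are placeholders, not taken from the original project:

if __name__ == "__main__":
    sample_url = "http://example.com/products/1"    # annotated sample page (placeholder)
    training_data = {"name": "Example product", "price": "9.99"}
    target_url = "http://example.com/products/2"    # page with the same layout (placeholder)
    print(auto_generate(sample_url, training_data, target_url))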
Example #2
    def test_train_store_load_scrape(self):
        url1 = "http://www.icone.co.uk/lighting-suspension/copper-shade-by-tom-dixon/tom-dixon/tom-dixon/MSS45UKC/"
        data = {"name": "Copper Shade by Tom Dixon", "designer": "Tom Dixon", "price": "320"}
        s = Scraper()
        s.train(url1, data, encoding="latin1")

        f = StringIO()
        s.tofile(f)

        f.seek(0)
        s = Scraper.fromfile(f)

        url2 = "http://www.icone.co.uk/lighting-wall-and-ceiling/mesmeri-halo-chrome/artemide/eric-sole/0916024A/"
        data = s.scrape(url2, encoding="latin1")
        self.assertEqual(sorted(data[0].keys()), ["designer", "name", "price"])
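The same train/store/load/scrape round-trip also works against a regular file on disk instead of an in-memory StringIO buffer; a minimal sketch under that assumption (the scraper.json path is arbitrary):

from scrapely import Scraper

url1 = "http://www.icone.co.uk/lighting-suspension/copper-shade-by-tom-dixon/tom-dixon/tom-dixon/MSS45UKC/"
data = {"name": "Copper Shade by Tom Dixon", "designer": "Tom Dixon", "price": "320"}

s = Scraper()
s.train(url1, data, encoding="latin1")
with open("scraper.json", "w") as f:   # tofile() serialises the trained templates as JSON
    s.tofile(f)

with open("scraper.json") as f:        # restore the trained scraper later
    s = Scraper.fromfile(f)

url2 = "http://www.icone.co.uk/lighting-wall-and-ceiling/mesmeri-halo-chrome/artemide/eric-sole/0916024A/"
print(s.scrape(url2, encoding="latin1"))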
Example #3
from scrapely import Scraper
import sys
import json

try:
    scrape_site = sys.argv[1]
except IndexError:
    print 'Invalid arguments. Usage: python scrape.py <site-name>'
    sys.exit(2)

print 'Training the scraper with existing data-set'
s = Scraper()
result = {}
train_data = json.loads(open(scrape_site + '_train.json', 'r').read())
for data in train_data:
    s.train(data['url'], {'name': data['title']})

test_data = json.loads(open(scrape_site + '_tests.json', 'r').read())
for data in test_data:
    # scrape() returns a list of dicts; merge the first one into the result
    result.update(s.scrape(data['url'])[0])

open(scrape_site + '_result.json', 'w').write(json.dumps(result))
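For context, the training loop above only relies on each record having a url and a title, so a compatible <site>_train.json (here for a site name of example) could be produced like this; the URLs and titles are placeholders:

import json

train_records = [
    {"url": "http://example.com/article/1", "title": "First article title"},
    {"url": "http://example.com/article/2", "title": "Second article title"},
]
with open("example_train.json", "w") as f:
    json.dump(train_records, f)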
Example #4
"""
    @author: 挖掘机小王子
    @contact: [email protected]
    @software: PyCharm
    @file: test.py
    @time: 2019/12/6 11:53
    @desc:
"""
from scrapely import Scraper
import requests


scraper = Scraper()

url = 'https://www.ituring.com.cn/article'
data = {'name': 'duxingzhe', 'title': '这两天的面试经验总结'}
# response = requests.get(url).text
scraper.train(url, data)
result = scraper.scrape(url, encoding='utf-8')
print(result)
Example #5
import scraperwiki
import lxml.html
from scrapely import Scraper
from rdflib import RDF
from rdflib.graph import Graph
from rdflib import Literal, BNode, Namespace

catalogUrlFormat = 'http://www.omnibusrevue.de/buskatalog-578829.html?skip=%d'
busUrlFormat = 'http://www.omnibusrevue.de/bus-%s.html'
busUrlFormatWithName = 'http://www.omnibusrevue.de/{0}-{1}.html'

# (the start of CreateGraph() and the exampleData training dict are missing in the source)
        if obj != "k.A":
            graph.add((busRes, BUS[key], Literal(obj)))
    return graph.serialize(format='turtle')

busScraper = Scraper()
busScraper.train(busUrlFormat % '1120301', exampleData)

offset = 0
while True:
    html = scraperwiki.scrape(catalogUrlFormat % offset)
    root = lxml.html.fromstring(html)
    busIds = root.cssselect('input[type=checkbox]')
    if len(busIds) > 0:
        for busCheckbox in busIds:
            busUrl = busUrlFormat % busCheckbox.attrib['value']
            busGraph = CreateGraph(busCheckbox.attrib['value'], busScraper.scrape(busUrl))
            dataStored = {'url': busUrl, 'graph': busGraph}
            scraperwiki.sqlite.save(unique_keys=['url'], data=dataStored)
        offset += 20
    else:
        break
Example #6
from scrapely import Scraper
s = Scraper()
url1 = 'http://movie.douban.com/subject/1292063/'
data1 = {'name': u'美丽人生 La vita è bella', 'author': u'罗伯托·贝尼尼', 'time': '1997-12-20'}
s.train(url1, data1)

url2 = 'http://movie.douban.com/subject/1291560/'
# s.scrape(url2)
data2 = {'name': u'龙猫 となりのトトロ', 'author': u'宫崎骏', 'time': '1988-04-16'}
s.train(url2, data2)

url3 = 'http://movie.douban.com/subject/1293839/'
data3 = {'name': u'罗马假日 Roman Holiday', 'author': u'威廉·惠勒', 'time': '1953-08-27'}
# s.scrape(url3)
s.train(url3, data3)

url4 = 'http://movie.douban.com/subject/1292224/'
s.scrape(url4)


from scrapely import Scraper
s = Scraper()
url1 = 'http://movie.douban.com/subject/1292063/'
data1 = {'name': u'美丽人生 La vita è bella', 'author': u'罗伯托·贝尼尼', 'time': '1997-12-20'}
s.train(url1, data1)

url4 = 'http://movie.douban.com/subject/1292224/'
s.scrape(url4)
# with open('11.txt','wb') as afile:
# 	s.tofile(afile)
Example #7
    def create_data(self):
        training_url = "http://www.wholesalegaming.biz/startrek/trekalphastarterbox/"
        data_training = {"product": "Star Trek Alpha Unlimited Starter Box", "price": "$15.00"}

        #train scrapely
        scraper = Scraper()

        scraper.train(training_url, data_training)

        #get the URLs to check

        page_json = open("pages_to_check.json").read()

        #format (all strings in unicode) : {"urls" : [ <url1 string>, <url2 string>, ... , <urln string> ] }
        urls_to_check = json.loads(page_json)

        #get data

        #dictionary with "product name": "price"
        price_list = {}

        for each_url in urls_to_check["urls"]:
            scraped_data = scraper.scrape(each_url)
            #example of a scraped data: [{u'price': [u'&nbsp;$15.00&nbsp;'], u'product': [u'Star Trek Alpha Unlimited Starter Box']}]

            #let's sanitize the price to a float and make this a dictionary entry
            dollar_string = scraped_data[0]["price"][0].replace("&nbsp;","")
            removed_dollar_sign = dollar_string.replace("$", "")
            try:
                price_as_float = float(removed_dollar_sign)
            except ValueError:
                #If the value we got isn't convertible to a float, it is most
                #likely "Product Unavailable" and we deal with that case
                #further down.  N/A is our marker for it.
                price_as_float = "N/A"
            #get the product name by itself.
            product_name = scraped_data[0]["product"][0]

            #now add the sanitized price and product name to price list
            price_list[product_name] = [price_as_float, each_url]

        #Create a json file of the prices
        timestamp = strftime("%Y-%m-%d-%H:%M:%S", gmtime())
        with open("/tmp/prices-%s.json" % timestamp, "w") as fp:
            json.dump(price_list, fp, sort_keys=True, indent=4)

        #Compare this price list to the most "recent" price list
        recent_price_list = {}

        with open('/tmp/prices-recent.json', 'r') as fp:
            recent_price_list = json.load(fp)

        #This will be the output data of comparing the old data and new data
        #format: {
        #            "product_one_name":
        #                {
        #                     "old_price": <float>
        #                     "new_price": <float>,
        #                     "new_difference": <float of new price - old price>,
        #                     "is_difference": <boolean>,
        #                     "is_new_product": <boolean>,
        #                     "is_discontinued_product": <boolean>
        #                },
        #            "product_two_name":...
        #
        comparison_data = {}

        for old_product, old_price in recent_price_list.items():
            new_difference = 0.0
            is_difference = False
            is_new_product = False
            is_discontinued_product = False
            try:
                new_price = price_list[old_product]
                new_difference = new_price[0] - old_price[0]
            except(KeyError, TypeError):
                #take care of the case that old_product doesn't appear in price_list.
                #This also covers the case where the old_price isn't a float because
                #the old price is marked as N/A.
                new_price = [0.0]
                is_discontinued_product = True

            if new_difference != 0.0:
                is_difference = True

            comparison_data[old_product] = {
                                            "old_price": old_price[0],
                                            "new_price": new_price[0],
                                            "new_difference": new_difference,
                                            "is_difference": is_difference,
                                            "is_new_product": False,
                                            "is_discontinued_product": is_discontinued_product,
                                            "product_url": old_price[1]
                                        }

        #find all items on price_list that is not in recent_price_list
        new_inventory_set = set(price_list.keys()) - set(recent_price_list.keys())
        new_inventory_list = list(new_inventory_set)

        for each_product in new_inventory_list:
            comparison_data[each_product] = { "old_price": 0.0,
                                              "new_price": price_list[each_product][0],
                                              "new_difference": price_list[each_product][0],
                                              "is_difference": True,
                                              "is_new_product": True,
                                              "is_discontinued_product": False,
                                              "product_url": price_list[each_product][1]
                                        }

        #makes it easy to find the always most recent data
        with open("/tmp/price-comparison-recent.json", "w") as fp:
            json.dump(comparison_data, fp, sort_keys=True, indent=4)

        #update the recent prices
        with open("/tmp/prices-recent.json", "w") as fp:
            json.dump(price_list, fp, sort_keys=True, indent=4)

        #Create a file to be the most recent comparison data
        timestamp = strftime("%Y-%m-%d-%H:%M:%S", gmtime())
        #Only write the timestamped comparison file if at least one price changed.
        #(The original `if "True" in comparison_data` tested for a literal key "True",
        #which never matches; checking the is_difference flags is presumably the intent.)
        if any(entry["is_difference"] for entry in comparison_data.values()):
            filename = "/tmp/price-comparison-%s.json" % timestamp
            with open(filename, "w") as fp:
                json.dump(comparison_data, fp, sort_keys=True, indent=4)
            return filename

        return None
Example #8
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020/3/3 0:27
# @Author  : tanxw

# pip install scrapely
from scrapely import Scraper
s = Scraper()
train_url = 'http://example.webscraping.com/places/default/view/Afghanistan-1'
s.train(train_url, {'name': 'Afghanistan', 'population': '29,121,286'})
test_url = 'http://example.webscraping.com/places/default/view/United-Kingdom-239'
s.scrape(test_url)
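For reference, scrape() returns a list with one dict per matching template, and each field maps to a list of extracted strings; a short sketch of unpacking the result of the call above:

results = s.scrape(test_url)
first = results[0]   # a dict like {'name': [...], 'population': [...]}
print(first['name'][0], first['population'][0])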
from scrapely import Scraper

s = Scraper()

url = ""   # target page (left empty in the source)
data = {}  # training data (left empty in the source)
# Scraper.scrape() takes only a URL (plus an optional encoding); the example data
# belongs in train(), so this skeleton presumably meant:
s.train(url, data)
s.scrape(url)
Example #10
from scrapely import Scraper
import sys
import json

try:
    scrape_site = sys.argv[1]
except IndexError:
    print 'Invalid arguments. Usage: python scrape.py <site-name>'
    sys.exit(2)

print 'Training the scraper with existing data-set'
s = Scraper()
result = {}
train_data = json.loads(open(scrape_site + '_train.json', 'r').read())
for data in train_data:
    s.train(data['url'], {'name': data['title']})

test_data = json.loads(open(scrape_site + '_tests.json', 'r').read())
for data in test_data:
    # scrape() returns a list of dicts; merge the first one into the result
    result.update(s.scrape(data['url'])[0])

open(scrape_site + '_result.json', 'w').write(json.dumps(result))

Example #11
def scrapely_test():
    s = Scraper()
    train_url = 'http://example.python-scraping.com/view/Afghanistan-1'
    s.train(train_url, {'name': 'Afghanistan', 'population': '29,121,286'})
    test_url = 'http://example.python-scraping.com/view/United-Kingdom-239'
    print(s.scrape(test_url))
# (the start of this second snippet -- its imports, the `urls` list, the text handler `h`,
#  the Scraper instance `s`, `url1`, and the opening of the training dict, which presumably
#  also held a 'title' entry -- is missing in the source)
data = {
    'text': '<div id="article">',
    'author': 'Redazione Journey',
    'date': '22 mar 2017'
}
s.train(url1, data)

# file opener
file_wht = open('test.csv', "wb")
writer = csv.writer(file_wht,
                    delimiter=';',
                    quotechar='"',
                    quoting=csv.QUOTE_ALL)
writer.writerow(("Titolo", "Testo", "Autore", "Data"))

# get stuff
for item in urls:
    try:
        content = s.scrape(item)[0]
        title = h.handle(content["title"][0]).encode('utf-8')
        parsed_text = h.handle(content["text"][0]).encode('utf-8')
        author = h.handle(content["author"][0]).encode('utf-8')
        date = h.handle(content["date"][0]).encode('utf-8')

        print "Success!"
        tpl = (title, parsed_text, author, date)
        writer.writerow(tpl)
    except:
        print ":("

file_wht.close()
Example #13
# (the beginning of this snippet -- the imports, the Scraper instance `s`, `url1`
#  and the training `data` dict -- is missing in the source)
##pp = pprint.PrettyPrinter(indent=2)
#pprint.pprint(d)
#print d[0]['title'][0]
#print d[0]['category'][0]
#print d[0]['date'][0]

s.train(url1, data)

import os
import pickle   # used below to persist the scraped data

data = {}

#for dirname, dirnames, filenames in os.walk('../utf8/'):
#    for filename in filenames:

for fn in os.listdir('../utf8/'):
    print fn
    url2 = '../utf8/' + fn
    d = s.scrape(url2)
    try:
        data[fn] = {
                'title': unicode(d[0]['title'][0]),
                'category': unicode(d[0]['category'][0]),
                'date': unicode(d[0]['date'][0])
            }
    except Exception as e:
        print e

open('data.pickle', 'w').write(pickle.dumps(data))

import urllib, scraperwiki
from scrapely import Scraper

s = Scraper()                             # note how we're *not* using Scraper() - this uses our custom version
url1 = 'http://www.thefest.com/store/beatles-ornaments/the-beatles-applique-stocking-p-3901'
data = {'name': 'THE BEATLES APPLIQUE STOCKING', 'category': 'Beatles Ornaments', 'description': 'BRAND NEW- If you are good, maybe Santa will put something special in this poly/cotton applique stocking - He will have to work overtime to fill this! Measures 19" diagonally from upper left facing to the tip of the toe. This is the first Christmas Beatles Stocking ever offered!', 'price': '$20.00', 'catalog number': '7287'}
s.train(url1,data)
url2 = 'http://www.thefest.com/store/beatles-ornaments/yellow-submarines-light-set-p-3876'
print s.scrape(url2)