Пример #1
0
def auto_generate(sampleurl,data,common_url):
    """Train a scrapely Scraper on a sample page, then scrape a target page.

    Trains on ``sampleurl`` with the field mapping ``data``, scrapes
    ``common_url``, and returns the first result with each field's first
    extracted value flattened to a newline-free, stripped string.
    """
    scraper = Scraper()
    scraper.train(sampleurl, data)
    first_result = scraper.scrape(common_url)[0]
    return {
        field: values[0].replace('\n', '').strip()
        for field, values in first_result.items()
    }
Пример #2
0
    def _fallback(self, template, html, source):
        """Fallback extraction path using a lazily created scrapely Scraper.

        Trains the parser from a previously stored object's attributes, then
        scrapes the given page and yields replicated template objects
        populated with the extracted values.  This is a generator; the
        trailing ``return []`` merely ends iteration.
        """
        if not self.scrapely_parser:
            self.scrapely_parser = Scraper()

        html = self.scrapely_parser.HtmlPage(body=html)
        # NOTE(review): `uri` is not defined anywhere in this scope, and
        # `objct` is only assigned later in the loop below — presumably
        # `uri` should come from `source.url`; confirm against the caller.
        db_objct = self.db.read(uri, objct)
        # BUG FIX: the original guarded with `if not db_objct:` and then
        # immediately called db_objct.attrs_to_dict() inside that branch,
        # which would raise AttributeError whenever no object was found.
        # Train/scrape only when a stored object actually exists.
        if db_objct:
            data = db_objct.attrs_to_dict()

            self.scrapely_parser.train_from_htmlpage(html, data)
            attr_dicts = self.scrapely_parser.scrape_page(html)

            for attr_dict in attr_dicts:
                objct = template._replicate(name=template.name, url=source.url)
                # Add the parsed values.
                objct.attrs_from_dict(attr_dict)
                yield objct
        # Generator return value is discarded by iteration; kept for parity.
        return []
Пример #3
0
    def test_extraction(self):
        """Train on one sample page, scrape a second, and verify extraction
        still works after a tofile/fromfile round trip."""
        samples_encoding = 'latin1'
        (train_html, train_data), (target_html, expected) = list(iter_samples(
            'scraper_loadstore', html_encoding=samples_encoding))
        scraper = Scraper()
        train_page = HtmlPage(body=train_html, encoding=samples_encoding)
        scraper.train_from_htmlpage(train_page, train_data)

        target_page = HtmlPage(body=target_html, encoding=samples_encoding)
        self._assert_extracted(scraper.scrape_page(target_page), expected)

        # The scraper must survive serialization/deserialization unchanged.
        buf = StringIO()
        scraper.tofile(buf)
        buf.seek(0)
        reloaded = Scraper.fromfile(buf)
        self._assert_extracted(reloaded.scrape_page(target_page), expected)
Пример #4
0
# Google Search Package: https://breakingcode.wordpress.com/2010/06/29/google-search-python/
# Scrapely Package: https://github.com/scrapy/scrapely
# https://www.analyticsvidhya.com/blog/2015/10/beginner-guide-web-scraping-beautiful-soup-python/
# https://stackoverflow.com/questions/3898574/google-search-using-python-script

#imports
import urllib2
from bs4 import BeautifulSoup
from googlesearch.googlesearch import GoogleSearch
import csv
from scrapely import Scraper
from bs4 import UnicodeDammit
from collections import Counter
import re
import time
s = Scraper()


query = raw_input("Search Query: ")
try:
    n = int(raw_input("# of Websites to Scrape: "))
except ValueError:
    print "Enter Valid # of Websites"
    sys.exit()
'''
UNIXtime = int(time.time())
filename = query.replace(" ","_").lower()+"_"+str(n)+"_"+str(UNIXtime)
print filename
'''
# initialize dictionary to store search results
# rows: Name, Author, Description, Url
Пример #5
0
 def __init__(self, threshold=0.75, k=5):
     """Store configuration and create a fresh scrapely ``Scraper``.

     The semantics of ``threshold`` and ``k`` are not visible in this
     block — presumably a match/score cutoff and a neighbour or result
     count; confirm against the methods that read them.
     """
     self.threshold = threshold  # score cutoff (assumed) — default 0.75
     self.k = k                  # count parameter (assumed) — default 5
     self.scraper = Scraper()    # scrapely Scraper used for extraction
Пример #6
0
"""
    @author: 挖掘机小王子
    @contact: [email protected]
    @software: PyCharm
    @file: test.py
    @time: 2019/12/6 11:53
    @desc:
"""
from scrapely import Scraper
import requests


# Demo: train a scrapely scraper on a known page, then re-scrape that page.
scraper = Scraper()

url = 'https://www.ituring.com.cn/article'
# Training sample: map field names to the exact strings present on the page.
data = {'name': 'duxingzhe', 'title': '这两天的面试经验总结'}
# response = requests.get(url).text
scraper.train(url, data)
# Re-scrape the same URL with the learned template; decode the page as UTF-8.
result = scraper.scrape(url, encoding='utf-8')
print(result)
Пример #7
0
# RDF namespaces for the wikibus/Omnibus Revue dataset.
BUS = Namespace("http://purl.org/wikibus/omnibusrevue/")  # vocabulary terms (predicates, Bus class)
OR = Namespace("http://purl.org/wikibus/omnibusrevue/bus/")  # individual bus resources
FOAF = Namespace("http://xmlns.com/foaf/0.1/")  # FOAF vocabulary (used for foaf:page)

def CreateGraph(busId, busData):
    """Build an RDF graph for one bus and return it serialized as Turtle.

    busId: identifier used as the resource name under the OR namespace.
    busData: scrapely scrape result — a list whose first element maps
        field names to lists of extracted strings.
    """
    graph = Graph()
    busRes = OR[busId]
    graph.add((busRes, RDF.type, BUS["Bus"]))
    graph.add((busRes, FOAF["page"], Literal(busUrlFormatWithName.format(busData[0]['model'][0].encode('utf-8'), busId))))
    for key in busData[0]:
        obj = busData[0][key][0].encode('utf-8')
        # BUG FIX: the original used `<>`, which was removed in Python 3
        # (and long deprecated in Python 2); `!=` is the portable spelling.
        # "k.A" ("keine Angabe") marks a missing value — skip those fields.
        if obj != "k.A":
            graph.add((busRes, BUS[key], Literal(obj)))
    return graph.serialize(format='turtle')

busScraper = Scraper()
busScraper.train(busUrlFormat % '1120301', exampleData)

offset = 0
while True:
    html = scraperwiki.scrape(catalogUrlFormat % offset)
    root = lxml.html.fromstring(html)
    busIds = root.cssselect('input[type=checkbox]')
    if len(busIds) > 0:
        for busCheckbox in busIds:
            busUrl = busUrlFormat % busCheckbox.attrib['value']
            busGraph = CreateGraph(busCheckbox.attrib['value'], busScraper.scrape(busUrl))
            dataStored = {'url': busUrl, 'graph': busGraph}
            scraperwiki.sqlite.save(unique_keys=['url'], data=dataStored)
        offset += 20
    else:
Пример #8
0
def scrapely_test():
    """Demo: train scrapely on one country page, then scrape another."""
    scraper = Scraper()
    training_url = 'http://example.python-scraping.com/view/Afghanistan-1'
    sample_fields = {'name': 'Afghanistan', 'population': '29,121,286'}
    scraper.train(training_url, sample_fields)
    # A structurally similar page: the learned template should transfer.
    target_url = 'http://example.python-scraping.com/view/United-Kingdom-239'
    print(scraper.scrape(target_url))
import urllib, scraperwiki
from scrapely import Scraper

s = Scraper()                             # note how we're *not* using Scraper() - this uses our custom version
url1 = 'http://www.thefest.com/store/beatles-ornaments/the-beatles-applique-stocking-p-3901'
data = {'name': 'THE BEATLES APPLIQUE STOCKING', 'category': 'Beatles Ornaments', 'description': 'BRAND NEW- If you are good, maybe Santa will put something special in this poly/cotton applique stocking - He will have to work overtime to fill this! Measures 19" diagonally from upper left facing to the tip of the toe. This is the first Christmas Beatles Stocking ever offered!', 'price': '$20.00', 'catalog number': '7287'}
s.train(url1,data)
url2 = 'http://www.thefest.com/store/beatles-ornaments/yellow-submarines-light-set-p-3876'
print s.scrape(url2)import urllib, scraperwiki
from scrapely import Scraper

s = Scraper()                             # note how we're *not* using Scraper() - this uses our custom version
url1 = 'http://www.thefest.com/store/beatles-ornaments/the-beatles-applique-stocking-p-3901'
data = {'name': 'THE BEATLES APPLIQUE STOCKING', 'category': 'Beatles Ornaments', 'description': 'BRAND NEW- If you are good, maybe Santa will put something special in this poly/cotton applique stocking - He will have to work overtime to fill this! Measures 19" diagonally from upper left facing to the tip of the toe. This is the first Christmas Beatles Stocking ever offered!', 'price': '$20.00', 'catalog number': '7287'}
s.train(url1,data)
url2 = 'http://www.thefest.com/store/beatles-ornaments/yellow-submarines-light-set-p-3876'
print s.scrape(url2)