Example #1
import json
import math

from scrapermodel import scraper


def scrape_Rss_Links(links, scrapedata):
    for link in links:
        value = link.text
        print(value)
        scraper_obj = scraper(value, scrapedata)
        if scraper_obj.db.articleExist(value):
            # Already in the database; check the next link.
            continue
        scraper_obj.get_data()
        # Stop after the first article that wasn't already stored.
        break
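    # Page through the sector-news endpoint: the first response carries the
    # total item count; subsequent pages are requested with take/skip offsets.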
    # https://www.calcalist.co.il/Ext/Comp/Allday/CdaStockNews_Xml/0,15250,L-604611-1-0-71-353,0
    # history_url = "https://www.calcalist.co.il/stocks/home/0,7340,L-3959-" + instrumentId + "--4,00.html"
    # history_xml and instrumentId are expected to be defined at module level.
    document = scraper.get_doc(scraper, history_xml)
    doc = json.loads(document)
    total_at = int(doc['Total'])
    pagesize = 50
    take = 50
    total_pages = math.ceil(total_at / pagesize)
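    # e.g. Total = 230 -> ceil(230 / 50) = 5 pages; skip takes 0, 50, 100, 150, 200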
    for i in range(0, int(total_pages)):
        skip = take * i
        start_document = json.loads(
            scraper.get_doc(
                scraper,
                "https://www.bizportal.co.il/capitalmarket/quote/AjaxRequests/SectorNews_Ajax?paperId=%s&take=%s&skip=%s&page=%s&pageSize=%s"
                % (instrumentId, take, skip, i + 1, pagesize)))
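        # e.g. with paperId=12345 and i=1 the request becomes:
        # .../SectorNews_Ajax?paperId=12345&take=50&skip=50&page=2&pageSize=50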

        items = start_document['Data']
        if items:
            for item in items:
                link = item['ArticleLink']
                # globes_config is expected to be defined at module level.
                scrape = scraper(link, globes_config)
                scrape.get_data()
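A minimal driver sketch for scrape_Rss_Links, not part of the original example: the Xmlparser module path, the 'Item/link' RSS layout, and the placeholder paper id are assumptions borrowed from Example #3 and its commented-out RSS workflow.

from xmlparsermodel import Xmlparser  # module name assumed

xmlparser = Xmlparser()
get_config = xmlparser.get_scraper_config('config.xml')
config = xmlparser.sort_scraper_data(get_config[3])  # 3 = www.bizportal.co.il per Example #3's config_map

rss_xml = xmlparser.get_rss(config['link'])
links = xmlparser.get_xml_find_all(rss_xml, 'Item/link')  # field layout assumed

instrumentId = '12345'  # placeholder paper id; the function reads it as a global
history_xml = ("https://www.bizportal.co.il/capitalmarket/quote/AjaxRequests/"
               "SectorNews_Ajax?paperId=%s&take=50&skip=0&page=1&pageSize=50" % instrumentId)
globes_config = config  # the article loop also expects this global
scrape_Rss_Links(links, config['scrape'])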
Example #3
import math

from pdbmodel import PdbModel
from scrapermodel import scraper
from xmlparsermodel import Xmlparser  # module name assumed; Xmlparser is instantiated below

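# Position of each site's entry in the config.xml list (order inferred from usage below).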
config_map = {
    'www.themarker.com': 0,
    'www.bizportal.co.il': 3,
    'www.calcalist.co.il': 2,
    'www.globes.co.il': 1
}

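# Articles older than 48 hours whose ck_feedback flag is still '0'.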
sql = """SELECT website, url
FROM scrapedata
WHERE created_at::date <= NOW() - INTERVAL '48 HOURS'
AND ck_feedback = '0'
ORDER BY created_at DESC"""

db = PdbModel('atscraper')
results = db.fetchall(sql)
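# Each row is (website, url): website keys into config_map, url is re-scraped for feedback.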
# Load config.xml once, then pick the scraper config for each row's site.
xmlparser = Xmlparser()
get_config = xmlparser.get_scraper_config('config.xml')
for result in results:
    config = xmlparser.sort_scraper_data(get_config[config_map[result[0]]])
    if 'feedbacks' in config['scrape']:
        scrape = scraper(result[1], config)
        count = scrape.get_feedback_data()
        db.updateFeedback(count)
        print(count)

# Get all instrument ids
sql = 'SELECT * FROM instrumentids'
allInstrumenIDs = db.fetchall(sql)
calcalist_config = xmlparser.sort_scraper_data(get_config[config_map['www.calcalist.co.il']])
for insID in allInstrumenIDs:
    instrumentId = insID[2]
    # Page 1 of the stock-news feed; it also carries the total item count.
    history_xml = "https://www.calcalist.co.il/Ext/Comp/Allday/CdaStockNews_Xml/0,15250,L-" + instrumentId + "-1-0-1-249,00.xml"
    # https://www.calcalist.co.il/Ext/Comp/Allday/CdaStockNews_Xml/0,15250,L-604611-1-0-71-353,0
    # history_url = "https://www.calcalist.co.il/stocks/home/0,7340,L-3959-" + instrumentId + "--4,00.html"
    document = scraper.get_doc(scraper, history_xml)
    doc = xmlparser.get_any_xml(document)
    total_at = int(xmlparser.get_xml_find(doc, 'total').text)
    total_pages = math.ceil(total_at / 30)  # the feed serves 30 items per page
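    # e.g. total = 95 items -> ceil(95 / 30) = 4 pages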
    print(total_pages)
    for i in range(1, total_pages + 1):  # pages are 1-indexed; include the final page
        start_document = xmlparser.get_any_xml(
            scraper.get_doc(
                scraper,
                "https://www.calcalist.co.il/Ext/Comp/Allday/CdaStockNews_Xml/0,15250,L-"
                + instrumentId + "-1-0-" + str(i) + "-249,00.xml"))
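        # e.g. for i=2 the request is .../CdaStockNews_Xml/0,15250,L-<instrumentId>-1-0-2-249,00.xml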

        items = xmlparser.get_xml_find_all(start_document, "Item/link")
        if items:
            for item in items:
                link = "https://www.calcalist.co.il" + item.text
                print(link)
                scrape = scraper(link, calcalist_config)
                scrape.get_data()
globes_config = xmlparser.sort_scraper_data(get_config[config_map['www.globes.co.il']])

# Get all instrument ids
sql = 'SELECT * FROM instrumentids'
allInstrumenIDs = db.fetchall(sql)
for insID in allInstrumenIDs:
    instrumentId = insID[1]
    history_url = "https://www.globes.co.il/portal/instrument.aspx?instrumentid=" + instrumentId + "&mode=news"
    document = scraper.get_doc(scraper, history_url)
    # The CSS selector is passed as a string (presumably evaluated inside get_history_links).
    article_links = scraper.get_history_links(scraper, document, 'self.soup.select(".mainArticletitle > a")')
    for at_link in article_links:
        if at_link:
            scrape = scraper("https://www.globes.co.il" + at_link['href'], globes_config)
            scrape.get_data()


# Alternative RSS-based workflow:
# get_all_work = xmlparser.get_scraper_config('config.xml')
#
# for scraperdata in get_all_work:
#     config_scraper_data = xmlparser.sort_scraper_data(scraperdata)
#     if config_scraper_data['history']:
#         scrapedata = config_scraper_data['scrape']
#         rss_link = config_scraper_data['link']
#         rss_xml = xmlparser.get_rss(rss_link)
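A possible continuation of the commented-out workflow above, assuming the RSS document exposes 'Item/link' entries and that they would be handed to scrape_Rss_Links from Example #1:

#         links = xmlparser.get_xml_find_all(rss_xml, 'Item/link')  # layout assumed
#         scrape_Rss_Links(links, scrapedata)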