def scrape_Rss_Links(links, scrapedata):
    """Scrape the first RSS link that is not already stored in the database."""
    global scraper
    for link in links:
        value = link.text
        print(value)
        scraper_obj = scraper(value, scrapedata)
        # print(scrapedata)
        if scraper_obj.db.articleExist(value):
            # Already scraped; move on to the next link.
            # print(scraper_obj.get_data())
            continue
        else:
            # Scrape the first unseen article, then stop.
            scraper_obj.get_data()
            break
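# Minimal usage sketch (hypothetical, kept commented out): feed the function
# the <link> elements of a parsed RSS feed. 'get_rss' and 'get_xml_find_all'
# mirror the xmlparser helpers used elsewhere in this repo; the feed URL and
# the 'channel/item/link' path are illustrative assumptions, not taken from
# the original code.
# rss_xml = xmlparser.get_rss('https://example.com/feed.xml')
# links = xmlparser.get_xml_find_all(rss_xml, 'channel/item/link')
# scrape_Rss_Links(links, config['scrape'])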
# https://www.calcalist.co.il/Ext/Comp/Allday/CdaStockNews_Xml/0,15250,L-604611-1-0-71-353,0
# history_url = "https://www.calcalist.co.il/stocks/home/0,7340,L-3959-"+instrumentId+"--4,00.html"
document = scraper.get_doc(scraper, history_xml)
doc = json.loads(document)
# print(doc['Total'])
total_at = int(doc['Total'])
pagesize = 50
take = 50
# skip = 0
total_pages = math.ceil(total_at / pagesize)
# print(total_pages)
for i in range(0, int(total_pages)):
    skip = take * i
    start_document = json.loads(
        scraper.get_doc(
            scraper,
            "https://www.bizportal.co.il/capitalmarket/quote/AjaxRequests/SectorNews_Ajax"
            "?paperId=%s&take=%s&skip=%s&page=%s&pageSize=%s"
            % (instrumentId, take, skip, i + 1, pagesize)))
    items = start_document['Data']
    # print(items)
    if items:
        for item in items:
            link = item['ArticleLink']
            # print(link)
            scrape = scraper(link, globes_config)
            scrape.get_data()
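# Worked example of the paging arithmetic above (the figures are illustrative,
# not from a real Bizportal response):
#   total_at = 120, pagesize = take = 50
#   total_pages = math.ceil(120 / 50) = 3
#   the loop then requests (skip=0, page=1), (skip=50, page=2), (skip=100, page=3)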
from pdbmodel import PdbModel
from scrapermodel import scraper
# Assumed import path for Xmlparser; the original file omitted this import.
from xmlparsermodel import Xmlparser

config_map = {
    'www.themarker.com': 0,
    'www.bizportal.co.il': 3,
    'www.calcalist.co.il': 2,
    'www.globes.co.il': 1
}

# Articles older than 48 hours whose feedback counters were never collected.
sql = """SELECT website, url FROM scrapedata
         WHERE created_at::date <= NOW() - INTERVAL '48 HOURS'
         AND ck_feedback = '0'
         ORDER BY created_at DESC"""

db = PdbModel('atscraper')
results = db.fetchall(sql)

# Get the scraper config for each site and collect its feedback counts.
xmlparser = Xmlparser()
get_config = xmlparser.get_scraper_config('config.xml')  # parse once, not per row
for result in results:
    config = xmlparser.sort_scraper_data(get_config[config_map[result[0]]])
    if 'feedbacks' in config['scrape']:
        scrape = scraper(result[1], config)
        count = scrape.get_feedback_data()
        db.updateFeedback(count)
        print(count)  # reuse the count instead of scraping a second time
    # print(result)
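# Hypothetical illustration of the routing above: a scrapedata row for Globes
# (the URL below is made up) selects index 1 of config.xml via config_map.
# row = ('www.globes.co.il', 'https://www.globes.co.il/news/article.aspx?did=1')
# config_map[row[0]]  # -> 1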
# Get all instrument ids.
sql = 'SELECT * FROM instrumentids'
allInstrumentIDs = db.fetchall(sql)
for insID in allInstrumentIDs:
    instrumentId = insID[2]
    # print(insID[1])
    history_xml = ("https://www.calcalist.co.il/Ext/Comp/Allday/CdaStockNews_Xml/0,15250,L-"
                   + instrumentId + "-1-0-1-249,00.xml")
    # https://www.calcalist.co.il/Ext/Comp/Allday/CdaStockNews_Xml/0,15250,L-604611-1-0-71-353,0
    # history_url = "https://www.calcalist.co.il/stocks/home/0,7340,L-3959-"+instrumentId+"--4,00.html"
    document = scraper.get_doc(scraper, history_xml)
    doc = xmlparser.get_any_xml(document)
    total_at = int(xmlparser.get_xml_find(doc, 'total').text)
    total_pages = math.ceil(total_at / 30)  # 30 items per feed page
    print(total_pages)
    # Pages appear to be 1-indexed (the fetch above is page 1), so iterate
    # through total_pages inclusive; range(1, total_pages) would drop the last page.
    for i in range(1, int(total_pages) + 1):
        start_document = xmlparser.get_any_xml(
            scraper.get_doc(
                scraper,
                "https://www.calcalist.co.il/Ext/Comp/Allday/CdaStockNews_Xml/0,15250,L-"
                + instrumentId + "-1-0-" + str(i) + "-249,00.xml"))
        items = xmlparser.get_xml_find_all(start_document, "Item/link")
        # print(items)
        if items:
            for item in items:
                link = "https://www.calcalist.co.il" + item.text
                print(link)
                scrape = scraper(link, globes_config)
                scrape.get_data()
globes_config = xmlparser.sort_scraper_data(get_config[1])  # index 1 = www.globes.co.il
# print(globes_config)

# Get all instrument ids.
sql = 'SELECT * FROM instrumentids'
allInstrumentIDs = db.fetchall(sql)
for insID in allInstrumentIDs:
    instrumentId = insID[1]
    # print(insID[1])
    history_url = ("https://www.globes.co.il/portal/instrument.aspx?instrumentid="
                   + instrumentId + "&mode=news")
    document = scraper.get_doc(scraper, history_url)
    article_links = scraper.get_history_links(
        scraper, document, 'self.soup.select(".mainArticletitle > a")')
    for at_link in article_links:
        # print(at_link['href'])
        if at_link:
            scrape = scraper("https://www.globes.co.il" + at_link['href'], globes_config)
            scrape.get_data()

# get_all_work = xmlparser.get_scraper_config('config.xml')
# for scraperdata in get_all_work:
#     config_scraper_data = xmlparser.sort_scraper_data(scraperdata)
#     if config_scraper_data['history']:
#         scrapedata = config_scraper_data['scrape']
#         rss_link = config_scraper_data['link']
#         rss_xml = xmlparser.get_rss(rss_link)
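# For reference, the selector string passed to get_history_links above is
# presumably eval'd against the scraper's BeautifulSoup document ('self.soup'
# in the string suggests as much). The stand-alone equivalent, as a sketch
# assuming scrapermodel uses bs4, would be:
#
#   from bs4 import BeautifulSoup
#   soup = BeautifulSoup(document, 'html.parser')
#   links = soup.select('.mainArticletitle > a')  # title anchors of each article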