def test_basic_addXpaths(self):
    scraper = Scraper(debug=True)
    xpaths = [{
        "name": "Basic_Test",
        "raw": "//div",
        "children": [],
        "options": {}
    }, {
        "name": "Child_Test",
        "raw": "//div",
        "children": [{
            "name": "Basic_Child",
            "raw": "//div",
            "children": [],
            "options": {}
        }],
        "options": {}
    }, {
        "name": "Options_Test",
        "raw": "//div",
        "children": [],
        "options": {
            "text": True
        }
    }]
    scraper.addInstructions(xpaths)
    self.assertDictEqual(scraper._instruction_sets[0].get_init_dict(), {
        "name": "Basic_Test",
        "raw": "//div",
        "children": [],
        "options": {}
    })
    self.assertDictEqual(
        scraper._instruction_sets[1].get_init_dict(), {
            "name": "Child_Test",
            "raw": "//div",
            "children": [{
                "name": "Basic_Child",
                "raw": "//div",
                "children": [],
                "options": {}
            }],
            "options": {}
        })
    self.assertDictEqual(
        scraper._instruction_sets[2].get_init_dict(), {
            "name": "Options_Test",
            "raw": "//div",
            "children": [],
            "options": {
                "text": True
            }
        })
class App:
    __FILE_FORMAT = '.mp4'
    __TIMEOUT = config.BLOCKED_TIMEOUT

    def __init__(self, anime_url: str, download_path: str):
        self.__scraper = Scraper(anime_url)
        self.__downloader = Downloader(download_path)

    def download(self, episode: str) -> bool:
        while True:
            try:
                LOGGER.info(f'downloading episode {episode}')
                # acquire list of downloadable video urls
                videos = self.__scraper.get(episode)
                break
            except RequestBlocked:
                LOGGER.error(f'request blocked by anime heaven for episode {episode}, going to try again in {self.__TIMEOUT} seconds')
                time.sleep(self.__TIMEOUT)
        if not videos:
            LOGGER.error(f'url not found for episode {episode}')
            return False
        filename = self.__get_filename(episode)
        # NOTE: use first download url only
        todownload = videos[0]
        self.__downloader.download(filename, todownload)
        LOGGER.info(f'downloaded episode {episode}')
        return True

    def get_downloads(self) -> dict:
        return self.__downloader.get_downloads()

    def __get_filename(self, episode: str) -> str:
        return f'Episode-{episode}{self.__FILE_FORMAT}'
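# Hedged usage sketch for the App class above. The URL, download path, and
# episode number are illustrative assumptions, not values from the original
# project; the flow (retry until unblocked, then download the first URL) is
# what the class implements.
#
#   app = App('https://animeheaven.example/some-show', '/tmp/downloads')
#   if app.download('1'):
#       print(app.get_downloads())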
from src.scraper import Scraper

Scraper().scrape()
from src.scraper import Scraper

if __name__ == '__main__':
    scraper = Scraper()
    scraper.run()
class TestScraper(unittest.TestCase):
    def setUp(self):
        self.scraper = Scraper("IMDB")
        with codecs.open("./tests/fixtures/imdb_fight_club_movie_page_2018_09_10_minified.html") as f:
            self.movie_page_str = f.read().replace('\n', '')
        self.movie_page_xml = lxml.html.document_fromstring(self.movie_page_str)

    def tearDown(self):
        pass

    def test_construct_search_url(self):
        search_url = self.scraper.construct_search_url("Fight Club (1999)")
        self.assertEqual("http://www.imdb.com/find?q=fight+club+(1999)&s=all", search_url)

    def test_construct_search_url_unicode(self):
        search_url = self.scraper.construct_search_url(u"Amélie (2001)")
        self.assertEqual("http://www.imdb.com/find?q=am\xe9lie+(2001)&s=all", search_url)

    def test_get_title(self):
        self.assertEqual("Fight Club", self.scraper.get_title(self.movie_page_xml))

    # Renamed from a second `test_get_title` so it no longer shadows the test above.
    def test_get_alternative_title(self):
        self.assertEqual([], self.scraper.get_alternative_title(self.movie_page_xml))

    def test_get_description(self):
        self.assertEqual(
            'An insomniac office worker and a devil-may-care soapmaker form an underground fight club that evolves into something much, much more.',
            self.scraper.get_description(self.movie_page_xml)
        )

    def test_get_director(self):
        self.assertEqual(['David Fincher'], self.scraper.get_director(self.movie_page_xml))

    def test_rating(self):
        self.assertEqual("8.8", self.scraper.get_rating(self.movie_page_xml))

    def test_get_genres(self):
        self.assertEqual(['Drama'], self.scraper.get_genres(self.movie_page_xml))

    def test_get_votes(self):
        self.assertEqual("1,595,752", self.scraper.get_votes(self.movie_page_xml))

    def test_get_running_time(self):
        self.assertEqual("2h 19min", self.scraper.get_running_time(self.movie_page_xml))

    def test_get_content_rating(self):
        self.assertEqual('R', self.scraper.get_content_rating(self.movie_page_xml))

    def test_get_stars(self):
        self.assertEqual(
            ['Brad Pitt', 'Edward Norton', 'Meat Loaf'],
            self.scraper.get_stars(self.movie_page_xml)
        )

    def test_get_languages(self):
        self.assertEqual(['English'], self.scraper.get_languages(self.movie_page_xml))

    def test_get_image_url(self):
        self.assertEqual(
            'https://m.media-amazon.com/images/M/MV5BMjJmYTNkNmItYjYyZC00MGUxLWJhNWMtZDY4Nzc1MDAwMzU5XkEyXkFqcGdeQXVyNzkwMjQ5NzM@._V1_UX182_CR0,0,182,268_AL_.jpg',
            self.scraper.get_image_url(self.movie_page_xml)
        )

    def test_get_movie_year(self):
        self.assertEqual('1999', self.scraper.get_movie_year(self.movie_page_xml))

    def test_get_awards(self):
        self.assertEqual('Nominated for 1 Oscar.', self.scraper.get_awards(self.movie_page_xml))
def scrape_yard():
    crawler = AsynchronousCrawler() if settings.isASynchronous else Crawler()
    crawler.start()
    scraper = MultiprocessScraper() if settings.isMultiprocess else Scraper()
    scraper.start()
import re
import string

from nltk.corpus import stopwords

from src.scraper import Scraper
from src.instructions import *
from src.regex_functions import *

# measurments_regex_short = re.compile(r'([0-9?\.?\-]+)\s?((?![(pcs)(pc)(pairs)(x)])[a-z\"\']{1,3}(?![a-z]))')
clean_html_regex = re.compile(r"(?<=<\/li>)\s+(<\/li>\s+<\/ul>)")
clean_istr = Instruction("//div[@id='ResultSetItems']", "cleaned_results",
                         backup_xpaths=["//ul[@id='ListViewInner']"], return_html=True)


def clean_html(data):
    from time import time
    start = time()
    fixed = clean_html_regex.sub("</ul>", data)
    end = time()
    test = clean_istr(fixed)[1]
    print("Cleaning html took: " + str(end - start))
    return fixed


if __name__ == "__main__":
    s = Scraper(cores=1, debug=True)
    links = {"url": "https://www.ebay.com/sch/i.html?_from=R40&_nkw=&_sacat=180966&_ipg=25",
             "dictKey": "door_lock_images",
             "instructionSet": "getImages",
             "headers": {}}
    getImage = Instruction("./div[contains(@class,'lvpic')]/*/img", "image", attrib={'img': 'src'}, debug=True,
                           backup_xpaths=["./div[@class='s-item__wrapper clearfix']/div[@class='s-item__image-section']/div[@class='s-item__image']/a/*/img[@class='s-item__image-img']"])
    getList = Instruction("//div[@class='s-item__wrapper clearfix']", "listing", attrib={'link': 'href'}, children=[getImage])
    getItems = Instruction("//li[contains(@class,'sresult')]", "items", children=[getImage],
                           backup_xpaths=["//ul[@class='srp-results srp-list clearfix']/li[contains(@class,'item')]"], debug=True)
    s.addInstructions([getItems], "getImages")
    # s.run(links+links_2+links_3)
    s.run([links])
if q in data["quantity"]: data["quantity"][q] = max( data["quantity"][q], second_result[k][q]) else: data["quantity"][q] = second_result[k][q] title_shortened = title_shortened.replace(second, " ") data["product_ids"] = list(set(product_num.findall(title_shortened))) data["title_shortened"] = " ".join(title_shortened.split()) return data if __name__ == "__main__": # Example main using scraper s = Scraper(cores=6, debug=True, apply_functions=[clean_html]) # links = create_multi_pages("https://www.ebay.com/sch/i.html?_from=R40&_nkw=melamine+sponge&_sacat=0&_ipg=200", 1, "melamine_foam") + create_multi_pages("https://www.ebay.com/sch/i.html?_from=R40&_nkw=ddr4+16gb&_sacat=0&LH_TitleDesc=0&LH_TitleDesc=0&_ipg=200", 1, "ddr4_16gb") + create_multi_pages("https://www.ebay.com/sch/i.html?_from=R40&_nkw=yeezy&_sacat=0&LH_TitleDesc=0&LH_TitleDesc=0&_dmd=1&rt=nc&_ipg=200", 1, "yeezy") links = [{ "url": "https://www.ebay.com/sch/i.html?_from=R40&_nkw=melamine+foam&_sacat=0&LH_TitleDesc=0&_blrs=spell_check&LH_TitleDesc=0&_ipg=200", "dictKey": "melamine_foam", "instructionSet": "getListings", "headers": {} }, { "url": "https://www.ebay.com/sch/i.html?_from=R40&_nkw=ddr4+16gb&_sacat=0&LH_TitleDesc=0&LH_TitleDesc=0&_ipg=200", "dictKey": "ddr4_16gb", "instructionSet": "getListings", "headers": {} }, {
def ex_apply_function(data):
    # Example apply function: normalize a scraped price string to a float
    price = data["price"]
    price = price.replace('\t', "").replace("\n", "")
    price = price.split(" to ")[0].replace("$", "")
    price = price.split("Trending")[0].replace("$", "")
    try:
        data["price"] = float(price)
    except ValueError:
        # fall back to stripping thousands separators, e.g. "1,234.56"
        data["price"] = float(price.replace(",", ""))
    return data


if __name__ == "__main__":
    # Example main using scraper
    s = Scraper(cores=4, debug=True)
    import ast
    links = [
        {
            'url': 'https://www.ebay.com/sch/i.html?_odkw=blank&_osacat=0&_ipg=200&_from=R40&_trksid=m570.l1313&_nkw=blank&_sacat=0',
            'headers': {},
            'dictKey': 'blank_test',
            "instructionSet": "getListings"
        },
        {
            'url': 'https://www.ebay.com/sch/i.html?_odkw=starve&_osacat=0&_ipg=200&_from=R40&_trksid=m570.l1313&_nkw=starve&_sacat=0',
            'headers': {},
            'dictKey': 'starve_test',
            "instructionSet": "getListings"
        }
    ]
    # Using the function to generate the instruction dictionary
    listing_title = generate_instruction_dict("./h3[@class='lvtitle']/a", "title", text=True,
                                              backup_xpaths=[".//h3[@class='s-item__title']"])
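# Illustrative input/output for ex_apply_function above (the sample price string
# is an assumption, not real scraped data):
#
#   ex_apply_function({"price": "$1,234.56 to $1,300.00"})  ->  {"price": 1234.56}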
from src.scraper import Scraper
from src.data import Transformer
import os
from src.data import DatabaseWriter
from src.predictor import Predictor

wd = os.getcwd() + "/"
escreiper = Scraper(wd)
scraped_data_file_path = escreiper.scrape()

transformer = Transformer(scraped_data_file_path)
transformer.write()

dw = DatabaseWriter(wd + "../urgency_predictor_data/data.sqlite", "real", wd + "real_data.csv")
dw.write()

wd = os.getcwd() + "/"
p = Predictor(wd + "../urgency_predictor_data/data.sqlite")
p.fit()
p.predict(60)
p.write()
import os
import sys

if r'\examples' in os.getcwd():
    sys.path.insert(0, os.path.normpath(os.getcwd() + os.sep + os.pardir))
    os.chdir(os.path.normpath(os.getcwd() + os.sep + os.pardir))

from src.scraper import Scraper
from src.instructions import generate_instruction_dict

if __name__ == "__main__":
    # Example main using scraper
    s = Scraper(cores=4, debug=True)
    import ast
    links = [{'url': 'https://www.ebay.com/v/allcategories',
              'headers': {},
              'dictKey': 'categories',
              "instructionSet": "getCategories"}]
    # Using the function to generate the instruction dictionary
    sub_section_l2 = generate_instruction_dict(".//li[@class='sub-category']/a[@class='categories-with-links']",
                                               "l2_subsection", text=True, etree_text=True, attrib={"link": "href"})
    sub_section_l1 = generate_instruction_dict(".//div[@class='l1-name-wrapper']/a[@class='l1-name categories-with-links']/h3",
                                               "l1_subsection", text=True)
    sections = generate_instruction_dict("//div[@class='category-section']", "section",
                                         children=[sub_section_l1, sub_section_l2], attrib={'section': 'data-id'})
    instructions = [sections]
    # Running the scraper
    s.addInstructions(instructions, "getCategories")
    s.run(links)
import os
import sys

if r'\examples' in os.getcwd():
    sys.path.insert(0, os.path.normpath(os.getcwd() + os.sep + os.pardir))
    os.chdir(os.path.normpath(os.getcwd() + os.sep + os.pardir))

from src.scraper import Scraper
from src.instructions import *
from src.regex_functions import *
import string
from nltk.corpus import stopwords

if __name__ == "__main__":
    # Example main using scraper
    s = Scraper(cores=1, debug=True)
    links = [{
        "url": "https://www.baseball-reference.com/boxes/ARI/ARI201704020.shtml",
        "instructionSet": "getPlayerStats",
        "headers": {},
        "dictKey": "playerStats"
    }]
    getTable = Instruction("//div[@class='media-item logo']", "playerTable", return_html=True, debug=True)
    # Running the scraper
    num_results = data["listings"]["num_results"]
    total_pages = total // num_results
    rtr = []
    for i in range(2, total_pages + 1):
        rtr.append({
            "dictKey": key,
            "url": url + "&_pgn={}".format(i),
            "headers": {},
            "instructionSet": "getListings"
        })
    return [key, data, instruction_set], rtr


if __name__ == "__main__":
    # Example main using scraper
    s = Scraper(cores=2, debug=True, continuous_adding=True)
    links = [{
        'url': 'https://www.ebay.com/sch/i.html?_odkw=superlunary&_osacat=0&_ipg=200&_from=R40&_trksid=m570.l1313&_nkw=superlunary&_sacat=0',
        'headers': {},
        'dictKey': 'superlunary_test',
        "instructionSet": "getListings"
    }]
    rules = {"getListings": {"apply": ex_function, "keyApply": "_depth"}}
    stop_cond = {"dictKey": "superlunary_test"}
    s.continuous_params(rules, stop_cond)
    listing_title = generate_instruction_dict(".//h3[@class='lvtitle']/a", "title", text=True)
    listings = generate_instruction_dict(
        "//li[@class='sresult lvresult clearfix li shic']",
import time
import json
import os

from src.scraper import Scraper
from src.coindesk import CoinDeskMainPage
from redis import Redis
from kafka import KafkaProducer
from kafka.errors import NoBrokersAvailable

if __name__ == '__main__':
    redis = Redis(os.environ.get('REDIS_HOST', 'localhost'), db=0)

    # Retry until a Kafka broker is reachable
    while True:
        try:
            producer = KafkaProducer(
                bootstrap_servers=os.environ.get('KAFKA_HOST', '0.0.0.0') + ':9092',
                value_serializer=lambda v: v.encode('utf-8'))
            break
        except NoBrokersAvailable as e:
            print("Could not connect to Kafka, trying again in 5 sec")
            time.sleep(5)

    scrapers = [
        Scraper(CoinDeskMainPage(), redis=redis, kafka_producer=producer)
    ]

    while True:
        for scraper in scrapers:
            scraper.run()
        time.sleep(100)
"""Script entry point.""" from src.scraper import Scraper from config import URLS Scraper().scrape(URLS)
import os

import discord
from discord.ext import commands

from src.boy import Boy
from src.scraper import Scraper  # assumed import path (not shown in the original snippet)
from datetime import date
from dotenv import load_dotenv

load_dotenv()
token = os.getenv('DISCORD_TOKEN')
server_name = os.getenv('SERVER_NAME')

intents = discord.Intents.default()
intents.members = True

# client = discord.Client(intents=intents)
bot = commands.Bot(command_prefix='!', intents=intents)
scraper = Scraper()
botd = Boy()
# print(client.user)


@bot.event
async def on_ready():
    server = bot.guilds[0]
    if server.name == server_name:
        print(f'{bot.user} has connected to {server.name}, id: {server.id}')
        print([name.name for name in server.members])
"""
from src.scraper import Scraper

scraper = Scraper()
scraper.getPCodeList(18)
data = scraper.getRVList()