Example #1
    def test_basic_addXpaths(self):
        scraper = Scraper(debug=True)
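        # three instruction dicts: a minimal one, one with a nested child, and one with a text option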
        xpaths = [{
            "name": "Basic_Test",
            "raw": "//div",
            "children": [],
            "options": {}
        }, {
            "name":
            "Child_Test",
            "raw":
            "//div",
            "children": [{
                "name": "Basic_Child",
                "raw": "//div",
                "children": [],
                "options": {}
            }],
            "options": {}
        }, {
            "name": "Options_Test",
            "raw": "//div",
            "children": [],
            "options": {
                "text": True
            }
        }]

        scraper.addInstructions(xpaths)

        self.assertDictEqual(scraper._instruction_sets[0].get_init_dict(), {
            "name": "Basic_Test",
            "raw": "//div",
            "children": [],
            "options": {}
        })
        self.assertDictEqual(
            scraper._instruction_sets[1].get_init_dict(), {
                "name":
                "Child_Test",
                "raw":
                "//div",
                "children": [{
                    "name": "Basic_Child",
                    "raw": "//div",
                    "children": [],
                    "options": {}
                }],
                "options": {}
            })
        self.assertDictEqual(
            scraper._instruction_sets[2].get_init_dict(), {
                "name": "Options_Test",
                "raw": "//div",
                "children": [],
                "options": {
                    "text": True
                }
            })
Example #2
class App:
    __FILE_FORMAT = '.mp4'
    __TIMEOUT = config.BLOCKED_TIMEOUT


    def __init__(self, anime_url: str, download_path: str):
        self.__scraper = Scraper(anime_url)
        self.__downloader = Downloader(download_path)

    def download(self, episode: str) -> bool:
        while True:
            try:
                LOGGER.info(f'downloading episode {episode}')
                
                # acquire list of downloadable video urls
                videos = self.__scraper.get(episode)
                break
            except RequestBlocked:
                LOGGER.error(f'request blocked by anime heaven for episode {episode}, going to try again in {self.__TIMEOUT} seconds')
                time.sleep(self.__TIMEOUT)

        if not videos:
            LOGGER.error(f'url not found for episode {episode}')
            return False

        filename = self.__get_filename(episode)
        # NOTE: use first download url only
        todownload = videos[0]
        self.__downloader.download(filename, todownload)        

        LOGGER.info(f'downloaded episode {episode}')
        return True

    def get_downloads(self) -> dict:
        return self.__downloader.get_downloads()

    def __get_filename(self, episode: str) -> str:
        return f'Episode-{episode}{self.__FILE_FORMAT}'
Example #3
from src.scraper import Scraper

Scraper().scrape()
Example #4
from src.scraper import Scraper

if __name__ == '__main__':
    scraper = Scraper()
    scraper.run()
Example #5
def setUp(self):
  self.scraper = Scraper("IMDB")

  with codecs.open("./tests/fixtures/imdb_fight_club_movie_page_2018_09_10_minified.html") as f:
    self.movie_page_str = f.read().replace('\n', '')
    self.movie_page_xml = lxml.html.document_fromstring(self.movie_page_str)
Example #6
class TestScraper(unittest.TestCase):
  
  def setUp(self):
    self.scraper = Scraper("IMDB")
    
    with codecs.open("./tests/fixtures/imdb_fight_club_movie_page_2018_09_10_minified.html") as f:
      self.movie_page_str = f.read().replace('\n', '')
      self.movie_page_xml = lxml.html.document_fromstring(self.movie_page_str)
  
  def tearDown(self):
    pass

  def test_construct_search_url(self):
    search_url = self.scraper.construct_search_url("Fight Club (1999)")
    self.assertEquals("http://www.imdb.com/find?q=fight+club+(1999)&s=all", search_url)
  
  def test_construct_search_url_unicode(self):
    search_url = self.scraper.construct_search_url(u"Amélie (2001)")
    self.assertEquals("http://www.imdb.com/find?q=am\xe9lie+(2001)&s=all", search_url)

  def test_get_title(self):
    self.assertEqual("Fight Club", self.scraper.get_title(self.movie_page_xml))

  def test_get_alternative_title(self):
    self.assertEqual([], self.scraper.get_alternative_title(self.movie_page_xml))

  def test_get_description(self):
    self.assertEqual(
      'An insomniac office worker and a devil-may-care soapmaker form an underground fight club that evolves into something much, much more.',
      self.scraper.get_description(self.movie_page_xml)
    )

  def test_get_director(self):
    self.assertEqual(['David Fincher'], self.scraper.get_director(self.movie_page_xml))

  def test_rating(self):
    self.assertEquals("8.8", self.scraper.get_rating(self.movie_page_xml))

  def test_get_genres(self):
    self.assertEqual(['Drama'], self.scraper.get_genres(self.movie_page_xml))

  def test_get_votes(self):
    self.assertEquals("1,595,752", self.scraper.get_votes(self.movie_page_xml))

  def test_get_running_time(self):
    self.assertEquals("2h 19min", self.scraper.get_running_time(self.movie_page_xml))

  def test_get_content_rating(self):
    self.assertEqual('R', self.scraper.get_content_rating(self.movie_page_xml))

  def test_get_stars(self):
    self.assertEqual(
      ['Brad Pitt', 'Edward Norton', 'Meat Loaf'],
      self.scraper.get_stars(self.movie_page_xml)
    )

  def test_get_languages(self):
    self.assertEqual(['English'], self.scraper.get_languages(self.movie_page_xml))

  def test_get_image_url(self):
    self.assertEqual(
        'https://m.media-amazon.com/images/M/MV5BMjJmYTNkNmItYjYyZC00MGUxLWJhNWMtZDY4Nzc1MDAwMzU5XkEyXkFqcGdeQXVyNzkwMjQ5NzM@._V1_UX182_CR0,0,182,268_AL_.jpg',
      self.scraper.get_image_url(self.movie_page_xml)
    )

  def test_get_movie_year(self):
    self.assertEqual('1999', self.scraper.get_movie_year(self.movie_page_xml))

  def test_get_awards(self):
    self.assertEqual('Nominated for 1 Oscar.', self.scraper.get_awards(self.movie_page_xml))
Example #7
def __init__(self, anime_url: str, download_path: str):
    self.__scraper = Scraper(anime_url)
    self.__downloader = Downloader(download_path)
Example #8
def scrape_yard():
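    # settings flags select the asynchronous crawler and multiprocess scraper variants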
    crawler = AsynchronousCrawler() if settings.isASynchronous else Crawler()
    crawler.start()

    scraper = MultiprocessScraper() if settings.isMultiprocess else Scraper()
    scraper.start()
Example #9
from src.scraper import Scraper
from src.instructions import *
from src.regex_functions import *
import re
import string
from nltk.corpus import stopwords

# measurments_regex_short = re.compile(r'([0-9?\.?\-]+)\s?((?![(pcs)(pc)(pairs)(x)])[a-z\"\']{1,3}(?![a-z]))')
clean_html_regex = re.compile(r"(?<=<\/li>)\s+(<\/li>\s+<\/ul>)")
clean_istr = Instruction("//div[@id='ResultSetItems']", "cleaned_results", backup_xpaths=["//ul[@id='ListViewInner']"], return_html=True)

def clean_html(data):
    from time import time
    start = time()
    fixed = clean_html_regex.sub("</ul>", data)
    end = time()
    test = clean_istr(fixed)[1]
    print("Cleaning html took: " + str(end - start))
    return fixed


if __name__ == "__main__":
    s = Scraper(cores=1, debug=True)
    links = {"url":"https://www.ebay.com/sch/i.html?_from=R40&_nkw=&_sacat=180966&_ipg=25", "dictKey":"door_lock_images", "instructionSet": "getImages", "headers": {}}
    getImage = Instruction("./div[contains(@class,'lvpic')]/*/img","image",attrib={'img': 'src'},debug=True,backup_xpaths=["./div[@class='s-item__wrapper clearfix']/div[@class='s-item__image-section']/div[@class='s-item__image']/a/*/img[@class='s-item__image-img']"])
    getList = Instruction("//div[@class='s-item__wrapper clearfix']","listing",attrib={'link': 'href'},children=[getImage])
    getItems = Instruction("//li[contains(@class,'sresult')]","items",children=[getImage],backup_xpaths=["//ul[@class='srp-results srp-list clearfix']/li[contains(@class,'item')]"],debug=True)
    s.addInstructions([getItems], "getImages")
    # s.run(links+links_2+links_3)
    s.run([links])
Example #10
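                                # keep the larger count when the same quantity key appears in both results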
                                if q in data["quantity"]:
                                    data["quantity"][q] = max(
                                        data["quantity"][q],
                                        second_result[k][q])
                                else:
                                    data["quantity"][q] = second_result[k][q]
                title_shortened = title_shortened.replace(second, " ")
    data["product_ids"] = list(set(product_num.findall(title_shortened)))
    data["title_shortened"] = " ".join(title_shortened.split())

    return data


if __name__ == "__main__":
    # Example main using scraper
    s = Scraper(cores=6, debug=True, apply_functions=[clean_html])

    # links = create_multi_pages("https://www.ebay.com/sch/i.html?_from=R40&_nkw=melamine+sponge&_sacat=0&_ipg=200", 1, "melamine_foam") + create_multi_pages("https://www.ebay.com/sch/i.html?_from=R40&_nkw=ddr4+16gb&_sacat=0&LH_TitleDesc=0&LH_TitleDesc=0&_ipg=200", 1, "ddr4_16gb") + create_multi_pages("https://www.ebay.com/sch/i.html?_from=R40&_nkw=yeezy&_sacat=0&LH_TitleDesc=0&LH_TitleDesc=0&_dmd=1&rt=nc&_ipg=200", 1, "yeezy")
    links = [{
        "url":
        "https://www.ebay.com/sch/i.html?_from=R40&_nkw=melamine+foam&_sacat=0&LH_TitleDesc=0&_blrs=spell_check&LH_TitleDesc=0&_ipg=200",
        "dictKey": "melamine_foam",
        "instructionSet": "getListings",
        "headers": {}
    }, {
        "url":
        "https://www.ebay.com/sch/i.html?_from=R40&_nkw=ddr4+16gb&_sacat=0&LH_TitleDesc=0&LH_TitleDesc=0&_ipg=200",
        "dictKey": "ddr4_16gb",
        "instructionSet": "getListings",
        "headers": {}
    }, {
Example #11
def ex_apply_function(data):
    # Example apply function
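    # normalise the scraped price: strip whitespace and keep only the first value of a "X to Y" range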
    price = data["price"]
    price = price.replace('\t', "").replace("\n", "")
    price = price.split(" to ")[0].replace("$", "")
    price = price.split("Trending")[0].replace("$", "")
    try:
        data["price"] = float(price)
    except ValueError:
        data["price"] = float(price.replace(",", ""))
    return data


if __name__ == "__main__":
    # Example main using scraper
    s = Scraper(cores=4, debug=True)
    import ast

    links = [
             {
                 'url'    : 'https://www.ebay.com/sch/i.html?_odkw=blank&_osacat=0&_ipg=200&_from=R40&_trksid=m570.l1313&_nkw=blank&_sacat=0',
                 'headers': {}, 'dictKey': 'blank_test', "instructionSet":"getListings"
             },
             {
                 'url'    : 'https://www.ebay.com/sch/i.html?_odkw=starve&_osacat=0&_ipg=200&_from=R40&_trksid=m570.l1313&_nkw=starve&_sacat=0',
                 'headers': {}, 'dictKey': 'starve_test', "instructionSet":"getListings"
             }
             ]

    # Using the function to generate the instruction dictionary
    listing_title = generate_instruction_dict("./h3[@class='lvtitle']/a", "title", text=True, backup_xpaths=[".//h3[@class='s-item__title']"])
Example #12
from src.scraper import Scraper
from src.data import Transformer
import os
from src.data import DatabaseWriter
from src.predictor import Predictor

wd = os.getcwd() + "/"
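# scrape the raw data and run it through the Transformer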
escreiper = Scraper(wd)
scraped_data_file_path = escreiper.scrape()
transformer = Transformer(scraped_data_file_path)
transformer.write()

dw = DatabaseWriter(wd + "../urgency_predictor_data/data.sqlite", "real",
                    wd + "real_data.csv")
dw.write()

wd = os.getcwd() + "/"
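# fit the urgency predictor on the stored data and write its predictions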
p = Predictor(wd + "../urgency_predictor_data/data.sqlite")
p.fit()
p.predict(60)
p.write()
Example #13
import os
import sys

if r'\examples' in os.getcwd():
    sys.path.insert(0, os.path.normpath(os.getcwd() + os.sep + os.pardir))
    os.chdir(os.path.normpath(os.getcwd() + os.sep + os.pardir))

from src.scraper import Scraper
from src.instructions import generate_instruction_dict


if __name__ == "__main__":
    # Example main using scraper
    s = Scraper(cores=4,debug=True)
    import ast
    links = [{'url': 'https://www.ebay.com/v/allcategories',
              'headers': {}, 'dictKey': 'categories', "instructionSet":"getCategories"}]

    # Using the function to generate the instruction dictionary
    sub_section_l2 = generate_instruction_dict(".//li[@class='sub-category']/a[@class='categories-with-links']", "l2_subsection",
                                               text=True, etree_text=True, attrib={"link": "href"})
    sub_section_l1 = generate_instruction_dict(".//div[@class='l1-name-wrapper']/a[@class='l1-name categories-with-links']/h3", "l1_subsection", text=True)

    sections = generate_instruction_dict("//div[@class='category-section']", "section",
                                         children=[sub_section_l1, sub_section_l2], attrib={'section': 'data-id'})

    instructions = [sections]

    # Running the scraper
    s.addInstructions(instructions,"getCategories")
    s.run(links)
Example #14
import os
import sys

if r'\examples' in os.getcwd():
    sys.path.insert(0, os.path.normpath(os.getcwd() + os.sep + os.pardir))
    os.chdir(os.path.normpath(os.getcwd() + os.sep + os.pardir))

from src.scraper import Scraper
from src.instructions import *
from src.regex_functions import *
import string
from nltk.corpus import stopwords

if __name__ == "__main__":
    # Example main using scraper
    s = Scraper(cores=1, debug=True)

    links = [{
        "url":
        "https://www.baseball-reference.com/boxes/ARI/ARI201704020.shtml",
        "instructionSet": "getPlayerStats",
        "headers": {},
        "dictKey": "playerStats"
    }]

    getTable = Instruction("//div[@class='media-item logo']",
                           "playerTable",
                           return_html=True,
                           debug=True)

    # Running the scraper
Example #15
    num_results = data["listings"]["num_results"]
    total_pages = total // num_results
    rtr = []
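    # queue a follow-up link for each remaining result page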
    for i in range(2, total_pages + 1):
        rtr.append({
            "dictKey": key,
            "url": url + "&_pgn={}".format(i),
            "headers": {},
            "instructionSet": "getListings"
        })
    return [key, data, instruction_set], rtr


if __name__ == "__main__":
    # Example main using scraper
    s = Scraper(cores=2, debug=True, continuous_adding=True)
    links = [{
        'url':
        'https://www.ebay.com/sch/i.html?_odkw=superlunary&_osacat=0&_ipg=200&_from=R40&_trksid=m570.l1313&_nkw=superlunary&_sacat=0',
        'headers': {},
        'dictKey': 'superlunary_test',
        "instructionSet": "getListings"
    }]
    rules = {"getListings": {"apply": ex_function, "keyApply": "_depth"}}
    stop_cond = {"dictKey": "superlunary_test"}
    s.continuous_params(rules, stop_cond)
    listing_title = generate_instruction_dict(".//h3[@class='lvtitle']/a",
                                              "title",
                                              text=True)
    listings = generate_instruction_dict(
        "//li[@class='sresult lvresult clearfix li shic']",
Example #16
import time
import json
import os
from src.scraper import Scraper
from src.coindesk import CoinDeskMainPage
from redis import Redis
from kafka import KafkaProducer
from kafka.errors import NoBrokersAvailable

if __name__ == '__main__':
    redis = Redis(os.environ.get('REDIS_HOST', 'localhost'), db=0)
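    # retry until a Kafka broker is reachable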
    while True:
        try:
            producer = KafkaProducer(
                bootstrap_servers=os.environ.get('KAFKA_HOST', '0.0.0.0') + ':9092',
                value_serializer=lambda v: v.encode('utf-8'))
            break
        except NoBrokersAvailable:
            print("Could not connect to Kafka, trying again in 5 seconds")
            time.sleep(5)

    scrapers = [
        Scraper(CoinDeskMainPage(), redis=redis, kafka_producer=producer)
    ]

    while True:
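        # run every scraper, then sleep before the next polling pass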
        for scraper in scrapers:
            scraper.run()
        time.sleep(100)
Example #17
"""Script entry point."""
from src.scraper import Scraper
from config import URLS

Scraper().scrape(URLS)
Example #18
import os
from datetime import date

import discord
from discord.ext import commands
from dotenv import load_dotenv

from src.boy import Boy
from src.scraper import Scraper

load_dotenv()
token = os.getenv('DISCORD_TOKEN')
server_name = os.getenv('SERVER_NAME')

intents = discord.Intents.default()
intents.members = True

#client = discord.Client(intents=intents)

bot = commands.Bot(command_prefix='!', intents=intents)

scraper = Scraper()
botd = Boy()

#print(client.user)


@bot.event
async def on_ready():
    server = bot.guilds[0]
    if server.name == server_name:
        print(f'{bot.user} has connected to {server.name}, id: {server.id}')

    print([name.name for name in server.members])


"""
Example #19
from src.scraper import Scraper

scraper = Scraper()
scraper.getPCodeList(18)
data = scraper.getRVList()