Example #1
def obtain(url, j):
    res = req.get(url)
    tree = lxml.html.fromstring(res.text)
    id_date = css('.auto-style5:nth-child(1)')
    id_g_num = css('.auto-style5:nth-child(2)')
    id_s_num = css('.auto-style5:nth-child(3)')
    date_list = id_date(tree)
    g_num_list = id_g_num(tree)
    s_num_list = id_s_num(tree)
    if j == 1:
        g_num_temp = [g_num_list[i].text for i in range(1, len(g_num_list))]

        def split(txt):
            # the source text puts non-breaking spaces next to the commas; drop them, then split
            u0 = re.split(',', txt.replace('\xa0', ''))
            u1 = np.array(u0).astype(int)
            return u1

        g_num = np.array(
            [split(g_num_temp[i]) for i in range(len(g_num_temp))])
        dataset = g_num
    elif j == 2:
        dataset = np.matrix([
            s_num_list[i].text for i in range(1, len(s_num_list))
        ]).astype(int).transpose()
    else:
        dataset = np.matrix(
            [date_list[i].text for i in range(1, len(date_list))]).transpose()
    return dataset
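Example #1 compiles each CSSSelector once and then calls the compiled selector on the parsed tree to get a list of matching elements. Below is a minimal, self-contained sketch of that compile-once, apply-to-tree pattern; the inline HTML and its column layout are invented for illustration, so only the mechanics carry over.

import lxml.html
from lxml.cssselect import CSSSelector as css

sample = """
<table>
  <tr><td class="auto-style5">2021-01-02</td><td class="auto-style5">1,2,3,4,5,6</td></tr>
  <tr><td class="auto-style5">2021-01-09</td><td class="auto-style5">7,8,9,10,11,12</td></tr>
</table>
"""

tree = lxml.html.fromstring(sample)

# compile the selector once...
first_column = css('.auto-style5:nth-child(1)')

# ...then call it on the tree to get a list of matching elements
for cell in first_column(tree):
    print(cell.text)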
Example #2
 def test_to_xml_multi_form(self):
     data = dict(self.make_mgmt_data(2).items() + self.make_sub_form_data(0).items() + self.make_sub_form_data(1).items())
     form = Form(self.uih, data, prefix=self.prefix)
     self.assertTrue(form.is_valid())
     root = create_root('root')
     form.to_xml(root)
     self.assertTrue(css('skymaker[id="6"]')(root))
     self.assertTrue(css('skymaker module-version')(root))
     self.assertTrue(css('skymaker module-version')(root))
     self.assertTrue(css('skymaker parents item')(root))
     self.assertEqual(len(css('skymaker images item')(root)), 2)
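This test builds an XML tree and then verifies its shape with CSS selectors instead of XPath. The Form and create_root helpers are not reproduced here; the reduced sketch below only shows that checking style against a hand-built lxml tree, with invented element content standing in for the real skymaker output.

from lxml import etree
from lxml.cssselect import CSSSelector as css

root = etree.Element("root")
module = etree.SubElement(root, "skymaker", id="6")
etree.SubElement(module, "module-version").text = "1.0"
images = etree.SubElement(module, "images")
etree.SubElement(images, "item").text = "a.fits"
etree.SubElement(images, "item").text = "b.fits"

# CSSSelector works on plain lxml elements, so the same assertions apply
assert css('skymaker[id="6"]')(root)
assert css('skymaker module-version')(root)
assert len(css('skymaker images item')(root)) == 2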
Example #3
def get_els(page_html, filters, selector='.hdrlnk'):
  ''' yields (to_crawl, skip) '''
  for el in css(selector)(html.fromstring(page_html)):
    if any((i.upper() in el.text.upper() for i in filters)):
      yield el, None
    else:
      yield None, el
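get_els pairs every matching element with either the "to crawl" slot or the "skip" slot, so the caller can branch without re-testing the filters. A hedged usage sketch follows, assuming get_els and the imports it relies on (lxml.html as html, CSSSelector as css) are in scope as above; the listing HTML and filter term are made up.

listing = (
    '<p><a class="hdrlnk">Mountain bike, barely used</a></p>'
    '<p><a class="hdrlnk">Old sofa</a></p>'
)

for to_crawl, skip in get_els(listing, filters=['bike']):
    if to_crawl is not None:
        print('matched a filter:', to_crawl.text)
    else:
        print('no filter matched:', skip.text)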
Example #4
def selector(s):
    if s.startswith('css:'):
        return css(s[4:]).path
    elif s.startswith('xpath:'):
        return s[6:]
    else:
        return s
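Example #4 leans on the fact that a compiled CSSSelector exposes the XPath it translates to via its .path attribute, which is how 'css:'-prefixed strings and raw XPath expressions end up in the same representation. A quick illustration with an arbitrary selector string (the exact translated XPath varies by cssselect version):

from lxml.cssselect import CSSSelector as css

# .path holds the XPath expression the CSS selector compiles to; this is what
# selector('css:div.content a') above would return.
print(css('div.content a').path)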
Example #5
def parse_summary_page(html):
    doc = etree.HTML(html)
    li_elms = css("li.j_thread_list")(doc)

    base_url = "http://tieba.baidu.com%s"
    link_items = []
    for li in li_elms:
        link = css("a.j_th_tit")(li)[0]
        href = base_url % link.get('href')
        reply_str = css("span.j_reply_data")(li)[0].text
        link_items.append(PageItem(href, reply_str))

    next_page_links = css("a.next")(doc)
    if next_page_links:
        next_link = base_url % next_page_links[0].get("href")
    else:
        next_link = None

    return (next_link, link_items)
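In parse_summary_page, css("a.j_th_tit")(li) is called on each <li> rather than on the whole document; because the compiled selector searches from the element it is handed, the match is confined to that list item's subtree. A small sketch of that scoping with invented markup:

from lxml import etree
from lxml.cssselect import CSSSelector as css

doc = etree.HTML(
    '<ul>'
    '<li class="thread"><a class="title" href="/p/1">first</a><span class="replies">3</span></li>'
    '<li class="thread"><a class="title" href="/p/2">second</a><span class="replies">7</span></li>'
    '</ul>'
)

for li in css('li.thread')(doc):
    link = css('a.title')(li)[0]              # searches only inside this <li>
    reply_str = css('span.replies')(li)[0].text
    print(link.get('href'), reply_str)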
Example #6
def scrape_organization_info(url):
    doc = get_document(url)
    org = {}
    try:
        org['name'] = css("#voicearea h1 span")(doc)[0].text
        for link in css("#voicearea div.contactinfo a[href]")(doc):
            ref = link.get('href')
            if ref.startswith('mailto:'):
                org['mailto'] = ref.replace('mailto:', '').strip()
            elif ref.startswith('http'):
                if not "eniro.se/" in ref and not "hitta.se/" in ref:
                    org['url'] = ref
            else:
                pass #print " # unexpected href: %r" % ref
        return org
    except:
        print "ERROR in content from <%s>:" % url
        print etree.tostring(doc)
        raise
Example #7
    def parse_detail_page(self, html):
        core_selector = css("div.core")
        doc = etree.HTML(html)

        core_div = core_selector(doc)
        if core_div:
            content_list = core_div[0]
        else:
            return

        post_contents = css("div.l_post")(content_list)
        main_author = None
        contents = []
        created_at = None
        for div in post_contents:
            dumped_data = div.get("data-field")
            try:
                data = json.loads(dumped_data)
            except:
                print "Parse json error"
                continue

            _content = data["content"]
            _author = data["author"]
            user_id = _author.get("id") or _author.get("name")
            main_author = user_id if main_author is None else main_author
            created_at = _content["date"]

            if main_author != user_id:
                break

            post_content = css("cc div.d_post_content")(div)[0]
            current_content = tostring(post_content,
                                       encoding="UTF-8",
                                       method="text")
            contents.append(current_content)

        self.title = css(".core_title_txt")(content_list)[0].text.encode(
            "UTF-8")
        self.content = "\n\n".join(contents)
        self.created_at = created_at
        self.save_to_sqlite()
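parse_detail_page pulls readable text out of each post body with tostring(..., method="text"), which flattens an element's markup to its text content; with encoding="UTF-8" it returns bytes. A tiny sketch of just that step, with invented markup:

from lxml import etree
from lxml.etree import tostring
from lxml.cssselect import CSSSelector as css

post = etree.HTML('<div class="d_post_content">line one<br/>line <b>two</b></div>')
content = css('div.d_post_content')(post)[0]
text_bytes = tostring(content, encoding="UTF-8", method="text")
print(text_bytes.decode("utf-8"))  # -> line oneline two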
Example #8
def parse_summary_page(html):
    doc = etree.HTML(html)
    li_elms = css("li.j_thread_list")(doc)


    base_url = "http://tieba.baidu.com%s" 
    link_items = []
    for li in li_elms:
        link = css("a.j_th_tit")(li)[0]
        href = base_url % link.get('href')
        reply_str = css("span.j_reply_data")(li)[0].text
        link_items.append(PageItem(href, reply_str))

    next_page_links = css("a.next")(doc)
    if next_page_links:
        next_link = base_url % next_page_links[0].get("href")
    else:
        next_link = None

    return (next_link, link_items)
Example #9
    def parse_detail_page(self, html):
        core_selector = css("div.core")
        doc = etree.HTML(html)

        core_div = core_selector(doc)
        if core_div:
            content_list = core_div[0]
        else:
            return

        post_contents = css("div.l_post")(content_list)
        main_author = None
        contents = []
        created_at = None
        for div in post_contents:
            dumped_data = div.get("data-field")
            try:
                data = json.loads(dumped_data)
            except:
                print "Parse json error"
                continue

            _content = data["content"]
            _author = data["author"]
            user_id = _author.get("id") or _author.get("name")
            main_author = user_id if main_author is None else main_author
            created_at = _content["date"]

            if main_author != user_id:
                break

            post_content = css("cc div.d_post_content")(div)[0]
            current_content = tostring(post_content, encoding="UTF-8", method="text")
            contents.append(current_content)

        self.title = css(".core_title_txt")(content_list)[0].text.encode("UTF-8")
        self.content = "\n\n".join(contents)
        self.created_at = created_at
        self.save_to_sqlite()
Example #10
def parse():
    url = 'http://eu4.paradoxwikis.com/Countries'
    r = requests.get(url)
    parser = etree.HTML(r.content)

    #url = 'Countries - Europa Universalis 4 Wiki.html'
    #with open(url, 'r') as f:
    #    parser = etree.HTML(f.read())

    # CSS selectors
    sel = css('table.wikitable tr')

    flags = []

    for row in sel(parser):
        children = row.getchildren()

        tdflag = children[1]
        tdtag = children[2]

        flag = tdflag.cssselect('img')

        if len(flag) == 0:
            continue
        
        # Original sauce
        src = flag[0].get('src')
        src = src.split('/')
        del src[-1]
        del src[2]

        src = "/".join(src)

        tag = str(tostring(tdtag)[5:8])[2:-1]

        flags.append(str(tag) + "-" + src)

    with open(CACHE, 'w') as f:
        f.write("\n".join(flags))
        print('Wrote {0} countries to file'.format(len(flags)))
Example #11
 def handle(self, *args, **options):
     if len(args) != 1:
         raise CommandError("Please specify a directory.")
     directory = args[0]
     
     if not os.path.exists(directory):
         os.mkdir(directory)
     
     # start scraper
     scraper = scrapelib.Scraper(requests_per_minute=60, allow_cookies=True, follow_robots=True)
     
     # open contractor CSV
     contractor_file = open(os.path.join(directory, 'contractors.csv'), 'wb')
     contractor_csv = csv.DictWriter(contractor_file, CONTRACTOR_FIELDS, restval='', extrasaction='ignore')
     contractor_csv.writer.writerow(CONTRACTOR_FIELDS)
     
     # first grab overall search page
     print 'Scraping main listing...'
     overall_text = scraper.urlopen("http://www.contractormisconduct.org/index.cfm/1,73,224,html?pnContractorID=0&pstDispositionTypeID=0&prtCourtTypeID=0&mcType=0&eaType=0&ContractType=0&dollarAmt=-1%2F-1&dateFrom=01%2F01%2F1985&dateTo=01%2F01%2F2025&submit=sort")
     overall_doc = document_fromstring(overall_text)
     
     # enumerate the organizations
     for org_option in css('select[name=pnContractorID] option')(overall_doc):
         if org_option.attrib['value'] != '0':
             contractor_csv.writerow({
                 'Contractor': org_option.text,
                 'URL': 'http://www.contractormisconduct.org/index.cfm/1,73,221,html?ContractorID=%s' % org_option.attrib['value']
             })
     
     contractor_file.close()
     
     # open instance CSV
     instance_file = open(os.path.join(directory, 'instances.csv'), 'wb')
     instance_csv = csv.DictWriter(instance_file, INSTANCE_FIELDS, restval='', extrasaction='ignore')
     instance_csv.writer.writerow(INSTANCE_FIELDS)
     
     # iterate over links from main page and grab their data
     links = css('td.caseRow a')(overall_doc)
     for i in range(len(links)):
         link = links[i]
         
         url = urlparse.urljoin("http://www.contractormisconduct.org/index.cfm/1,73,224,html", link.attrib['href'])
         
         print 'Scraping %s (%s of %s)' % (url, i + 1, len(links))
         
         instance_text = scraper.urlopen(url)
         instance_doc = document_fromstring(UnicodeDammit(instance_text, isHTML=True).unicode)
         
         row = {
             'Contractor': css('#primecontent > h2')(instance_doc)[0].text,
             'Instance': sanitize(css('#incident > h2')(instance_doc)[0].text),
             'URL': url
         }
         
         for field in css('#incident > p > strong')(instance_doc):
             field_name = field.text.replace(':', '')
             field_contents = sanitize(field.tail.strip())
             
             if field_name == 'Date':
                 date_parts = field_contents.split(None, 1)
                 row['Date'] = date_parts[0]
                 row['Year'] = row['Date'].split('/')[-1]
                 row['Significance of Date'] = date_parts[1][1:-1] if len(date_parts) > 1 else ''
             elif field_name == 'Amount':
                 row['Misconduct Penalty Amount'] = field_contents.replace('$', '').replace(',', '') if DOLLARS.match(field_contents) else ''
             else:
                 row[field_name] = field_contents
         
         instance_csv.writerow(row)
     
     instance_file.close()
Example #12
    def get_image_4chan(self, path, staging, rate_limit=0):
        """
		Create image from path

		If the path is local, simply read the local path and return an Image
		representing it. If not, attempt to download the image from elsewhere,
		and cache the downloaded result if possible, else discard the file
		afterwards.

		:param path:  Path to image, either a local path or a URL
		:param rate_limit:  Seconds to wait after downloading, if downloading

		:return Image:  Image object, or nothing if loading it failed
		"""
        # do we have the file locally?
        filename = Path(config.PATH_IMAGES, path.split("/")[-1])
        if filename.exists():
            return Image.open(str(filename))

        while self.previous_download > time.time() - rate_limit:
            time.sleep(0.1)

        self.previous_download = time.time()

        rate_regex = re.compile(
            r"Search limit exceeded. Please wait ([0-9]+) seconds before attempting again."
        )

        # get link to image from external HTML search results
        # detect rate limiting and wait until we're good to go again
        page = requests.get(path)
        rate_limited = rate_regex.search(page.content.decode("utf-8"))

        while rate_limited:
            self.log.debug(
                "Rate-limited by external source. Waiting %s seconds." %
                rate_limited[1])
            time.sleep(int(rate_limited[1]))
            page = requests.get(path)
            rate_limited = rate_regex.search(page.content.decode("utf-8"))

        # get link to image file from HTML returned
        parser = etree.HTMLParser()
        tree = etree.parse(StringIO(page.content.decode("utf-8")), parser)
        image_url = css("a.thread_image_link")(tree)[0].get("href")

        # download image itself
        image = requests.get(image_url, stream=True)
        if image.status_code != 200:
            raise FileNotFoundError

        # cache the image for later, if needed
        if config.PATH_IMAGES and Path(config.PATH_ROOT,
                                       config.PATH_IMAGES).exists():
            md5 = hashlib.md5()

            based_hash = path.split("/")[-1].split(".")[0].replace("_", "/")
            extension = image_url.split(".")[-1].lower()
            md5.update(base64.b64decode(based_hash))

            local_path = Path(config.PATH_IMAGES,
                              md5.hexdigest() + "." + extension)
            delete_after = False
        else:
            local_path = staging.joinpath("temp-image")
            delete_after = True

        # save file, somewhere
        with local_path.open('wb') as file:
            for chunk in image.iter_content(1024):
                file.write(chunk)

        # avoid getting rate-limited by image source
        time.sleep(rate_limit)
        picture = Image.open(local_path)

        # if no image folder is configured, delete the temporary file
        if delete_after:
            local_path.unlink()

        return picture
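The method above throttles itself in two ways: it waits out its own rate_limit between downloads, and it re-requests the search page while the response matches rate_regex, sleeping for the number of seconds the page advertises. A stripped-down sketch of that wait-and-retry loop, with canned responses standing in for requests.get() since the real archive call isn't reproducible here:

import re
import time

rate_regex = re.compile(r"Search limit exceeded. Please wait ([0-9]+) seconds before attempting again.")

# canned responses: the first one simulates being rate-limited for 2 seconds
responses = iter([
    "Search limit exceeded. Please wait 2 seconds before attempting again.",
    "<html><a class='thread_image_link' href='/img/1.png'>ok</a></html>",
])

page = next(responses)
rate_limited = rate_regex.search(page)
while rate_limited:
    time.sleep(int(rate_limited[1]))  # group 1 is the advertised wait in seconds
    page = next(responses)
    rate_limited = rate_regex.search(page)

print("proceeding with:", page[:24])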
Example #13
    def handle(self, *args, **options):
        if len(args) != 1:
            raise CommandError("Please specify a directory.")
        directory = args[0]

        if not os.path.exists(directory):
            os.mkdir(directory)

        # start scraper
        scraper = scrapelib.Scraper(requests_per_minute=60,
                                    allow_cookies=True,
                                    follow_robots=True)

        # open contractor CSV
        contractor_file = open(os.path.join(directory, 'contractors.csv'),
                               'wb')
        contractor_csv = csv.DictWriter(contractor_file,
                                        CONTRACTOR_FIELDS,
                                        restval='',
                                        extrasaction='ignore')
        contractor_csv.writer.writerow(CONTRACTOR_FIELDS)

        # first grab overall search page
        print 'Scraping main listing...'
        overall_text = scraper.urlopen(
            "http://www.contractormisconduct.org/index.cfm/1,73,224,html?pnContractorID=0&pstDispositionTypeID=0&prtCourtTypeID=0&mcType=0&eaType=0&ContractType=0&dollarAmt=-1%2F-1&dateFrom=01%2F01%2F1985&dateTo=01%2F01%2F2025&submit=sort"
        )
        overall_doc = document_fromstring(overall_text)

        # enumerate the organizations
        for org_option in css('select[name=pnContractorID] option')(
                overall_doc):
            if org_option.attrib['value'] != '0':
                contractor_csv.writerow({
                    'Contractor':
                    org_option.text,
                    'URL':
                    'http://www.contractormisconduct.org/index.cfm/1,73,221,html?ContractorID=%s'
                    % org_option.attrib['value']
                })

        contractor_file.close()

        # open instance CSV
        instance_file = open(os.path.join(directory, 'instances.csv'), 'wb')
        instance_csv = csv.DictWriter(instance_file,
                                      INSTANCE_FIELDS,
                                      restval='',
                                      extrasaction='ignore')
        instance_csv.writer.writerow(INSTANCE_FIELDS)

        # iterate over links from main page and grab their data
        links = css('td.caseRow a')(overall_doc)
        for i in range(len(links)):
            link = links[i]

            url = urlparse.urljoin(
                "http://www.contractormisconduct.org/index.cfm/1,73,224,html",
                link.attrib['href'])

            print 'Scraping %s (%s of %s)' % (url, i + 1, len(links))

            instance_text = scraper.urlopen(url)
            instance_doc = document_fromstring(
                UnicodeDammit(instance_text, isHTML=True).unicode)

            row = {
                'Contractor': css('#primecontent > h2')(instance_doc)[0].text,
                'Instance':
                sanitize(css('#incident > h2')(instance_doc)[0].text),
                'URL': url
            }

            for field in css('#incident > p > strong')(instance_doc):
                field_name = field.text.replace(':', '')
                field_contents = sanitize(field.tail.strip())

                if field_name == 'Date':
                    date_parts = field_contents.split(None, 1)
                    row['Date'] = date_parts[0]
                    row['Year'] = row['Date'].split('/')[-1]
                    row['Significance of Date'] = date_parts[1][1:-1] if len(
                        date_parts) > 1 else ''
                elif field_name == 'Amount':
                    row['Misconduct Penalty Amount'] = field_contents.replace(
                        '$', '').replace(
                            ',', '') if DOLLARS.match(field_contents) else ''
                else:
                    row[field_name] = field_contents

            instance_csv.writerow(row)

        instance_file.close()
Example #14
def get_els(page_html, selector='.athing'):
  for tr in css(selector)(html.fromstring(page_html)):
    a = css('a')(tr.getchildren()[2])[0]
    yield tr, a
Example #15
    def process(self):
        """
		This takes a 4CAT results file as input, and outputs a new CSV file
		with all posts containing the original query exactly, ignoring any
		* or " in the query
		"""
        months = {}

        # we use these to extract URLs and host names if needed
        link_regex = re.compile(r"https?://en\.wikipedia\.org/wiki/[^\s.]+")
        wiki_page = re.compile(r"\[\[[^\]]+\]\]")  # matches [[Page name]] style wiki links
        category_regex = re.compile(r"\[\[Category:[^\]]+\]\]")
        trailing_comma = re.compile(r",$")

        # initialise
        links = {}
        all_categories = {}
        counter = 1
        errors = 0
        page_categories = {}
        page_links = {}
        deep_pages = {}

        # find all links in post bodies
        self.dataset.update_status("Reading source file")
        for post in self.iterate_csv_items(self.source_file):
            wiki_links = link_regex.findall(post["body"])
            wiki_links = [trailing_comma.sub("", link) for link in wiki_links]

            # if we have a per-post URL, include that as well
            if "url" in post and post["url"] and link_regex.match(post["url"]):
                wiki_links.append(post["url"])

            for link in wiki_links:
                link = "/wiki/".join(link.split("/wiki/")[1:]).split("#")[0]
                if link not in links:
                    links[link] = 0

                links[link] += 1

        # just a helper function to get the HTML content of a node
        def stringify_children(node):
            from lxml.etree import tostring
            from itertools import chain
            parts = ([node.text] + list(
                chain(*([c.text, tostring(c), c.tail]
                        for c in node.getchildren()))) + [node.tail])
            # filter removes possible Nones in texts and tails
            return ''.join(filter(None, parts))

        self.dataset.update_status("Fetching categories from Wikipedia API...")
        for link in links:
            if link not in page_categories:
                if self.interrupted:
                    raise ProcessorInterruptedException(
                        "Interrupted while fetching data from Wikipedia")

                page_categories[link] = set()
                self.dataset.update_status(
                    "Fetching categories from Wikipedia API, page %i of %i" %
                    (counter, len(links)))
                counter += 1

                # fetch wikipedia source
                url = "https://en.wikipedia.org/w/index.php?title=" + link + "&action=edit"
                try:
                    page = requests.get(url)
                except requests.RequestException:
                    errors += 1
                    continue

                if page.status_code != 200:
                    errors += 1
                    continue

                # get link to image file from HTML returned
                parser = etree.HTMLParser()
                tree = etree.parse(StringIO(page.content.decode("utf-8")),
                                   parser)

                try:
                    wiki_source = stringify_children(
                        css("#wpTextbox1")(tree)[0])
                except IndexError:
                    # not a source page?
                    errors += 1
                    continue

                # extract category names from category link syntax
                categories = category_regex.findall(wiki_source)
                categories = set([
                    ":".join(category.split(":")[1:])[:-2].split("|")[0]
                    for category in categories
                ])

                # save category links
                for category in categories:

                    # Add " (cat)" to the category strings.
                    # This is needed because pages can sometimes have the same name as the category.
                    # This will result in a faulty graph, since there's duplicate nodes.

                    category += " (cat)"

                    if category not in all_categories:
                        all_categories[category] = 0

                    all_categories[category] += 1
                    page_categories[link].add(category)

                # if needed, also include pages linked to from within the
                # fetched page source
                if self.parameters.get("deep_pages", None):
                    linked_pages = wiki_page.findall(wiki_source)
                    for page in linked_pages:
                        page = page.split("|")[0]

                        if page not in deep_pages:
                            deep_pages[page] = 0

                        deep_pages[page] += 1

                        if link not in page_links:
                            page_links[link] = set()

                        page_links[link].add(page)

        # write GDF file
        with self.dataset.get_results_path().open("w",
                                                  encoding="utf-8") as results:
            results.write("nodedef>name VARCHAR,type VARCHAR,weight INTEGER\n")
            for page in page_categories:
                results.write("'" + page.replace("_", " ").replace(",", "") +
                              "',page," + str(links[page]).replace(",", "") +
                              "\n")

            for category in all_categories:
                results.write("'" +
                              category.replace("_", " ").replace(",", "") +
                              "',category," +
                              str(all_categories[category]).replace(",", "") +
                              "\n")

            results.write(
                "edgedef>node1 VARCHAR, node2 VARCHAR, weight INTEGER\n")
            for page in page_categories:
                for category in page_categories[page]:
                    results.write("'" +
                                  page.replace("_", " ").replace(",", "") +
                                  "','" +
                                  category.replace("_", " ").replace(",", "") +
                                  "'\n")

        self.dataset.finish(len(page_categories))
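The results file written above is a GDF graph: a nodedef> header followed by one line per page or category node, then an edgedef> header followed by one page-to-category line per edge, with commas stripped from names because GDF is comma-delimited. A minimal sketch of the same hand-written format with invented names:

nodes = {"Some page": ("page", 3), "Some category (cat)": ("category", 1)}
edges = [("Some page", "Some category (cat)")]

with open("example.gdf", "w", encoding="utf-8") as results:
    results.write("nodedef>name VARCHAR,type VARCHAR,weight INTEGER\n")
    for name, (kind, weight) in nodes.items():
        results.write("'%s',%s,%d\n" % (name.replace(",", ""), kind, weight))
    results.write("edgedef>node1 VARCHAR, node2 VARCHAR, weight INTEGER\n")
    for source, target in edges:
        results.write("'%s','%s'\n" % (source.replace(",", ""), target.replace(",", "")))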
Example #16
 def test_form_in_response(self):
     code, html = self.get_html()
     self.assertEqual(code, 200)
     self.assertTrue(css('.id_mock_image-apply_mock_image')(html))
     self.assertTrue(css('#mock_image_params')(html))
Example #17
def process_html(text):
    tree = html.fromstring(text)
    for img in css("img")(tree):
        src = img.get("src")
        print "src: %s" % src
Example #18
 def test_one_sub_form_in_response(self):
     code, html = self.get_html()
     self.assertEqual(code, 200)
     self.assertTrue(css('#single_form_template')(html))
Example #19
             'checkout': '05%2F05%2F2017',
             'neighborhoods[]': '%s' % (hood),
         }
     },
     'active': 1
 } for hood in neighborhoods],
 'to_parser': {
     'raw_html': False,
     'object_types': {
         'next_page': {
             'objects': {
                 'next_page': {
                     'parse_func': sel_attr,
                     'kwargs': {
                         'attr': 'href',
                         'selector': css('.next_page a'),
                     },
                     'follow': 1,
                 },
             },
         },
         'airbnb_listing': {
             'pre_selector': css('.listing'),
             'objects': {
                 'url': {
                     'parse_func': sel_attr,
                     'kwargs': {
                         'attr': 'data-url',
                     },
                 },
                 'reviews': {
Example #20
def comment_els(comment_html, text_class='.c00', match_on=r'.*amazon\.co.*/.*|.*amzn\.co.*/.*'):
  for comment in css(text_class)(html.fromstring(comment_html)):
    links = css('a[href]')(comment)  # only consider anchors that actually carry an href
    for link in links:
      if link.text and 'reply' not in link.text and re.match(match_on, link.get('href')) and 'aws.' not in link.get('href'):
        yield comment, link
Example #21
dumpert_class = [{
    'name':
    'Dumpert',
    'domain':
    'http://dumpert.nl',
    'num_get':
    1,
    'phases': [{
        'to_getter': [{
            'url': 'http://dumpert.nl/{}/'.format(i if i else ''),
            'active': 1
        } for i in range(2)],
        'to_parser': {
            'object_types': {
                'upload': {
                    'pre_selector': css('a.dumpthumb'),
                    'to_store': {
                        'func': store_json,
                        'kwargs': {
                            'filename': 'dumpert',
                        }
                    },
                    'attrs': {
                        'url': {
                            'func': sel_attr,
                            'kwargs': {
                                'attr': 'href',
                            }
                        },
                        'title': {
                            'func': sel_text,
Example #22
import mechanize  # needed for mechanize.Browser() below, but missing from the snippet's imports
from lxml import html
from lxml.cssselect import CSSSelector as css
from urlparse import urljoin
from topps import scraper
from math import floor
from topps import util

db = util.connect_db(scraper)

cursor = db.cursor()

br = mechanize.Browser()

index_tree = html.fromstring(br.open("http://www.nfl.com/players").read())

get_team_links = css("#byTeamRosterTable a")
get_divisions = css("#byTeamRosterTable .bold")
team_links = [(anchor.text, anchor.get('href')) for anchor in get_team_links(index_tree)]

divisions = [div.text.split(" ") for div in get_divisions(index_tree)]
conferences = set([div[0] for div in divisions])
conference_ids = []

# for conf in conferences: 
#     cursor.execute("""INSERT INTO conference (name) VALUES ("{0}");""".format(conf))
#     print cursor.fetchall()

# for division in divisions:
#     print division[0], division[1]
#     cursor.execute("""INSERT INTO division (name, conference_name) VALUES ("{1}", "{0}");""".format(division[0], " ".join(division)))
#     db.commit()
Example #23
	def get_image(self, path, rate_limit=0):
		"""
		Create image from path

		If the path is local, simply read the local path and return an Image
		representing it. If not, attempt to download the image from elsewhere,
		and cache the downloaded result if possible, else discard the file
		afterwards.

		:param path:  Path to image, either a local path or a URL
		:param rate_limit:  Seconds to wait after downloading, if downloading

		:return Image:  Image object, or nothing if loading it failed
		"""
		rate_regex = re.compile(r"Search limit exceeded. Please wait ([0-9]+) seconds before attempting again.")

		if isinstance(path, Path):
			# local file
			return Image.open(path)

		# get link to image from external HTML search results
		# detect rate limiting and wait until we're good to go again
		page = requests.get(path, headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.1 Safari/605.1.15"})
		rate_limited = rate_regex.search(page.content.decode("utf-8"))

		while rate_limited:
			self.log.debug("Rate-limited by external source. Waiting %s seconds." % rate_limited[1])
			time.sleep(int(rate_limited[1]))
			page = requests.get(path)
			rate_limited = rate_regex.search(page.content.decode("utf-8"))

		# get link to image file from HTML returned
		parser = etree.HTMLParser()
		tree = etree.parse(StringIO(page.content.decode("utf-8")), parser)
		image_url = css("a.thread_image_link")(tree)[0].get("href")

		# download image itself
		image = requests.get(image_url, stream=True)

		# if not available, the thumbnail may be
		if image.status_code != 200:
			thumbnail_url = ".".join(image_url.split(".")[:-1]) + "s." + image_url.split(".")[-1]
			image = requests.get(thumbnail_url, stream=True)

		if image.status_code != 200:
			raise FileNotFoundError

		# cache the image for later, if needed
		if config.PATH_IMAGES:
			md5 = hashlib.md5()

			based_hash = path.split("/")[-1].split(".")[0].replace("_", "/")
			extension = image_url.split(".")[-1].lower()
			md5.update(base64.b64decode(based_hash))

			local_path = Path(config.PATH_IMAGES, md5.hexdigest() + "." + extension)
			delete_after = False
		else:
			query_result = self.dataset.get_results_path()
			local_path = Path(query_result.parent, query_result.name + "-temp")
			delete_after = True

		# save file, somewhere
		with open(local_path, 'wb') as file:
			for chunk in image.iter_content(1024):
				file.write(chunk)

		# avoid getting rate-limited by image source
		time.sleep(rate_limit)
		picture = Image.open(local_path)

		# if no image folder is configured, delete the temporary file
		if delete_after:
			local_path.unlink()

		return picture
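When the full-size image 404s, get_image falls back to the thumbnail by inserting an "s" before the file extension. A one-line check of that string manipulation, with an illustrative URL:

image_url = "https://example.com/images/abcd1234.jpg"  # illustrative only
thumbnail_url = ".".join(image_url.split(".")[:-1]) + "s." + image_url.split(".")[-1]
print(thumbnail_url)  # https://example.com/images/abcd1234s.jpg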
Example #24
from lxml.cssselect import CSSSelector as css
import time

clss = {
    'mediamarkt': {
        'start': [
            {
                'url': '/scholenoverzicht/vo/',
                'active': 1,
            },
        ],
        'list_url': 'http://www.onderwijsconsument.nl',
        'object_url': 'http://www.onderwijsconsument.nl',
        # 'iter_class': css('li.pagination-next a'),
        'css': {
            'list_class': css('#lijst .school a'),
            'sections': {
                'school': {
                    'selector': css('#school'),
                    'css': {
                        'onderwijs': {
                            'func': sel_text,
                            'params': {
                                'selector': css('.lead'),
                            },
                        },
                        'name': {
                            'func': sel_text,
                            'params': {
                                'selector': css('h2')
                            }
Example #25
from parse_functions import *  # noqa
from lxml.cssselect import CSSSelector as css

clss = {
    'lyrics': {
        'skip_object': '',
        'start': [
            {
                'url': 'wiki/LyricWiki:Top_100',
                'active': True,
            },
        ],
        'list_url': 'http://lyrics.wikia.com/',
        'object_url': 'http://lyrics.wikia.com/',
        'css': {
            'list_class': css('li b a:not(a.new)'),
            'sections': {
                'lyrics': {
                    'artist': {
                        'func': parse_attr,
                        'params': {
                            'attr': 'content',
                            'selector': css('meta[property=title]')
                        }
                    },
                    'lyric': {
                        'func': parse_regex,
                        'params': {
                            'selector': css('.lyricbox'),
                            'regex': r";([a-zA-Z\d.,? '’\"!()-]*)\n",
                        }
Example #26
clss = [
        {
        'name': 'mediamarkt',
        'domain': 'http://www.mediamarkt.nl',
        'phases': [
            {
            'to_getter': [
                {'url': 'http://www.mediamarkt.nl/',
                'active': 1,
                },
                ],
            'to_parser': {
                'object_types': {
                    'link': {
                        'pre_selector': css('#top-navigation'),
                        'attrs': {
                            'menu_item': {
                                'func': sel_attr,
                                'kwargs': {
                                    'selector': css('li.item a'),
                                    'index': 0,
                                    'attr': 'href'},
                                'follow': {'forward': 1},
                                },
                            },
                        },
                    }
                },
            },
            {'to_parser': {