Example #1
    def test_calendar_tag_rendering(self, timezone_mock):
        timezone_mock.now.return_value = tz_datetime(2015, 1, 10, 12)
        page_with_apphook = self.create_base_pages()
        other_config = EventsConfig.objects.create(namespace='other')
        self.create_event(
            title='ev1',
            start_date=tz_datetime(2015, 1, 13),
            publish_at=tz_datetime(2015, 1, 10)
        )
        self.create_event(
            title='ev2',
            start_date=tz_datetime(2015, 1, 15),
            publish_at=tz_datetime(2015, 1, 10)
        )
        self.create_event(
            de=dict(
                title='ev3',
                start_date=tz_datetime(2015, 1, 16),
                publish_at=tz_datetime(2015, 1, 10)
            )
        )
        self.create_event(
            title='ev4',
            start_date=tz_datetime(2015, 1, 18),
            publish_at=tz_datetime(2015, 1, 10),
            app_config=other_config
        )
        self.create_event(
            title='ev5',
            start_date=tz_datetime(2015, 1, 22),
            end_date=tz_datetime(2015, 1, 27),
            publish_at=tz_datetime(2015, 1, 10)
        )
        self.create_event(
            title='ev6',
            start_date=tz_datetime(2015, 1, 25),
        )
        # use the test's default self.app_config namespace instead of
        # hard-coding it
        template_str = """
        {%% load aldryn_events %%}
        {%% calendar 2015 1 'en' '%s' %%}
        """ % self.app_config.namespace
        t = Template(template_str)
        with override('en'):
            html = t.render(SekizaiContext({}))
            table = PyQuery(html)('table.table-calendar')
            page_url_en = page_with_apphook.get_absolute_url()
        links = table.find('td.events, td.multiday-events').find('a')

        # test if tag rendered important elements
        self.assertEqual('1', table.attr('data-month-numeric'))
        self.assertEqual('2015', table.attr('data-year'))
        self.assertEqual('10', table.find('td.today').text())
        self.assertEqual(8, links.length)  # 13, 15, 22, 23, 24, 25, 26, 27
        expected_days = (13, 15, 22, 23, 24, 25, 26, 27)
        for position, day in enumerate(expected_days):
            event_url = '{0}2015/1/{1}/'.format(page_url_en, day)
            rendered_url = links[position].attrib['href']
            self.assertEqual(event_url, rendered_url)
Example #2
def find_external_links(url):
    '''Look for links to egg files in a web page and return them as a set of anchor tags.
    '''
    links = set()
    try:
        response = get(url)
        if response.status_code != 200:
            app.logger.warning('Error while getting proxy info for: %s. '
                               'Error details: %s', url,
                               response.text)
        else:
            if response.content:
                p = PyQuery(response.content)
                for anchor in p("a"):
                    panchor = PyQuery(anchor)
                    href = panchor.attr("href")
                    if url_is_egg_file(href):
                        # href points to a filename
                        href = get_absolute_url(href, url)
                        links.add('<a href="%s">%s</a>' % (href, panchor.text()))
    except Exception:
        # something happened when looking for external links: 
        #       timeout, HTML parser error, etc.
        # we must not fail and only log the error
        app.logger.exception('')
    return links
Example #3
    def scrape_homepage(self, **kwargs):
        """
        Scrape!
        """
        logger.info('Scraping homepage (start time: %s)' % self.run_time)

        if not kwargs:
            response = requests.get(self.url)

            page = PyQuery(response.content)
        else:
            page = PyQuery(**kwargs)

        article_elements = page('.stories-wrap article')
        slot = 0
        articles = []

        for el in article_elements:
            element = PyQuery(el)

            article = Article(element, self.run_time)

            if not article.story_id and not article.is_apps_project:
                continue

            if not element.hasClass('attachment'):
                slot += 1

            article.slot = slot
            articles.append(article)
            logger.info('Scraped %s from homepage (%s)' % (article.story_id, article.headline))

        return articles
Example #4
def test_render_attribute_to_document():
    document = PyQuery('<a attribute="{ value }" data-riot-id="0"></a>')
    expression =  {'expression': '{ value }', 'attribute': 'attribute', 'type': 'attribute', 'node': document}
    render_document([expression], {'value': 'value'})
    assert document.outer_html() == '<a attribute="value" data-riot-id="0" data-riot-dirty="true"></a>'
    render_document([expression], {'value': 1})
    assert document.outer_html() == '<a attribute="1" data-riot-id="0" data-riot-dirty="true"></a>'
Example #5
def html_to_records(html):
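    """Yield one dict per table row, mapping header-row cell text to the row's cell text."""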
    pq = PyQuery(html)
    rows = pq.find('table tr')
    get_row = lambda r: [cell.text for cell in r]  # a list, so the header row can be reused for every data row
    headers = get_row(rows[0])
    for row in rows[1:]:
        yield dict(zip(headers, get_row(row)))
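# Minimal usage sketch (hypothetical markup, not from the original source):
#   list(html_to_records('<table><tr><th>a</th><th>b</th></tr><tr><td>1</td><td>2</td></tr></table>'))
#   -> [{'a': '1', 'b': '2'}]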
Example #6
    def extract(self):
        self.html = re.sub("<!--.*?-->", "", self.html)
        doc = PyQuery(self.html)
        content_node = doc("div#blog_article_content")

        content = content_node.outerHtml()
        cpl = re.compile('<img.*?src=".*?"', re.I)
        content = re.sub("%", "%%", content)
        content_doc = PyQuery(content)
        content_doc("img").attr("src", "%s")

        item = ContentItem()
        item["title"] = self.title = doc("div.blog_main_left_content").find("h3").text()
        item["author"] = self.author = doc("div#common_person_blogtitle")("div#title01")("a").text()

        item["content"] = self.content = content_doc.outerHtml()

        self.release_time = doc("div.blog_main_time").find("p").text().strip()
        item["release_time"] = self.release_time

        item["source"] = u"凤凰网"
        item["pic_url"] = ""

        item["image_urls"] = [img.get("src") for img in content_node("img")]

        return item
Example #7
def get_saml_response(response):
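    """Extract the SAMLResponse form field from the HTML response and return its base64-decoded value."""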
    tree = PyQuery(response.content)
    inputtag = tree.find('input[name="SAMLResponse"]')
    assert len(inputtag) == 1
    encoded_response = inputtag[0].get('value')
    samlresponse = base64.b64decode(encoded_response)
    return samlresponse
Example #8
    def next(self):
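        """Build a Category from the next link in categories_iter, wiring up its Shows (plus a news feed for 'Nyheter')."""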
        if self.i == self.categories_iter.length:
            raise StopIteration

        link = self.categories_iter[self.i]

        py_link = PyQuery(link)
        href = py_link.attr('href')
        html_class = href.split('/')[-1:][0]
        title = py_link.text()
        thumbnail_url = self.crawler.baseurl + PyQuery(link).find('img').attr('src')
        url = self.crawler.category_url % href

        category = Category(title, url, html_class, thumbnail_url)
        shows = Shows(self.crawler, url)

        tmp = list()
        tmp.append(shows)

        if title == 'Nyheter':
            news_url = self.crawler.news_url % href
            news_shows = Shows(self.crawler, news_url)
            tmp.append(news_shows)

        category.shows = itertools.chain(*tmp)

        self.i += 1
        return category
Example #9
def get_urls():
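    """Fetch the peterbe.com blog index and return the absolute URLs of all links inside <dd> elements."""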
    doc = PyQuery('https://www.peterbe.com/plog/')
    doc.make_links_absolute(base_url='https://www.peterbe.com')
    urls = []
    for a in doc('dd a'):
        urls.append(a.attrib['href'])
    return urls
Example #10
def extract_sample_and_codes_using_css(doc, sample_css_class):
    sample_columns = {}
    code_column = None
    label_column = None
    table = doc('#dataTable > .variablesList')
    for i, th in enumerate(table('tr.fullHeader:first > th')):
        th = PyQuery(th)
        if code_column is None and th.hasClass('codesColumn'):
            code_column = i
        elif label_column is None and th.hasClass('labelColumn'):
            label_column = i
        elif th.hasClass(sample_css_class):
            sample_columns[i] = th.text()
            
    # Extract actual values for the variables.
    variable_info = []
    for row in table('tr.variables'):
        columns = PyQuery(row)('td')
        code = PyQuery(columns[code_column]).text().strip()
        label = PyQuery(columns[label_column]).text().strip()
        availability = [smpl.strip()
                        for i, smpl in sample_columns.items() 
                        if PyQuery(columns[i]).text().strip() != '.']
        variable_info.append({'code': code,
                              'label': label,
                              'availability': availability})

    return variable_info, len(sample_columns)
Example #11
def getResTb():
    html = fileworker.getHTML()
    pq = PyQuery(html)
    result = dict()
    blocks = list()
    for i in pq.items('.row.result'):
        blocks.append(i)
Example #12
def crawl_vvic_category_tree(wb):
    h = httplib2.Http()
    response, content = h.request("http://www.vvic.com/")
#     fw = open("C:users/chenweiqiang/desktop/vvic2.html", "w")
#     fw.write(content)
#     fw.close()
    ws = wb.add_sheet("vvic品类树")
    ws.write(0,0,"一级品类")
    ws.write(0,1,"二级品类")
    ws.write(0,2,"三级品类")
    row = 0
    doc = PyQuery(content)
    level1NodeList = doc("div.dd-inner > div.item")
    anotherLevel1NodeList = [doc('div.sub-items')[0], doc('div.sub-items')[1], doc('div.sub-items')[2], doc('div.sub-items')[5]]
    for index, level1Node in enumerate(level1NodeList):
        level1_category = PyQuery(level1Node)('h3 > a').text()
        level2NodeList = PyQuery(anotherLevel1NodeList[index]).children('dl')
        for level2Node in level2NodeList:
            level2NodeQ = PyQuery(level2Node)
            level2_category = level2NodeQ.children('dt > a').text()
            level3NodeList = level2NodeQ.children('dd > a')
            for level3Node in level3NodeList:
                level3_category = PyQuery(level3Node).text()
                row += 1
                ws.write(row, 0, level1_category)
                ws.write(row, 1, level2_category)
                ws.write(row, 2, level3_category)
Example #13
def crawl_1688_category_tree(wb):
    #fr = open("C:users/chenweiqiang/desktop/ye.html", "r")  # elements could not be extracted after parsing this file with PyQuery
    h = httplib2.Http()
    response, content = h.request("https://ye.1688.com/")
#     fw = open("C:users/chenweiqiang/desktop/ye2.html", "w")
#     fw.write(content)
#     fw.close()
    ws = wb.add_sheet("ye.1688品类树")
    ws.write(0,0,"一级品类")
    ws.write(0,1,"二级品类")
    ws.write(0,2,"三级品类")
    row = 0
    doc = PyQuery(content)
    level1NodeList = doc("li.cat-box")
    for level1Node in level1NodeList:
        level1NodeQ = PyQuery(level1Node)
        level1_category = level1NodeQ('div.cat-main').text().replace(' ', '')
        level2NodeList = level1NodeQ('div.cat-sub-col > dl')  # the extra div[class="cat-sub "] > level is redundant
        for level2Node in level2NodeList:
            level2NodeQ = PyQuery(level2Node)
            level2_category = level2NodeQ('dt > a').text()
            level3NodeList = level2NodeQ('dd.cat-list > ul > li > a')
            for level3Node in level3NodeList:
                level3NodeQ = PyQuery(level3Node)
                level3_category = level3NodeQ.text()
                row += 1
                ws.write(row, 0, level1_category)
                ws.write(row, 1, level2_category)
                ws.write(row, 2, level3_category)
Example #14
def station_parse(content):
    '''Parsing bus station and check station.
    '''
    OFF = '0-0'
    stations = []
    bus_status = []
    content = json.loads(content[3:].decode('utf-8'))
    status = content['status']
    info = content['info']
    if status == 1 and info != '':
        pqContent = PyQuery(info)('#upInfo li')
        for station in pqContent:
            pqStation = PyQuery(station)
            station_name = pqStation('.station').text()
            stations.append(station_name)
            buses = pqStation.find('.bus')
            if buses.size() > 0:
                left_count = 0
                on_station_count = 0
                for bus in buses:
                    if PyQuery(bus).attr('style'):
                        left_count+=1
                    else:
                        on_station_count+=1
                bus_status.append('{0}-{1}'.format(on_station_count, left_count))
            else:
                bus_status.append(OFF)
    if not stations:
        return None

    return (tuple(bus_status), tuple(stations))
Example #15
def parseProductPage(product, need_img_urls=False):
    """Visit the product detail page and scrape four extra fields:
       delivery, reviews, star, total_sales
    """
    if product['product_url']:
       content = fetchContent(product['product_url'], False)
       doc=PyQuery(content)
       #product['delivery'] = doc("div.cost-entries-type > p > em.value").text()  # shipping cost is rendered by JS and cannot be scraped here
       product['reviews'] = doc('p.satisfaction-number > a > em.value').text()
       product['star'] = doc('p.star-level > i').attr("class")
       product['total_sales'] = doc('p.bargain-number > a > em.value').text()
       if need_img_urls:
           url_list = get_img_urls(content)
           product['img_urls'] = ', '.join(url_list)
       else:
           product['img_urls'] = ''
       product['color'], product['size'] = '', ''
       for index, td in enumerate(doc('div.obj-content > table > tbody > tr > td')):
            tdQ = PyQuery(td)
            if tdQ.attr('class') =='de-feature' and tdQ.text().strip() == u'颜色':
                product['color'] = PyQuery(doc('div.obj-content > table > tbody > tr > td')[index+1]).text()
            if tdQ.attr('class') =='de-feature' and tdQ.text().strip() == u'尺寸':
                product['size'] = PyQuery(doc('div.obj-content > table > tbody > tr > td')[index+1]).text()
       product['MOQ'] = extractNum(doc('tr.amount > td.ladder-1-1 > span.value').text().replace(u"≥", ""))
       if not product['MOQ'] or product['MOQ'] == 0:
           product['MOQ'] = extractNum(PyQuery(doc('tr.amount').remove('td.amount-title').children('td').eq(0))('span.value').text())
       if product['MOQ'] == 1:
           #print product['product_url']
           product['sku_size'] = PyQuery(doc('div.unit-detail-spec-operator').eq(0))('span.text').text()
           product['sku_color'] = PyQuery(doc('table.table-sku > tr').eq(0))('td.name').text()
           product['sku_price'] = PyQuery(doc('table.table-sku > tr').eq(0))('td.price').text()
           product['sku_amount'] = PyQuery(doc('table.table-sku > tr').eq(0))('td.count > span > em.value').text()
           print product['sku_id'], '\t', product['sku_size'], "\t", product['sku_color'], "\t", product['sku_price'], "\t", product['sku_amount']
    return product
Example #16
def parse(html):
    '''return a list of dictionaries describing the stories on the front page'''
    elements = []
    p = PyQuery(html)
    # 90s markup woohoo!
    anchors = p('.title:nth-child(3) a:nth-child(1)')
    for a in anchors:
        # have to re-wrap here, because PyQuery just exposes internal lxml objects upon getting iterated
        a = PyQuery(a)
        subtext = a.closest('tr').next().find('.subtext')
        if not subtext:
            # More link
            continue
        children = [PyQuery(child) for child in subtext.children()]
        try:
            span, submitted, comments = children[0], children[1], children[-1]
        except IndexError:
            # filter out ads
            continue
        comments = comments.text().rpartition(' ')[0]
        comments = int(comments) if comments else 0
        url = a.attr('href')
        elements.append({
                      'pos': len(elements) + 1,
                    'title': a.text(),
                      'url': url,
                   'domain': urlparse(url).netloc.rpartition('www.')[2],
                 'comments': comments,
                'submitter': submitted.text(),
                   'points': int(span.text().split()[0]),
                       'id': int(span.attr('id').split('_', 1)[1]),
                      'ago': submitted[0].tail.split('ago')[0].strip(),
                })
    logging.warning('parsed %s elements', len(elements))
    return elements
Example #17
def _split(inputfile, outputdir):
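    """Split an HTML slide deck into one dump per <section>; each 'stack' section gets its own numbered subdirectory."""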
    source = open(inputfile, 'r')
    html = source.read()
    source.close()

    if not os.path.isdir(outputdir):
        os.mkdir(outputdir)

    idx_slide=0
    idx_section=0

    parsed = PyQuery(html)
    
    for section in parsed('section'):
        slide = PyQuery(section)        
        if slide.has_class('stack'):
            idx_section+=1
            stack_path = os.path.join(outputdir,'%02d' % idx_section )
            os.mkdir(stack_path)
            for sub_slide in PyQuery(slide.html())('section'):
                idx_slide+=1
                _dump_slide(sub_slide, idx_slide, stack_path)
        else: 
            if not slide.parent().has_class('stack'):
                idx_slide+=1
                _dump_slide(slide, idx_slide, outputdir)                    
Example #18
    def build_tree(self, doc, xpath, parent=None, prefix=""):
        tree = []
        nodes = doc.xpath(xpath)

        for (i, node) in enumerate(nodes):
            link = node.xpath(self.link_xpath)
            if not link:
                continue

            title, url = link[0].text, link[0].get("href")
            if not title:
                # when <code> inside it, link[0].text dose not work properly
                html = etree.tostring(link[0]).decode()
                pq = PyQuery(html)
                title = pq.text()
            title = re.sub(r"\s+", " ", title).strip()
            tree_node = TreeNode(title, url)
            tree_node.children = self.build_tree(node, self.children_xpath, tree_node,
                                                 "{}{}.".format(prefix, i + 1))
            if url:
                tree_node.abs_url = urllib.parse.urljoin(self.index_url, tree_node.url)
                # tree_node.save_to = "{}{}_{}.html".format(prefix, i + 1, filter_illegal_path_chars(tree_node.title))
                tree_node.save_to = filter_illegal_path_chars(url_path_to_filename(tree_node.abs_url))
            elif tree_node.children:
                # if parent doesn't have url, then set it to it's first child's url
                tree_node.abs_url = tree_node.children[0].abs_url
                tree_node.save_to = tree_node.children[0].save_to
            else:
                self.logger.warning("no children and no link?")
                continue

            tree.append(tree_node)
        return tree or None
Example #19
def getPageLinkIfValid(element, currentPageNumber):
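    """Return the absolute YouTube URL of a pagination link whose page number is greater than currentPageNumber, else None."""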
    pyElement = PyQuery(element)
    pageNumberText = pyElement.find('span').text()

    if pageNumberText.isdigit() and int(pageNumberText) > currentPageNumber:
        return 'https://www.youtube.com' + pyElement.attr('href')
    return None
Example #20
def get_smileys():
    global n
    logging.info('Fetching the smileys')

    n = 0

    d = PyQuery(url=config.rooturl+'/admin/index.forum?part=themes&sub=avatars&mode=smilies&extended_admin=1&' + tid, opener=fa_opener)
    result = re.search('function do_pagination_start\(\)[^\}]*start = \(start > \d+\) \? (\d+) : start;[^\}]*start = \(start - 1\) \* (\d+);[^\}]*\}', d.text())

    try:
        pages = int(result.group(1))
        usersperpages = int(result.group(2))
    except Exception:
        pages = 1
        usersperpages = 0
        
    if config.debug:
        progress = progressbar.NoProgressBar()
    else:
        progress = progressbar.ProgressBar(widgets=[BarVar(), ' ', progressbar.Bar("#","[","]"), progressbar.Percentage()], maxval=pages-1)
    progress.start()

    for page in range(0,pages):
        if page >= 1:
            d = PyQuery(url=config.rooturl + '/admin/index.forum?part=themes&sub=avatars&mode=smilies&extended_admin=1&start=' + str(page*usersperpages) + '&' + tid, opener=fa_opener)
        
        for i in d('table tr'):
            e = PyQuery(i)
            if e("td").eq(0).text() is not None and e("td").eq(0).attr("colspan") is None:
                save.smileys[e("td").eq(0).text()] = e("td").eq(1).text()
                n += 1
        progress.update(page)

    progress.end()
Example #21
def process_file(filename):
    """Read a file from disk and parse it into a structured dict."""
    try:
        with codecs.open(filename, encoding='utf-8', mode='r') as f:
            file_contents = f.read()
    except IOError as e:
        log.info('Unable to index file: %s, error :%s', filename, e)
        return
    data = json.loads(file_contents)
    sections = []
    title = ''
    body_content = ''
    if 'current_page_name' in data:
        path = data['current_page_name']
    else:
        log.info('Unable to index file due to no name %s', filename)
        return None
    if 'body' in data and data['body']:
        body = PyQuery(data['body'])
        body_content = body.text().replace(u'¶', '')
        sections.extend(generate_sections_from_pyquery(body))
    else:
        log.info('Unable to index content for: %s', filename)
    if 'title' in data:
        title = data['title']
        if title.startswith('<'):
            title = PyQuery(data['title']).text()
    else:
        log.info('Unable to index title for: %s', filename)

    return {'headers': process_headers(data, filename),
            'content': body_content, 'path': path,
            'title': title, 'sections': sections}
Example #22
    def _parse_table(self, table):

        # Initialize table
        parsed_rows = []

        # Parse table
        qtable = PyQuery(table)

        # Get headers
        headers = self._get_headers(qtable)
        if not headers:
            return

        # Get rows
        rows = qtable.find("tr")

        # Loop over rows
        for row in rows:

            # Get columns
            qrow = PyQuery(row)
            cols = qrow.find("td").map(self._get_text)[:]

            # Parse column values
            for colidx in range(len(cols)):
                col = reduce(lambda x, y: re.sub(y[0], y[1], x), self._trans, cols[colidx])
                cols[colidx] = col

            # Append parsed columns
            if cols:
                parsed_rows.append(cols)

        return {"headers": headers, "data": parsed_rows}
Example #23
 def list_page(self, response):
     result_content = {}
 
     content_iter = re.finditer(r"STK && STK.pageletM && STK.pageletM.view\((?P<content>\{.*?\})\)", response.content)
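     # Each STK.pageletM.view({...}) call carries a JSON pagelet; keep the one whose pid is "pl_weibo_direct".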
     for iter in content_iter:
         ok, content = safe_loads(iter.groupdict()['content'])
         if ok and "pl_weibo_direct" == content.get("pid"):
             result_content = content
             break
     else:
         return {}
     
     pyquery_doc = PyQuery(result_content["html"])
     pyquery_doc.make_links_absolute(response.url)
     
     items = []
     for item in pyquery_doc("DIV.feed_lists>DIV.WB_cardwrap>DIV").items():
         weibo_href = item("DIV.content>DIV.feed_from>A").attr.href
         if weibo_href:
             weibo_pics = []
             for pic in item("DIV.feed_content DIV.media_box IMG").items():
                 weibo_pics.append(pic.attr.src)
                 
             data = {
                 "content": item("DIV.feed_content P.comment_txt").text(),
                 "nickname": item("DIV.feed_content A.W_texta").attr.title,
                 "href": weibo_href,
                 "quote_nickname": item("DIV.feed_content DIV.comment DIV.comment_info A.W_texta").attr.title,
                 "quote_content": item("DIV.feed_content DIV.comment DIV.comment_info P.comment_txt").text(),
                 "pics": ''.join(weibo_pics)
             }
             self.crawl("data:,%s" % weibo_href, callback = self.detail_page, data_fetch_content=data)
Example #24
    def update_forums(client, group, session):
        logging.info("Updating forums list for {}".format(group))
        query = Forum.get_forum_page(client, group.gid)
        reg = regex.compile(r"^forum\.php\?mod=forumdisplay&fid=(\d+)$")

        for row in query.find("table.fl_tb>tr"):
            sub_query = PyQuery(row)
            href = sub_query.find("td").eq(1).find("a").attr("href")
            if not href:
                continue

            fid = int(reg.findall(href)[0])

            name = sub_query.find("td").eq(1).find("h2>a").clone().children().remove().end().text()
            last_update = sub_query.find("td").eq(3).find("div>cite").clone().children().remove().end().text()
            last_update = dateparse(last_update)

            existence = session.query(Forum).filter(Forum.fid == fid)
            if existence.count() == 0:
                logging.info("<Forum(fid={})> not found, creating one".format(fid))
                forum = Forum(fid=fid, name=name, updated_at=last_update, group=group, fresh=False)
                session.add(forum)
            else:
                forum = existence.one()
                if forum.updated_at != last_update:
                    logging.info("{} found, stale: against {} ".format(forum, last_update))
                    forum.updated_at = last_update
                    forum.fresh = False
                    session.add(forum)
                else:
                    logging.info("{} found, fresh".format(forum))
Example #25
    def _enhance_text(self):
        """
        Transforms a simplified text into a valid mail.template text.
        :return: mail.template text
        """
        self.ensure_one()
        # Parse and set back the keywords into raw template code
        html_text = PyQuery(self.simplified_text.replace('\n', ''))

        def sort_keywords(kw):
            # Replace first if/for-clauses, then var, then code
            index = kw.position
            if kw.type == 'if' or 'for' in kw.type:
                index += 2*len(self.body_html) * kw.nested_position
                # Take if and for in the appearing order in the text
                index -= kw.position
            elif kw.type == 'var':
                index += len(self.body_html)
            return index

        keywords = self.keyword_ids.sorted(sort_keywords, reverse=True)
        # Replace automatic-generated keywords
        for keyword in keywords:
            keyword_text = html_text('#' + keyword.html_id)
            keyword_text.replace_with(keyword.final_text)

        # Replace user added keywords
        template_text = html_text.html()
        for keyword in keywords.filtered(lambda k: k.type == 'code'):
            to_replace = u"[{}]".format(keyword.short_code)
            template_text = template_text.replace(to_replace, keyword.raw_code)
        final_text = PyQuery(BeautifulSoup(template_text).prettify())
        return final_text('body').html()
Example #26
def get_meme_url(meme):
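    """Look the meme up in GENERATORS and return the 'src' of the large preview image on its memegenerator.net page, or None."""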
    gen = GENERATORS.get(meme)
    if gen:
        pq = PyQuery(url="http://memegenerator.net/%s" % gen[2])
        return pq.find('a img.large').attr('src')
    else:
        return None
Example #27
    def __extract(self, html):
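        """Parse the plan table into {date: {room (smoking status): {'member'|'guest': value}}},
        where each value comes from self.__extract_price_remain()."""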
        pq = PyQuery(html).find("main#main #mainArea table")

        selector_ = "thead tr:eq(0) th"
        date_order = [PyQuery(v).text().split('\n')[0] for v in PyQuery(pq).find(selector_)][3:]
        result = {d: {} for d in date_order}

        index = 0
        total = len(PyQuery(pq).find("tbody tr"))
        while index < total:
            td = PyQuery(pq).find("tbody tr:eq(%d) td:eq(0)" % index)

            room_type = td.text().split()[0]
            rowspan = int(td.attr('rowspan'))

            for i in xrange(index, index + rowspan):
                row = PyQuery(pq).find("tbody tr:eq(%d)" % i)

                # smoking or not
                smoking = PyQuery(row).find("td.alC.alM > img").attr("alt")

                room = "%s (%s)" % (room_type, smoking)

                if row.hasClass('clubCardCell'):
                    member_type = 'member'
                else:
                    member_type = 'guest'

                for i, v in enumerate(self.__extract_price_remain(row)):
                    if room not in result[date_order[i]]:
                        result[date_order[i]][room] = {}
                    result[date_order[i]][room][member_type] = v

            index += rowspan
        return result
Example #28
def get_bounds(scene_name):
    """Use Earth Explorer metadata to get bounds of a Scene"""
    url_code = get_metadata_code(scene_name)

    metadata = PyQuery(
        'http://earthexplorer.usgs.gov/fgdc/%s/%s/' % (url_code, scene_name)
        )
    metadata = metadata.text()[
        metadata.text().find('G-Ring_Latitude:'):
        metadata.text().find('\n  Keywords:')
        ]
    coords = (
        metadata.replace(' ', '')
        .replace('G-Ring_Latitude:', '')
        .replace('G-Ring_Longitude:', '')
        .split('\n')
        )
    coords = [float(coord) for coord in coords if coord != '']
    # create a list of lists with the coordinates
    coords = [coords[i:i + 2] for i in range(0, len(coords), 2)]
    # use reverse() to change [lat, lon] to [lon, lat]
    [coord.reverse() for coord in coords]
    # repeat the first coordinate on the end of the list
    if coords[0] != coords[-1]:
        coords.append(coords[0])
    return coords
Example #29
def download(threadUrl):
    """Collect the attachment links on a forum thread page and queue each file for download with IDM."""
    d = PyQuery(url=threadUrl, parser='soup')
    links = d('a[href^="job.php?action=download&aid="]')

    # Get the value of the verify hash
    tmp = d('script:contains("var verifyhash =")').text()
    verify = re.search(r"var verifyhash = '(.*?)'", tmp).group(1)

    total = len(links)
    d.make_links_absolute()
    for i, e in enumerate(links.items(), start=1):
        filename = e.text()
        print('%s/%s %s' % (i, total, filename))

        if not os.path.exists(os.path.join(SAVE_PATH, filename)):
            params = urlencode(
                {'check': 1, 'verify': verify, 'nowtime': int(time.time() * 1000)})
            url = '%s?%s' % (e.attr['href'], params)

            print('  fetch: ' + url)
            downDoc = PyQuery(url, headers=headers)
            # index 0 is the China Telecom download mirror, index 1 is the China Mobile one
            downUrl = BASE_URL + downDoc('a[href^="remotedown.php"]').eq(1).attr('href')
            addToIDM(downUrl, SAVE_PATH, filename)
            time.sleep(1.5)

    wefiler_urls = checkWefiler(d)
    if wefiler_urls:
        print(wefiler_urls)
Example #30
def get_forums():
    logging.info('Fetching the forums')
    if config.debug:
        progress = progressbar.NoProgressBar()
    else:
        progress = progressbar.ProgressBar(widgets=[progressbar.SimpleProgress('/'), ' ', progressbar.Bar("#","[","]"), progressbar.Percentage()])

    d = PyQuery(url=config.rooturl + '/a-f1/', opener=fa_opener)
    
    save.forums = []
    levels = {}
    n = 1

    for i in progress([i for i in d.find("select option") if i.get("value", "-1") != "-1"]):
        id = i.get("value", "-1")
        logging.debug('Fetching forum %s', id)
        title = re.search('(((\||\xa0)(\xa0\xa0\xa0))*)\|--([^<]+)', i.text).group(5)
        level = len(re.findall('(\||\xa0)\xa0\xa0\xa0', i.text))
        
        if level <= 0:
            parent = 0
        else:
            parent = levels[level-1]
        
        levels[level] = n
        
        d = PyQuery(url=config.rooturl+'/admin/index.forum?part=general&sub=general&mode=edit&fid=' + id + '&extended_admin=1&' + tid, opener=fa_opener)
        try:
            description = d("textarea").text()
        except Exception:
            description = ""
        
        save.forums.append({'id': int(id[1:]), 'newid': n, 'type': id[0], 'parent': parent, 'title': title, 'description': description, 'parsed': False})
        n += 1
Example #31
 def get_content(self, beta=False):
     url = self.url_list_betas if beta else self.url_list
     response = self.client.get(url)
     assert response.status_code == 200
     return PyQuery(response.content)
Example #32
    def getTweets(tweetCriteria,
                  receiveBuffer=None,
                  bufferLength=100,
                  user_agent=None):
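        """Page through Twitter's search timeline via TweetManager.getJsonReponse, turning each rendered tweet into a models.Tweet; batches of bufferLength are flushed to receiveBuffer when provided."""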
        refreshCursor = ''

        results = []
        resultsAux = []
        cookieJar = http.cookiejar.CookieJar()

        active = True

        while active:
            json = TweetManager.getJsonReponse(tweetCriteria, refreshCursor,
                                               cookieJar, user_agent)
            if len(json['items_html'].strip()) == 0:
                break

            refreshCursor = json['min_position']
            tweets = PyQuery(json['items_html'])('div.js-stream-tweet')

            if len(tweets) == 0:
                break

            for tweetHTML in tweets:
                tweetPQ = PyQuery(tweetHTML)
                tweet = models.Tweet()

                usernameTweet = tweetPQ(
                    "span.username.js-action-profile-name b").text()
                txt = re.sub(
                    r"\s+", " ",
                    tweetPQ("p.js-tweet-text").text().replace('# ',
                                                              '#').replace(
                                                                  '@ ', '@'))
                retweets = int(
                    tweetPQ(
                        "span.ProfileTweet-action--retweet span.ProfileTweet-actionCount"
                    ).attr("data-tweet-stat-count").replace(",", ""))
                favorites = int(
                    tweetPQ(
                        "span.ProfileTweet-action--favorite span.ProfileTweet-actionCount"
                    ).attr("data-tweet-stat-count").replace(",", ""))
                dateSec = int(
                    tweetPQ("small.time span.js-short-timestamp").attr(
                        "data-time"))
                id = tweetPQ.attr("data-tweet-id")
                permalink = tweetPQ.attr("data-permalink-path")
                user_id = int(
                    tweetPQ("a.js-user-profile-link").attr("data-user-id"))

                geo = ''
                geoSpan = tweetPQ('span.Tweet-geo')
                if len(geoSpan) > 0:
                    geo = geoSpan.attr('title')
                urls = []
                for link in tweetPQ("a"):
                    try:
                        urls.append((link.attrib["data-expanded-url"]))
                    except KeyError:
                        pass
                tweet.id = id
                tweet.permalink = 'https://twitter.com' + permalink
                tweet.username = usernameTweet

                tweet.text = txt
                tweet.date = datetime.datetime.fromtimestamp(dateSec)
                tweet.formatted_date = datetime.datetime.fromtimestamp(
                    dateSec).strftime("%a %b %d %X +0000 %Y")
                tweet.retweets = retweets
                tweet.favorites = favorites
                tweet.mentions = " ".join(
                    re.compile('(@\\w*)').findall(tweet.text))
                tweet.hashtags = " ".join(
                    re.compile('(#\\w*)').findall(tweet.text))
                tweet.geo = geo
                tweet.urls = ",".join(urls)
                tweet.author_id = user_id

                results.append(tweet)
                resultsAux.append(tweet)

                if receiveBuffer and len(resultsAux) >= bufferLength:
                    receiveBuffer(resultsAux)
                    resultsAux = []

                if tweetCriteria.maxTweets > 0 and len(
                        results) >= tweetCriteria.maxTweets:
                    print("active = False")
                    active = False
                    break

        if receiveBuffer and len(resultsAux) > 0:
            receiveBuffer(resultsAux)

        return results
Example #33
 result = browser.page_source
 if companyname not in result:
     for cs in range(10):
         if companyname not in result:
             print("refreshing the page")
             browser.refresh()
             browser.switch_to_alert().accept()
             time.sleep(4)
             result = browser.page_source  # re-read the source so the loop condition can change
         else:
             break
 if companyname not in result:
     print("no information found for this company")
     browser.quit()
     break
 rq = PyQuery(result)
 alist = rq('.search_list_item')
 for i in alist:
     ad = PyQuery(i)
     t = ad('.f20').text().replace(" ", "")
     if ad('.f20').text() in companyname:
         xqurl = PyQuery(i).attr('href')
         break
 # with open('gjxy.html', 'r', encoding='utf') as f:
 #     html = f.read()
 #     f.close()
 headers = {
     "Accept-Language":
     "zh-CN,zh;q=0.9",
     "Accept-Encoding":
     "gzip, deflate",
Example #34
option = webdriver.ChromeOptions()
option.add_argument('headless')
browser = webdriver.Chrome(options=option)  # pass the headless options, otherwise they are never applied

base_url1 = r'https://cise.jsu.edu.cn/'

base_url = r'https://cise.jsu.edu.cn/xygk/ggjs'
all_page = ['.htm', '/1.htm', '/2.htm']

for i in all_page:
    url = base_url + i
    browser.get(url)
    page = browser.page_source

    h = PyQuery(page)
    all_pe = h(
        'body > div.ntp > div > div > div.cla2 > div.cla22 > div.cla222 > div.gugan > div.gugan1 >a'
    )
    for i in all_pe.items():
        s = str(i.attr('href'))
        url1 = base_url1 + '/'.join(re.findall(
            r'\w+', s)[:-1]) + '.' + re.findall(r'\w+', s)[-1]
        r = browser.get(url1)
        k = PyQuery(browser.page_source)
        s = re.sub(r'<.+>', '', k('#vsb_content > div').text())
        if s == '':
            s = '暂无'
        print(s[0:4])
        print('====================')
        print(s)
Example #35
 def get_content(self):
     response = self.client.get(self.url_list)
     assert response.status_code == 200
     return PyQuery(response.content)
Example #36
def _zope_testbrowser_pyquery(self):
    from pyquery import PyQuery
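    # Drop the XHTML namespace declaration so CSS selectors match elements without a namespace prefix.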
    return PyQuery(
        self.contents.replace('xmlns="http://www.w3.org/1999/xhtml', ''))
Example #37
def main():
    global output_dir, generate_pdf, use_cache, max_retries, error_timeout, interactive

    parser = argparse.ArgumentParser(
        description='A downloader for the digi4school open library')
    parser.add_argument('-s',
                        '--start',
                        type=float,
                        action='store',
                        dest='start',
                        default=0,
                        required=False,
                        help='Start the download at START percent')
    parser.add_argument('-e',
                        '--end',
                        type=float,
                        action='store',
                        dest='end',
                        default=100,
                        required=False,
                        help='Stop the download at END percent')
    parser.add_argument(
        '-o',
        '--output-directory',
        type=str,
        action='store',
        dest='output_dir',
        default=output_dir,
        required=False,
        help='The directory into which the books should be downloaded')
    parser.add_argument('-g',
                        '--generate-pdf',
                        action='store_true',
                        dest='generate_pdf',
                        required=False,
                        default=None,
                        help='Generate a pdf when all files are downloaded')
    parser.add_argument(
        '-ng',
        '--no-generate-pdf',
        action='store_false',
        dest='generate_pdf',
        required=False,
        default=None,
        help='Do NOT generate a pdf when all files are downloaded')
    parser.add_argument('-u',
                        '--use-cache',
                        action='store_true',
                        dest='use_cache',
                        required=False,
                        default=None,
                        help='Use already downloaded (cached) extra files')
    parser.add_argument('-nu',
                        '--no-use-cache',
                        action='store_false',
                        dest='use_cache',
                        required=False,
                        default=None,
                        help='Download extra files again')
    parser.add_argument('-i',
                        '--interactive',
                        action='store_true',
                        dest='interactive',
                        required=False,
                        default=None,
                        help='Prompt before starting download')
    parser.add_argument(
        '-ni',
        '--no-interactive',
        action='store_false',
        dest='interactive',
        required=False,
        default=None,
        help='Do not prompt before starting download, start right away')
    parser.add_argument(
        '-m',
        '--max-retries',
        type=int,
        action='store',
        dest='max_retries',
        default=max_retries,
        required=False,
        help='Retry downloading MAX_RETRIES times before skipping the book')
    parser.add_argument(
        '-t',
        '--error-timeout',
        type=float,
        action='store',
        dest='error_timeout',
        default=error_timeout,
        required=False,
        help='Wait ERROR_TIMEOUT seconds before retrying the download')

    args = parser.parse_args()
    start = args.start
    end_percent = args.end
    output_dir = args.output_dir
    max_retries = args.max_retries
    error_timeout = args.error_timeout

    if args.generate_pdf is not None:
        generate_pdf = args.generate_pdf
    if args.use_cache is not None:
        use_cache = args.use_cache
    if args.interactive is not None:
        interactive = args.interactive

    if len(output_dir) < 1:
        output_dir = input("Output directory: ")

    Path(output_dir).mkdir(parents=True, exist_ok=True)

    signal.signal(signal.SIGUSR1, handle_usr1)

    r = requests.post(base_url + openshelf_path,
                      get_all_books_payload,
                      headers=headers)
    r.encoding = encoding
    books = PyQuery(r.text)('#shelf').find('a')
    book_count = len(books)

    print()
    print(str(book_count) + ' books')
    print("Output directory: " + output_dir)

    if interactive:
        input('Press [ENTER] to start the download')

    if start > 0:
        print("\nSkipping first %.2f %%..." % start)

    i = 0
    for book in books.items():
        if stop:
            stop_program()

        i += 1
        percent = (i * 100 / book_count)

        if percent <= start:
            continue

        if percent > end_percent:
            print("Stopping at %.2f %% ..." % end_percent)
            break

        book_id = book.attr['data-id']
        title = book('h1').text().replace('/', '-')
        current_path = os.path.join(output_dir, book_id)
        Path(current_path).mkdir(parents=True, exist_ok=True)
        print('\n\nDownloading book "' + book_id + "\" (%.2f %%)" % percent)

        if generate_pdf:
            if os.path.isfile(os.path.join(current_path, title + '.pdf')):
                print('Found existing PDF, skipping...')
                continue
        else:
            if os.path.isfile(os.path.join(current_path, 'generate-pdf.sh')):
                print('Found PDF generation script, skipping...')
                continue

        # Writing info about book
        with open(os.path.join(current_path, 'info.txt'),
                  'w',
                  encoding=encoding) as f:
            f.writelines(
                os.linesep.join([
                    u"Thumbnail: %s" % str(book('img').attr['src']),
                    u"Title: %s" % str(book('h1').text()),
                    u"Publisher: %s" % str(book('h2').text())
                ]))

        if stop:
            stop_program()

        count = 0
        end = 1
        orig_book_id = book_id
        while count < end:
            book_id = orig_book_id
            count += 1
            try:
                cookie_request = send_form(
                    send_form(
                        requests.get(base_url + token_path + book_id,
                                     headers=headers)))
                cookie_str = ''
                for cookie in cookie_request.cookies:
                    if cookie.name == target_cookie_name:
                        cookie_str = cookie.name + '=' + cookie.value + '; '

                if len(cookie_str) < 1:
                    end = handle_error('ERROR: Cookie not found!', count, end)
                    continue

                if stop:
                    stop_program()

                location = cookie_request.headers['Location']
                if len(location) < 1:
                    location = book_base_url + book_path + book_id + '/'
                    print(
                        'WARNING: Can\'t find book location in header, assuming '
                        + location)

                r = requests.get(location,
                                 headers=headers,
                                 cookies=cookie_request.cookies,
                                 allow_redirects=False)
                r.encoding = encoding
                if r.status_code == 200:
                    if 'IDRViewer' not in r.text and '<div id="content">' in r.text:
                        print('Found extra content!')
                        print('Downloading extra content...')

                        book_content = PyQuery(r.text)("#content")
                        extra_path = os.path.join(current_path, 'extra')
                        extra_books = []
                        Path(extra_path).mkdir(parents=True, exist_ok=True)

                        # Download root files
                        for node in book_content(
                                'a:not(.sub):not(.directory)').items():
                            if not str(node.attr['href']).startswith('1/'):
                                thumbnail_location = str(
                                    node('img').attr['src'])
                                if thumbnail_location.endswith(
                                        'thumbnails/1.jpg'
                                ) and not thumbnail_location.startswith(
                                        'http'):
                                    extra_books.append([
                                        str(node.attr['href']).replace(
                                            '/index.html', ''),
                                        node('h1').text().replace('/', '-')
                                    ])
                                else:
                                    download_content(node, location,
                                                     cookie_request.cookies,
                                                     extra_path)

                        if stop:
                            stop_program()

                        # Download content of all root directories
                        for root_dir_node in book_content(
                                'a:not(.sub).directory').items():
                            root_dir = os.path.join(
                                extra_path,
                                root_dir_node('h1').text().replace('/', '-'))
                            Path(root_dir).mkdir(parents=True, exist_ok=True)
                            download_content_from_directory(
                                book_content, root_dir_node.attr['id'],
                                location, cookie_request.cookies, root_dir)

                        print('Checking book_id ' + str(book_id) + '/1...')
                        r = requests.get(location + "1/",
                                         headers=headers,
                                         cookies=cookie_request.cookies)

                        if 'IDRViewer' not in r.text:
                            print(
                                'WARNING: Book "' + book_id +
                                '/1" looks weird (contains no "IDRViewer")! Checking extra books...'
                            )
                            if len(extra_books) == 1:

                                # COMPATIBILITY
                                if os.path.exists(
                                        os.path.join(extra_path,
                                                     extra_books[0][1])):
                                    print(
                                        'INFO: For compatibility reasons, normal book download will be skipped!'
                                    )
                                    book_id = None
                                else:
                                    book_id = orig_book_id + '/' + extra_books[
                                        0][0]
                                    print(
                                        'Found one extra book, setting book_id to '
                                        + str(book_id))
                                    r = requests.get(
                                        location + extra_books[0][0] + '/',
                                        headers=headers,
                                        cookies=cookie_request.cookies)
                                    extra_books.pop(0)
                            elif len(extra_books) > 1:
                                print(
                                    'WARNING: Found more than one extra book, skipping normal book download!'
                                )
                                book_id = None
                            else:
                                print(
                                    'WARNING: Found no extra book, skipping normal book download!'
                                )
                                book_id = None
                        else:
                            book_id += '/1'
                            print('Setting book_id to ' + str(book_id))
                            r = requests.get(location + '1/',
                                             headers=headers,
                                             cookies=cookie_request.cookies)

                        for extra_book in extra_books:
                            if stop:
                                stop_program()

                            print('Downloading extra book "' + extra_book[0] +
                                  '"...')
                            er = requests.get(location + extra_book[0] + '/',
                                              headers=headers,
                                              cookies=cookie_request.cookies)
                            if 'IDRViewer' not in er.text:
                                print(
                                    'WARNING: Extra book "' + extra_book[0] +
                                    '" looks weird (contains no "IDRViewer")! Skipping...'
                                )
                                continue
                            if download_book(
                                    extra_book[1], orig_book_id + '/' +
                                    extra_book[0], cookie_str,
                                    os.path.join(extra_path, extra_book[1]),
                                    er).returncode != 0:
                                end = handle_error(
                                    'ERROR: Error running digiRipper!', count,
                                    end)
                                raise ConnectionError(
                                    'Error downloading extra book "' +
                                    extra_book[0] + '"!')
                        print('Downloaded extra content to "' + extra_path +
                              '"')
                else:
                    print(
                        'WARNING: Got wrong response code from book page, skipping check for extra material!'
                    )

                if stop:
                    stop_program()

                if book_id is not None:
                    if download_book(title, book_id, cookie_str, current_path,
                                     r).returncode != 0:
                        end = handle_error('ERROR: Error running digiRipper!',
                                           count, end)
                        continue
                else:
                    Path(os.path.join(current_path, 'generate-pdf.sh')).touch()

            except Exception as e:
                end = handle_error('ERROR: An exception was thrown: ' + str(e),
                                   count, end)
                continue
Example #38
    def detail(self, url):
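        """Fetch a product detail page and return structured data (brand, name, price, color, sizes, images) via tool.return_data; a 404 is reported as sold out and other HTTP errors via tool.get_error."""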
        try:
            resp = self.session.get(url, verify=False)

            status_code = resp.status_code
            pqhtml = PyQuery(resp.text or 'nothing')
            # Delisted (sold out)
            if status_code == 404:

                log_info = json.dumps(
                    dict(time=time.time(),
                         title=pqhtml('title').text(),
                         url=url))

                self.logger.info(log_info)

                data = tool.get_off_shelf(code=status_code,
                                          message=self.cfg.SOLD_OUT,
                                          backUrl=resp.url,
                                          html=pqhtml.outerHtml())

                return tool.return_data(successful=False, data=data)

            # Other errors
            if status_code != 200:

                log_info = json.dumps(
                    dict(time=time.time(),
                         title=pqhtml('title').text(),
                         url=url))

                self.logger.info(log_info)

                data = tool.get_error(code=status_code,
                                      message=self.cfg.GET_ERR.get(
                                          'SCERR', 'ERROR'),
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())

                return tool.return_data(successful=False, data=data)

            # Preparation
            area = pqhtml('.product-details')
            pdata = self.get_pdata(pqhtml)

            # print json.dumps(pdata)
            print area.outerHtml().encode('utf-8')
            # exit()

            # Delisted items:
            # isNoSize means the site has no size options configured; shippingRestrictionsVisible means shipping is restricted for the item.
            # pdata['shippingRestrictions']['shippingRestrictionsVisible']:
            # everything is read from pdata; a delisted item simply shows zero stock everywhere

            detail = dict()

            # Brand
            brand = pdata['brandName']
            detail['brand'] = brand

            # Name
            detail['name'] = pdata['name']

            # Currency
            currency = pdata['price']['currency']
            detail['currency'] = currency
            detail['currencySymbol'] = tool.get_unit(currency)

            # Price
            price, listPrice = pdata['price']['current'], (
                pdata['price']['rrp'] or pdata['price']['previous'])
            detail['price'] = price
            detail['listPrice'] = listPrice or price

            # Color
            color, colorId, img = self.get_color(pdata)
            detail['color'] = color
            detail['colorId'] = colorId

            # Image set; append a width parameter of 1000 (large image) to each URL
            imgs = map(
                lambda x: x + '?wid=1000',
                filter(lambda x: x, [Dic['url'] for Dic in pdata['images']]))
            detail['img'] = img
            detail['imgs'] = imgs

            # Product ID
            productId = pdata['id']
            detail['productId'] = productId

            # Sizes
            detail['sizes'] = self.get_sizes(pdata)

            # Description
            detail['descr'] = area('.product-description').text()

            # Details
            detail['detail'] = area('.product-details').text()

            # Brand description
            detail['brandDescr'] = area('.brand-description').text()

            # HTTP status code
            detail['status_code'] = status_code

            # Status
            detail['status'] = self.cfg.STATUS_SALE

            # Response URL
            detail['backUrl'] = resp.url

            # Responding IP and port
            if resp.raw._original_response.peer:
                detail['ip_port'] = ':'.join(
                    map(lambda x: str(x), resp.raw._original_response.peer))

            log_info = json.dumps(
                dict(time=time.time(),
                     productId=detail['productId'],
                     name=detail['name'],
                     currency=detail['currency'],
                     price=detail['price'],
                     listPrice=detail['listPrice'],
                     url=url))

            self.logger.info(log_info)

            return tool.return_data(successful=True, data=detail)

        except Exception:
            # Fall back to the legacy page-layout parser; if that also fails,
            # let the exception propagate.
            return self.detail_old(url)
Пример #39
0
def downLoad(href):
    # requests.get's second positional argument is params, so pass the
    # headers explicitly via the headers= keyword
    r = requests.get(href, headers=getHeaders())
    r.encoding = 'utf8'
    q = PyQuery(r.text)
    # Total number of images under this link
    total = int(q('#page > a:nth-child(9)').text())
    # Title of this page, used to name the download folder
    title = q('body > div.main > div.article > h2').text()
    href = q('div.content > a> img').attr.src[0:-5]
    dirName = u"【{}P】{}".format(total, title)
    # Directory to create for this gallery
    mkpath = "c:\\mymeizi\\" + dirName + "\\"
    if makedir(mkpath):
        print mkpath + " directory created"
        for x in range(1, total + 1):
            try:
                imgurl = href + str(x) + ".jpg"
                # Earlier attempts used urllib / urllib2 with a custom opener and
                # manually installed headers; they are superseded by the requests
                # call with explicit headers below.
                headers = {
                    'Accept':
                    'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                    'Accept-Encoding':
                    'gzip, deflate, sdch',
                    'Accept-Language':
                    'zh-CN,zh;q=0.8',
                    'Cache-Control':
                    'max-age=0',
                    'Connection':
                    'keep-alive',
                    'DNT':
                    '1',
                    'Host':
                    'img.mmjpg.com',
                    'Referer':
                    'http://www.mmjpg.com',
                    'Upgrade-Insecure-Requests':
                    '1',
                    'User-Agent':
                    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.3427.400 QQBrowser/9.6.12513.400',
                }
                print imgurl
                r = requests.get(imgurl, headers=headers)
                print r.status_code
                with open(mkpath + "/%s.jpg" % x, "wb") as code:
                    code.write(r.content)
            except Exception:
                print "A minor error occurred; skipping this image"
                continue
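# A hypothetical usage sketch for downLoad (getHeaders() and makedir() are
# assumed to be defined elsewhere in this script; the URL is only a placeholder
# for a gallery page on the site being scraped):
# downLoad("http://www.mmjpg.com/mm/1188")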
Пример #40
0
    def detail_old(self, url):
        try:

            resp = self.session.get(url, verify=False)

            status_code = resp.status_code
            pqhtml = PyQuery(resp.text or 'nothing')

            # Error responses
            if status_code != 200:

                log_info = json.dumps(
                    dict(time=time.time(),
                         title=pqhtml('title').text(),
                         url=url))

                self.logger.info(log_info)

                data = tool.get_error(code=status_code,
                                      message=self.cfg.GET_ERR.get(
                                          'SCERR', 'ERROR'),
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())

                return tool.return_data(successful=False, data=data)

            product = self.get_product_cfg_old(pqhtml)

            # Off the shelf (sold out)
            if product is None or product['AvailableSkus'] == 0:

                log_info = json.dumps(
                    dict(time=time.time(),
                         title=pqhtml('title').text(),
                         url=url))

                self.logger.info(log_info)

                data = tool.get_off_shelf(code=status_code,
                                          message=self.cfg.SOLD_OUT,
                                          backUrl=resp.url,
                                          html=pqhtml.outerHtml())

                return tool.return_data(successful=False, data=data)

            price, oldPrice = self.get_all_price_old(pqhtml, product)

            colors_tmp = self.get_colors_old(pqhtml)

            detail = dict()

            detail['name'] = product['ProductName']

            detail['brand'] = product['ProductBrand']

            detail['currencySymbol'] = tool.get_unit(
                product['ProductCurrency'])

            detail['currency'] = product['ProductCurrency']

            detail['descr'] = product['ProductDescription'].replace(
                '&nbsp;', '')

            detail['productId'] = product['ProductCode']

            detail['price'] = price

            detail['listPrice'] = oldPrice

            detail['keys'] = [color['key'] for color in colors_tmp]

            detail['color'] = dict([(color['key'], color['name'])
                                    for color in colors_tmp])

            detail['colorId'] = dict([(color['key'], color['value'])
                                      for color in colors_tmp])

            # Image information
            imgs_tmp = self.get_imgs_old(pqhtml)
            detail['imgs'] = imgs_tmp
            detail['img'] = dict([(name, links[0])
                                  for name, links in imgs_tmp.items()])

            detail['sizes'] = self.get_size_old(pqhtml)

            detail['url'] = url

            detail['status_code'] = status_code

            # Status
            detail['status'] = self.cfg.STATUS_SALE

            detail['backUrl'] = resp.url

            log_info = json.dumps(
                dict(time=time.time(),
                     productId=detail['productId'],
                     name=detail['name'],
                     currency=detail['currency'],
                     price=detail['price'],
                     listPrice=detail['listPrice'],
                     url=url))

            self.logger.info(log_info)

            return tool.return_data(successful=True, data=detail)

        except Exception:
            raise
Пример #41
0
    def getTweets(tweetCriteria,
                  receiveBuffer=None,
                  bufferLength=100,
                  proxy=None):
        refreshCursor = ''

        results = []
        resultsAux = []
        cookieJar = cookielib.CookieJar()

        if hasattr(tweetCriteria, 'username') and (
                tweetCriteria.username.startswith("\'")
                or tweetCriteria.username.startswith("\"")) and (
                    tweetCriteria.username.endswith("\'")
                    or tweetCriteria.username.endswith("\"")):
            tweetCriteria.username = tweetCriteria.username[1:-1]

        active = True
        cnt, i = 0, 0
        while active:
            json = TweetManager.getJsonReponse(tweetCriteria, refreshCursor,
                                               cookieJar, proxy)
            if len(json['items_html'].strip()) == 0:
                break

            refreshCursor = json['min_position']
            scrapedTweets = PyQuery(json['items_html'])
            #Remove incomplete tweets withheld by Twitter Guidelines
            scrapedTweets.remove('div.withheld-tweet')
            tweets = scrapedTweets('div.js-stream-tweet')

            if len(tweets) == 0:
                break

            for tweetHTML in tweets:
                i += 1
                tweetPQ = PyQuery(tweetHTML)
                tweet = models.Tweet()
                print "tweeeeeeeeeet", tweetHTML, "----------", tweetPQ(
                    "p.js-tweet-text"), "-----------", tweetPQ(
                        "p.js-tweet-text").attr("class")
                a = [img for img in tweetPQ("p.js-tweet-text").items('img')]
                emojis = []
                for img in a:
                    if img.attr("class") == "Emoji Emoji--forText":
                        emojis.append(img.attr("aria-label"))
                if len(a) != 0:
                    cnt += 1
                # print a, cnt, i, len(emojis)  # debug counter output, disabled
                usernameTweet = tweetPQ("span:first.username.u-dir b").text()
                txt = re.sub(
                    r"\s+", " ",
                    tweetPQ("p.js-tweet-text").text().replace('# ',
                                                              '#').replace(
                                                                  '@ ', '@'))
                retweets = int(
                    tweetPQ(
                        "span.ProfileTweet-action--retweet span.ProfileTweet-actionCount"
                    ).attr("data-tweet-stat-count").replace(",", ""))
                favorites = int(
                    tweetPQ(
                        "span.ProfileTweet-action--favorite span.ProfileTweet-actionCount"
                    ).attr("data-tweet-stat-count").replace(",", ""))
                dateSec = int(
                    tweetPQ("small.time span.js-short-timestamp").attr(
                        "data-time"))
                id = tweetPQ.attr("data-tweet-id")
                permalink = tweetPQ.attr("data-permalink-path")

                geo = ''
                geoSpan = tweetPQ('span.Tweet-geo')
                if len(geoSpan) > 0:
                    geo = geoSpan.attr('title')

                tweet.id = id
                tweet.permalink = 'https://twitter.com' + permalink
                tweet.username = usernameTweet
                tweet.text = txt
                tweet.date = datetime.datetime.fromtimestamp(dateSec)
                tweet.retweets = retweets
                tweet.favorites = favorites
                tweet.mentions = " ".join(
                    re.compile('(@\\w*)').findall(tweet.text))
                tweet.hashtags = " ".join(
                    re.compile('(#\\w*)').findall(tweet.text))
                tweet.geo = geo
                tweet.emoji = emojis
                if (len(emojis) > 0):
                    results.append(tweet)
                resultsAux.append(tweet)

                if receiveBuffer and len(resultsAux) >= bufferLength:
                    receiveBuffer(resultsAux)
                    resultsAux = []

                if tweetCriteria.maxTweets > 0 and len(
                        results) >= tweetCriteria.maxTweets:
                    active = False
                    break

        if receiveBuffer and len(resultsAux) > 0:
            receiveBuffer(resultsAux)

        return results
	def getTweets(tweetCriteria, receiveBuffer = None, bufferLength = 100):
		refreshCursor = ''
	
		results = []
		resultsAux = []
		cookieJar = http.cookiejar.CookieJar()

		active = True

		while active:
			json = TweetManager.getJsonReponse(tweetCriteria, refreshCursor, cookieJar)
			if len(json['items_html'].strip()) == 0:
				break

			refreshCursor = json['min_position']			
			tweets = PyQuery(json['items_html'])('div.js-stream-tweet')
			
			if len(tweets) == 0:
				break
			
			for tweetHTML in tweets:
				tweetPQ = PyQuery(tweetHTML)
				tweet = Tweet.Tweet()
				
				usernameTweet = tweetPQ("span.username.js-action-profile-name b").text();
				txt = re.sub(r"\s+", " ", tweetPQ("p.js-tweet-text").text().replace('# ', '#').replace('@ ', '@'));
				retweets = int(tweetPQ("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""));
				favorites = int(tweetPQ("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""));
				dateSec = int(tweetPQ("small.time span.js-short-timestamp").attr("data-time"));
				id = tweetPQ.attr("data-tweet-id");
				permalink = tweetPQ.attr("data-permalink-path");
				
				geo = ''
				geoSpan = tweetPQ('span.Tweet-geo')
				if len(geoSpan) > 0:
					geo = geoSpan.attr('title')
				
				tweet.id = id
				tweet.permalink = 'https://twitter.com' + permalink
				tweet.username = usernameTweet
				tweet.text = txt
				tweet.date = datetime.datetime.fromtimestamp(dateSec)
				tweet.retweets = retweets
				tweet.favorites = favorites
				tweet.mentions = " ".join(re.compile('(@\\w*)').findall(tweet.text))
				tweet.hashtags = " ".join(re.compile('(#\\w*)').findall(tweet.text))
				tweet.geo = geo
				
				results.append(tweet)
				resultsAux.append(tweet)
				
				if receiveBuffer and len(resultsAux) >= bufferLength:
					receiveBuffer(resultsAux)
					resultsAux = []
				
				if tweetCriteria.maxTweets > 0 and len(results) >= tweetCriteria.maxTweets:
					active = False
					break
					
		
		if receiveBuffer and len(resultsAux) > 0:
			receiveBuffer(resultsAux)
		
		return results
    def dispatch(self):
        if self.annual_item_list is None:
            raise IndexError("No annual-report pages were fetched, or fetching them failed")

        if len(self.annual_item_list) <= 0:
            return {}

        dict_annual = {}
        for lst_annual in self.annual_item_list:
            url = lst_annual.get('url')
            if 'entinfo' in url:
                dict_annual['baseinfo'] = lst_annual.get('text')
            elif 'WebsiteInfo' in url:
                dict_annual['websiteinfo'] = util.json_loads(
                    lst_annual.get('text'))
            elif 'subcapitalInfo' in url:
                dict_annual['subcapital'] = util.json_loads(
                    lst_annual.get('text'))
            elif 'forinvestMentInfo' in url:
                dict_annual['forinvestment'] = util.json_loads(
                    lst_annual.get('text'))
            elif 'GuaranteeInfo' in url:
                dict_annual['forguaranteeinfo'] = util.json_loads(
                    lst_annual.get('text'))
            elif 'alterStockInfo' in url:
                dict_annual['alterstockinfo'] = util.json_loads(
                    lst_annual.get('text'))
            elif 'updateinfo' in url:
                dict_annual['updateinfo'] = util.json_loads(
                    lst_annual.get('text'))

        # Basic information
        base_info = dict_annual.get('baseinfo')
        if base_info is not None:
            info = PyQuery(base_info, parser='html').find('.encounter-info')
            annual_base_info = self.zj_get_annual_base_info(info)
            self.annual_info_dict.update(annual_base_info)

        # Website / online-store information
        web_info = dict_annual.get('websiteinfo')
        if web_info is not None:
            lst_websites = self.zj_get_annual_web_site_info(web_info)
            self.annual_info_dict[AnnualReports.WEBSITES] = lst_websites

        # Shareholder capital-contribution information
        share_hold_info = dict_annual.get('subcapital')
        if share_hold_info is not None:
            lst_share_hold = self.zj_get_annual_share_hold_info(
                share_hold_info)
            self.annual_info_dict[
                AnnualReports.SHAREHOLDER_INFORMATION] = lst_share_hold

        # Outbound investments
        inv_info = dict_annual.get('forinvestment')
        if inv_info is not None:
            lst_inv = self.zj_get_annual_inv_info(inv_info)
            self.annual_info_dict[AnnualReports.INVESTED_COMPANIES] = lst_inv

        # Annual report: enterprise asset status information
        base_info = dict_annual.get('baseinfo')
        if base_info is not None:
            tds = PyQuery(base_info, parser='html').find('.table-zichan').not_(
                '.table-td-pd').find('td')
            asset_model = self.zj_get_annual_asset_info(tds)
            self.annual_info_dict[
                AnnualReports.
                ENTERPRISE_ASSET_STATUS_INFORMATION] = asset_model

        # External guarantees
        out_guaranty_info = dict_annual.get('forguaranteeinfo')
        if out_guaranty_info is not None:
            lst_out_guaranty = self.zj_get_annual_out_guarantee_info(
                out_guaranty_info)
            self.annual_info_dict[
                AnnualReports.OUT_GUARANTEE_INFO] = lst_out_guaranty

        # Equity (shareholding) changes
        edit_shareholding_change_info = dict_annual.get('alterstockinfo')
        if edit_shareholding_change_info is not None:
            lst_edit_shareholding_change = self.zj_get_annual_edit_shareholding_change(
                edit_shareholding_change_info)
            self.annual_info_dict[
                AnnualReports.
                EDIT_SHAREHOLDING_CHANGE_INFOS] = lst_edit_shareholding_change

        # Modification records
        edit_change_info = dict_annual.get('updateinfo')
        if edit_change_info is not None:
            lst_edit_change = self.zj_get_annual_edit_change(edit_change_info)
            self.annual_info_dict[
                AnnualReports.EDIT_CHANGE_INFOS] = lst_edit_change
Пример #44
0
    def collectTweets(tweetCriteria,
                      receiveBuffer=None,
                      bufferLength=100,
                      proxy=None,
                      debug=False):
        results = []
        resultsAux = []
        cookieJar = http.cookiejar.CookieJar()
        user_agent = random.choice(TweetManager.user_agents)

        all_usernames = []
        usernames_per_batch = 20

        if hasattr(tweetCriteria, 'username'):
            if type(tweetCriteria.username) == str or not hasattr(
                    tweetCriteria.username, '__iter__'):
                tweetCriteria.username = [tweetCriteria.username]

            usernames_ = [u.lstrip('@') for u in tweetCriteria.username if u]
            all_usernames = sorted({u.lower() for u in usernames_ if u})
            n_usernames = len(all_usernames)
            n_batches = n_usernames // usernames_per_batch + (
                n_usernames % usernames_per_batch > 0)
        else:
            n_batches = 1

        for batch in range(n_batches):
            refreshCursor = ''
            batch_count_results = 0

            if all_usernames:
                tweetCriteria.username = all_usernames[
                    batch * usernames_per_batch:batch * usernames_per_batch +
                    usernames_per_batch]

            active = True
            while active:
                json = TweetManager.getJsonResponse(tweetCriteria,
                                                    refreshCursor,
                                                    cookieJar,
                                                    proxy,
                                                    user_agent,
                                                    debug=debug)
                if len(json['items_html'].strip()) == 0:
                    break

                refreshCursor = json['min_position']
                scrapedTweets = PyQuery(json['items_html'])
                scrapedTweets.remove('div.withheld-tweet')
                tweets = scrapedTweets('div.js-stream-tweet')

                if len(tweets) == 0:
                    break

                for tweetHTML in tweets:
                    tweetPQ = PyQuery(tweetHTML)
                    tweet = models.Tweet()

                    usernames = tweetPQ("span.username.u-dir b").text().split()
                    if not len(usernames):
                        continue

                    tweet.username = usernames[0]
                    tweet.to = usernames[1] if len(usernames) >= 2 else None
                    tweet.text = re.sub(r"\s+", " ", tweetPQ("p.js-tweet-text").text()) \
                        .replace('# ', '#').replace('@ ', '@').replace('$ ', '$')
                    tweet.retweets = int(
                        tweetPQ(
                            "span.ProfileTweet-action--retweet span.ProfileTweet-actionCount"
                        ).attr("data-tweet-stat-count").replace(",", ""))
                    tweet.favorites = int(
                        tweetPQ(
                            "span.ProfileTweet-action--favorite span.ProfileTweet-actionCount"
                        ).attr("data-tweet-stat-count").replace(",", ""))
                    tweet.replies = int(
                        tweetPQ(
                            "span.ProfileTweet-action--reply span.ProfileTweet-actionCount"
                        ).attr("data-tweet-stat-count").replace(",", ""))
                    tweet.id = tweetPQ.attr("data-tweet-id")
                    tweet.permalink = 'https://twitter.com' + tweetPQ.attr(
                        "data-permalink-path")
                    tweet.author_id = int(
                        tweetPQ("a.js-user-profile-link").attr("data-user-id"))

                    dateSec = int(
                        tweetPQ("small.time span.js-short-timestamp").attr(
                            "data-time"))
                    tweet.date = datetime.datetime.fromtimestamp(
                        dateSec, tz=datetime.timezone.utc)
                    tweet.formatted_date = datetime.datetime.fromtimestamp(dateSec, tz=datetime.timezone.utc) \
                        .strftime("%a %b %d %X +0000 %Y")
                    tweet.mentions = " ".join(
                        re.compile('(@\\w*)').findall(tweet.text))
                    tweet.hashtags = " ".join(
                        re.compile('(#\\w*)').findall(tweet.text))

                    urls = []
                    for link in tweetPQ("a"):
                        try:
                            urls.append((link.attrib["data-expanded-url"]))
                        except KeyError:
                            pass

                    tweet.urls = ",".join(urls)

                    results.append(tweet)
                    resultsAux.append(tweet)

                    if receiveBuffer and len(resultsAux) >= bufferLength:
                        receiveBuffer(resultsAux)
                        resultsAux = []

                    batch_count_results += 1
                    if tweetCriteria.maxTweets > 0 and batch_count_results >= tweetCriteria.maxTweets:
                        active = False
                        break

            if receiveBuffer and len(resultsAux) > 0:
                receiveBuffer(resultsAux)
                resultsAux = []

        return results
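# A hypothetical usage sketch for collectTweets above (TweetCriteria is assumed
# to be the criteria class of the surrounding GetOldTweets-style module; the
# attribute names follow how they are read in the code):
# criteria = TweetCriteria()
# criteria.username = ['twitter']
# criteria.maxTweets = 10
# for t in TweetManager.collectTweets(criteria):
#     print(t.date, t.username, t.text)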
# Fetch the page with requests and inspect the response
import requests
response = requests.get('http://mp.weixin.qq.com/s/pLmuGoc4bZrMNl7MSoWgiA')
print(response)

# Extract the article body from the response with PyQuery
from pyquery import PyQuery
document = PyQuery(response.text)
content = document('#js_content').text()

# Use the stats_word tokenizer to pick the 100 most frequent words and join them into a string
import stats_word as sw
list_a = sw.stats_text(content,100)
a = ''
for i in range(len(list_a)) :
    a = a + str(list_a[i][0]) + ' ' + str(list_a[i][1]) + ', '
print(a)

# Send the result to the given mailbox with yagmail; getpass keeps the password hidden
import getpass
import yagmail
sender = input("Sender email address: ")
password = getpass.getpass("Sender email password: ")
recipients = input("Recipient email address: ")
# yagmail.register(sender, password)
# Avoid shadowing the yagmail module with the SMTP client instance
client = yagmail.SMTP(sender, password, host='smtp.163.com')
client.send(recipients, '19100205 lihaotian007', a)
Пример #46
0
    def test_theme_license_link(self):
        s = render('{{ license_link(lic) }}', {'lic': amo.LICENSE_COPYRIGHT})

        ul = PyQuery(s)('.license')
        assert ul.find('.icon').length == 1
        assert ul.find('.icon.copyr').length == 1

        text = ul.find('.text')
        assert text.find('a').length == 0
        assert text.text() == 'All Rights Reserved'

        s = render('{{ license_link(lic) }}', {'lic': amo.LICENSE_CC_BY_NC_SA})

        ul = PyQuery(s)('.license')
        assert ul.find('.icon').length == 3
        assert ul.find('.icon.cc-attrib').length == 1
        assert ul.find('.icon.cc-noncom').length == 1
        assert ul.find('.icon.cc-share').length == 1

        link = ul.find('.text a')
        assert link.find('a').length == 0
        assert link.text() == 'Some rights reserved'
        assert link.attr('href') == amo.LICENSE_CC_BY_NC_SA.url
Пример #47
0
 def _slice(res: str, index: int = 1) -> GoogleResponse:
     utf8_parser = HTMLParser(encoding="utf-8")
     d = PyQuery(fromstring(res, parser=utf8_parser))
     data = d.find(".g")
     pages = list(d.find("td").items())[1:-1]
     return GoogleResponse(data, pages, index)
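 # A hypothetical usage sketch for _slice above (GoogleResponse, fromstring and
 # HTMLParser are assumed to come from the surrounding module's imports, e.g.
 # lxml and the library's own response model):
 # resp = _slice(google_html, index=1)  # google_html: raw HTML of a results page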
    def get_xml_shareholder_info(self, share_xml):
        shareholder_info_dict = {}
        str_index = share_xml.find('<data>')
        if str_index < 0:
            return shareholder_info_dict

        shareholder_data = PyQuery(
            share_xml, parser='xml').find('data').find('data').items()
        lst_shareholder = []
        for data in shareholder_data:
            share_model = {
                GsModel.ShareholderInformation.SHAREHOLDER_NAME:
                data.find('inv').text(),
                GsModel.ShareholderInformation.SUBSCRIPTION_AMOUNT:
                util.get_amount_with_unit(data.find('liSubConAm').text()),
                GsModel.ShareholderInformation.PAIED_AMOUNT:
                util.get_amount_with_unit(data.find('liAcConAm').text()),
            }

            lst_sub = []
            sub_data = data.find('imInvprodetailList').find(
                'imInvprodetailList').items()
            for sub_detail in sub_data:
                sub_dict = {
                    GsModel.ShareholderInformation.SubscriptionDetail.SUBSCRIPTION_TYPE:
                    sub_detail.find('conFormCN').text(),
                    GsModel.ShareholderInformation.SubscriptionDetail.SUBSCRIPTION_AMOUNT:
                    util.get_amount_with_unit(
                        sub_detail.find('subConAm').text()),
                    GsModel.ShareholderInformation.SubscriptionDetail.SUBSCRIPTION_TIME:
                    sub_detail.find('conDate').text(),
                    GsModel.ShareholderInformation.SubscriptionDetail.SUBSCRIPTION_PUBLISH_TIME:
                    sub_detail.find('publicDate').text()
                }
                sub_dict = self.replace_none(sub_dict)
                lst_sub.append(sub_dict)
            share_model[
                GsModel.ShareholderInformation.SUBSCRIPTION_DETAIL] = lst_sub

            lst_paid = []
            paid_data = data.find('imInvactdetailList').find(
                'imInvactdetailList').items()
            for paid_detail in paid_data:
                paid_dict = {
                    GsModel.ShareholderInformation.PaiedDetail.PAIED_TYPE:
                    paid_detail.find('acConFormCn').text(),
                    GsModel.ShareholderInformation.PaiedDetail.PAIED_AMOUNT:
                    util.get_amount_with_unit(
                        paid_detail.find('acConAm').text()),
                    GsModel.ShareholderInformation.PaiedDetail.PAIED_TIME:
                    paid_detail.find('conDate').text(),
                    GsModel.ShareholderInformation.PaiedDetail.PAIED_PUBLISH_TIME:
                    paid_detail.find('publicDate').text(),
                }
                paid_dict = self.replace_none(paid_dict)
                lst_paid.append(paid_dict)
            share_model[GsModel.ShareholderInformation.PAIED_DETAIL] = lst_paid

            share_model = self.replace_none(share_model)
            lst_shareholder.append(share_model)
        shareholder_info_dict[
            GsModel.SHAREHOLDER_INFORMATION] = lst_shareholder
        return shareholder_info_dict
Пример #49
0
import requests
from pyquery import PyQuery
import pickle
from helper.move import *

moves = []

for x in range(1, 8):
    data = requests.get("https://pokemondb.net/move/generation/" + str(x))
    src = PyQuery(data.text)

    trs = src.find('.ent-name')
    length = len(moves)
    i = length
    for tr in trs:
        moves.append([])
        moves[i].append(tr.text)
        i += 1

    trs = src.find('.type-icon')
    i = length
    for tr in trs:
        moves[i].append(tr.text)
        i += 1

    trs = src.find('td:nth-child(3)')
    i = length
    for tr in trs:
        if tr.attrib["data-sort-value"] == "special":
            moves[i].append(1)
        elif tr.attrib["data-sort-value"] == "physical":
Пример #50
0
    'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    'accept-language': "en-US,en;q=0.9",
    'cache-control': "no-cache",
    'content-type': "application/x-www-form-urlencoded",
    'origin': "https://www.sgpbusiness.com",
    #'cookie': "__cfduid=dbd33060d5ca8c09500f853a22ccafcc11526030630; _ga=GA1.2.1937662851.1526030632; _gid=GA1.2.412763831.1526710953; sgpbizsess=a2504243407a7eb0a29ef82e447a5b0a43f37d1a; _gat=1",
    'referer': "https://www.sgpbusiness.com/",
    'upgrade-insecure-requests': "1",
    'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"
    }
f = open('sgp_out.txt','a')
li = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
for i in ['b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']:
    for j in li:
        for k in li:
            payload = "search_val={}".format(i+j+k)
            response = requests.request("POST", url, data=payload, headers=headers)
            
            pq = PyQuery(response.text)
            
            
            links = pq('.list-group a')
            print (i+j+k+' - '+str(len(links)))
            for l in links:
                #print (l.attrib['href'])
                f.write(l.attrib['href']+'\n')
                f.flush()
                
            #time.sleep(2)

Пример #51
0
def get_brand(code):
    q = PyQuery("https://kabutan.jp/stock/?code=7203")
    sector = q.find('#stockinfo_i2 > div > a')[0].text
    print(sector)
    print(code)
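# A minimal usage sketch (the ticker code is assumed to be passed as a string):
# get_brand("7203")  # prints the sector name, then the code itself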
Пример #52
0
 def fixLinks(text, parser):
     d = PyQuery(bytes(bytearray(text, encoding='utf-8')),
                 parser=parser)
     for element in d('link'):
         e = PyQuery(element)
         href = e.attr('href')
         if href:
             if href.find(domain) > -1:
                 new_href = href.split(domain)[-1]
                 new_href = '{}{}'.format(target_domain, new_href)
                 e.attr('href', new_href)
                 print "\t", "fixed link ", href, "=> ", new_href
     for element in d('a'):
         e = PyQuery(element)
         href = e.attr('href')
         if href:
             if href.find(domain) > -1:
                 new_href = href.split(domain)[-1]
                 e.attr('href', new_href)
                 print "\t", "Fixed ", href, "=> ", new_href
         if href and not abs_url_regex.search(href):
             new_href = re.sub(r'rss/index\.html$', 'rss/index.rss',
                               href)
             new_href = re.sub(r'/index\.html$', '/', new_href)
             e.attr('href', new_href)
             print "\t", href, "=>", new_href
     if parser == 'html':
         return d.html(method='html').encode('utf8')
     return d.__unicode__().encode('utf8')
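 # A hypothetical usage sketch for fixLinks above (domain, target_domain and
 # abs_url_regex are free variables expected to be defined in the enclosing
 # scope of this snippet):
 # fixed_html = fixLinks(open('index.html').read(), 'html')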
Пример #53
0
from pyquery import PyQuery

doc = PyQuery('<html>Hello</html>')
res = doc('html').text()
print(doc)
print(res)
Пример #54
0
 def get_content(self):
     url = reverse('addons.versions', args=[self.addon.slug])
     return PyQuery(self.client.get(url).content)
Пример #55
0
 def test_eula_with_contrib_roadblock(self):
     url = reverse('addons.eula', args=[11730, 53612])
     response = self.client.get(url, follow=True)
     doc = PyQuery(response.content)
     assert doc('[data-search]').attr('class') == 'install '
Пример #56
0
# coding=utf8
from pyquery import PyQuery
import requests
import sys
import yagmail
import getpass

sys.path.append(r"C:\Study\Programming\Python\Python_Data_Analysis")

from d11.stats_word import stats_text

content_url = "https://mp.weixin.qq.com/s/pLmuGoc4bZrMNl7MSoWgiA"
html_code = requests.get(content_url).text
document = PyQuery(html_code)
content = document("#js_content").text().replace("\n", "")

try:
    en_result, cn_result = stats_text("", content)
    # print(cn_result)
    smtp_host = "smtp.sina.com"
    sender = input("Please enter the sender's email address: ")
    password = getpass.getpass("Please enter the sender's email password: "******"Please enter the recipient's email address: ")

    yagmail.SMTP(user=sender, password=password,
                 host=smtp_host).send(recipient, "Cutted words",
                                      str(cn_result))
except ValueError as e:
    print("Exception catched.")
    print(e)
Пример #57
0
 def index_page(self):
     r = self.client.get("/")
     pq = PyQuery(r.content)
     link_elements = pq(".toctree-wrapper a.internal")
     self.toc_urls = [l.attrib["href"] for l in link_elements]
Пример #58
0
 def render(self, **kwargs):
     return PyQuery(install_button(self.context, self.addon, **kwargs))
Пример #59
0
#!/usr/bin/env python3

# date: 2019.08.12
# https://stackoverflow.com/questions/57454154/pyquery-wont-return-elements-on-a-page
# https://github.com/gawel/pyquery/issues/199

import requests
from pyquery import PyQuery

url = "http://www.floridaleagueofcities.com/widgets/cityofficials?CityID=101"
page = requests.get(url)

pq = PyQuery(page.text, parser="html")
for item in pq('li p'):
    print(item.text)

Пример #60
0
 def load_page(self, url=None):
     url = random.choice(self.toc_urls)
     r = self.client.get(url)
     pq = PyQuery(r.content)
     link_elements = pq("a.internal")
     self.urls_on_current_page = [l.attrib["href"] for l in link_elements]