Example #1
File: natgeo.py, Project: Granitas/wallme
 def download(self, **kwargs):
     """
     :param position - position of image or defaults to random
     :param category - archive category, see get_categories for the list
     :return: dict{'content': <image_content>, <some meta data>...}
     """
     category = kwargs.get('category', None)
     position = kwargs.get('position', 0)
     rand = False
     if position == 0:
         rand = True
     if position > 1:
         position -= 1  # since 0 is reserved reduce position
     if not category:
         category = self.default_cat
     category = category.lower()
     url = self.url_tpl(category=category)
     response = requests.get(url)
     sel = Selector(text=response.text)
     # get position
     total_items = int(sel.xpath("//p[@class='count']").re('\d+')[0])
     items = sel.xpath("//div[@id='search_results']//a[img]/@href").extract()
     items_per_page = len(items)
     # find the right image by position
     if rand:
         position = random.randrange(0, total_items)
     if position < items_per_page:
         image = items[position]
     else:
         page = int(math.ceil(position / items_per_page))
         position -= items_per_page * (page - 1)
         url = "{}?page={}".format(url, page)
         response = requests.get(url)
         pos_sel = Selector(text=response.text)
         items = pos_sel.xpath("//div[@id='search_results']//a[img]/@href").extract()
         image = items[position]
     # retrieve image
     response = requests.get(urljoin(url, image))
     sel = Selector(text=response.text)
     image_url = sel.xpath("//div[@class='primary_photo']/a/img/@src").extract_first()
     image_url = utils.fix_url_http(image_url)
     meta = {
         'url': image_url,
         'title': sel.xpath("//div[@class='primary_photo']/a/img/@alt").extract_first(),
         'desc_title': sel.xpath("//div[@id='caption']/h2/text()").extract_first(),
         'desc': sel.xpath("//div[@id='caption']/p[not(@class)]/text()").extract_first(),
         'author': sel.xpath("//div[@id='caption']/p[@class='credit']/a/text()").extract_first(),
         'publication_date': sel.xpath("//div[@id='caption']/p[@class='publication_time']"
                                       "/text()").extract_first(),
     }
     image = Image(image_url, meta)
     return self.process_url(image, kwargs)
Example #2
 def test_has_class_tab(self):
     body = u"""
     <p CLASS="foo\tbar">First</p>
     """
     sel = Selector(text=body)
     self.assertEqual(
         [x.extract() for x in sel.xpath(u'//p[has-class("foo")]/text()')],
         [u'First'])
Example #3
def getimgsrc(pin_id):
    url = 'http://huaban.com/pins/%s/' % pin_id
    z = requests.get(url,headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'})
    sel = Selector(text=z.text)
    jscode = sel.xpath("//script[contains(., 'app.page = app.page')]/text()").extract_first()
    parsed_js = js2xml.parse(jscode)
    for i in parsed_js.xpath('//property[@name="pins"]//property[@name="key"]/string/text()'):
    print('http://img.hb.aicdn.com/' + i)
def get_classes(html):
    doc = Selector(text=html)
    classes = set(doc.xpath('//*[@class]/@class').extract())
    result = set()
    for cls in classes:
        for _cls in cls.split():
            result.add(_cls)
    return result
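A possible usage of the get_classes helper above; a minimal sketch, assuming the function definition just shown plus a parsel import, with invented sample markup:

from parsel import Selector  # assumed to already be imported in the original module

sample_html = '<div class="card featured"><span class="card">x</span></div>'
# get_classes collects every class token found anywhere in the document
print(sorted(get_classes(sample_html)))  # ['card', 'featured']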
Example #5
File: natgeo.py, Project: Granitas/wallme
 def get_categories(self, response=None):
     if not response:
         response = requests.get(self.url)
     sel = Selector(text=response.text)
     categories = sel.xpath("//select[@id='search_category']"
                            "/option/text()").extract()
     categories = [c.split(' by ')[0].replace(' & ', '-')
                   for c in categories]
     return categories
def post(inputs):
    posted = []
    failed = []
    for week in inputs:
        try:
            data = urllib.request.urlopen(week).read()
        except urllib.error.URLError as e:
            failed.append(week)
            print(week)
            print(e.reason)
            continue  # skip weeks that could not be fetched
        if isinstance(data, bytes):
            data = data.decode("utf-8")
        hxs = Selector(text=data)
        posts = hxs.xpath('//ul[@class="archive"]/li/span[@class="channel markets_and_finance"]/following-sibling::h1/a/@href').extract()
        posted.append(posts)
    return posted
def mon(inputs):
    week = []
    errored_out = []
    for month in inputs:
        try:
            data = urllib.request.urlopen(month).read()
        except urllib.error.URLError as e:
            print(month)
            errored_out.append(month)
            print(e.reason)
            continue  # skip months that could not be fetched
        if isinstance(data, bytes):
            data = data.decode("utf-8")
        hxs = Selector(text=data)
        weeks = hxs.xpath('//ul[@class="weeks"]/li/a').re(r'http://www.businessweek.com/archive/\d+-\d+/news/day\d+\.html')
        week.append(weeks)
    return week
Example #8
 def test_has_class_simple(self):
     body = u"""
     <p class="foo bar-baz">First</p>
     <p class="foo">Second</p>
     <p class="bar">Third</p>
     <p>Fourth</p>
     """
     sel = Selector(text=body)
     self.assertEqual(
         [x.extract() for x in sel.xpath('//p[has-class("foo")]/text()')],
         [u'First', u'Second'])
     self.assertEqual(
         [x.extract() for x in sel.xpath('//p[has-class("bar")]/text()')],
         [u'Third'])
     self.assertEqual(
         [x.extract() for x in sel.xpath('//p[has-class("foo","bar")]/text()')],
         [])
     self.assertEqual(
         [x.extract() for x in sel.xpath('//p[has-class("foo","bar-baz")]/text()')],
         [u'First'])
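The two has-class tests above exercise parsel's built-in has-class() XPath function. A minimal standalone sketch of the same behaviour, with invented sample markup:

from parsel import Selector

html = '<p class="foo\tbar">A</p><p class="foo">B</p><p class="bar">C</p>'
sel = Selector(text=html)
# has-class() treats the class attribute as a whitespace-separated token list,
# so tabs and newlines count as separators just like spaces.
print(sel.xpath('//p[has-class("foo")]/text()').getall())         # ['A', 'B']
# with several arguments, every listed class must be present on the element
print(sel.xpath('//p[has-class("foo", "bar")]/text()').getall())  # ['A']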
Example #9
 def update_match_streams(self, matches: List[Match]) -> List[Match]:
     """Populate Match objects with stream urls"""
     updated = []
     for item in matches:
         # Populate stream data if match is live
         if not item['time_secs']:
             resp = self.session.get(item['url'])
             sel_detailed = Selector(text=resp.text)
             item['stream'] = sel_detailed.xpath("//div[@class='matches-streams']"
                                                 "/span[.//a[re:test(text(),'english', 'i')]]"
                                                 "//iframe/@src").extract_first()
             item['stream'] = clean_stream_url(item['stream'])
         updated.append(item)
     return updated
Example #10
    def test_set_xpathfunc(self):

        def myfunc(ctx):
            myfunc.call_count += 1

        myfunc.call_count = 0

        body = u"""
        <p CLASS="foo">First</p>
        """
        sel = Selector(text=body)
        self.assertRaisesRegexp(
            ValueError, 'Unregistered function in myfunc',
            sel.xpath, 'myfunc()')

        set_xpathfunc('myfunc', myfunc)
        sel.xpath('myfunc()')
        self.assertEqual(myfunc.call_count, 1)

        set_xpathfunc('myfunc', None)
        self.assertRaisesRegexp(
            ValueError, 'Unregistered function in myfunc',
            sel.xpath, 'myfunc()')
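Outside of the test above, registering a custom XPath function might look like the sketch below. The wordcount function and sample markup are illustrative assumptions; the import path for set_xpathfunc is assumed from parsel's xpathfuncs module, and passing None to unregister is exactly what the test exercises.

from parsel import Selector
from parsel.xpathfuncs import set_xpathfunc  # assumed import path

def wordcount(ctx, text):
    # ctx is the lxml evaluation context; the remaining XPath arguments follow it
    return float(len(text.split()))

set_xpathfunc('wordcount', wordcount)
sel = Selector(text='<p>one two three</p>')
print(sel.xpath('wordcount(string(//p))').get())  # '3.0'
set_xpathfunc('wordcount', None)  # unregister again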
Example #11
def main(argv=None, progname=None):
    parser = argparse.ArgumentParser(prog=progname, description=__doc__)
    parser.add_argument('expr', metavar='EXPRESSION',
                        help="A CSSexpression, or a XPath expression if --xpath is given.")
    parser.add_argument('file', metavar='FILE', nargs='?',
                        help="If missing, it reads the HTML content from the standard input.")
    parser.add_argument('--xpath', action='store_true',
                        help="Given expression is a XPath expression.")
    parser.add_argument('--re', metavar='PATTERN',
                        help="Apply given regular expression.")
    parser.add_argument('--encoding', metavar='ENCODING', default='utf-8',
                        help="Input encoding. Default: utf-8.")
    parser.add_argument('--repr', action='store_true',
                        help="Output result object representation instead of as text.")
    # TODO: Output this and parsel version.

    args = parser.parse_args(argv)

    if args.file:
        text = open(args.file).read()
    else:
        text = sys.stdin.read()

    if isinstance(text, six.binary_type):
        try:
            text = text.decode(args.encoding)
        except UnicodeDecodeError:
            parser.error("Failed to decode input using encoding: %s" % args.encoding)

    sel = Selector(text=text)

    if args.xpath:
        result = sel.xpath(args.expr)
    else:
        result = sel.css(args.expr)

    if args.re:
        regex = args.re.encode(args.encoding)
        regex = regex.decode('string_escape' if six.PY2 else 'unicode_escape')
        out = result.re(re.compile(regex, re.IGNORECASE | re.UNICODE))
    else:
        out = result.extract()

    if args.repr:
        pprint.pprint(out)
    else:
        print("\n".join(out))

    return 0
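The CLI above boils down to one Selector call plus an optional regex. A hedged sketch of that core branch on made-up input (not part of the parsel tool itself):

import re
from parsel import Selector

sel = Selector(text='<p id="price">Price: 42 USD</p>')
result = sel.css('p#price::text')        # default branch; sel.xpath(...) is used when --xpath is given
print(result.re(re.compile(r'\d+')))     # the --re branch -> ['42']
print(result.extract())                  # plain extraction -> ['Price: 42 USD']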
 def get_url(self, file_path):
     with open(file_path, encoding='utf8') as f:
         text = f.read()
     html = Selector(text, type='html')
     result = html.xpath("//div[@class='tocList__title clearIt']//a[@class='ref nowrap']/@href").getall()
     isbn = html.xpath("//span[@class='bookInfo__isbn__print']/text()").getall()
     eisbn = html.xpath("//span[@class='bookInfo__isbn__pdf']/text()").getall()
     pages = html.xpath("//div[@class='tocList__pages']/text()").getall()
     accept_date = html.xpath("//span[@class='conf-date']/text()").extract_first('')
     meeting_name = html.xpath("//h1[@class='bookInfo__title']/text()").extract_first('')
     meeting_place = html.xpath("//span[@class='conf-loc']//text()").extract_first('')
     if not result:
         return
     else:
         fw = open(r'D:\code\proceedings\big_json\20190812_1.big_json', 'a', encoding='utf-8')
         for i, item in enumerate(result):
             lists = []
             self.i +=1
             url = 'https://ascelibrary.org' + item
             name = re.findall('10.1061/(.*)', item)[0]
             page = pages[i]
             if eisbn != []:
                 if "ISBN (PDF)" in eisbn[0]:
                     eisbns = eisbn[0].replace("ISBN (PDF):","").replace("-","").strip()
                 else:
                     eisbns = ""
             else:
                 eisbns = ""
             if isbn != []:
                 if "ISBN (print)" in isbn[0]:
                     isbns = isbn[0].replace("ISBN (print):","").replace("-","").strip()
                 else:
                     isbns = ""
             else:
                 isbns = ""
             if meeting_place == "":
                 name_place = html.xpath("//div[@class='conf-date-loc']/text()").getall()[0]
                 temp = name_place.split("|")
                 meeting_place = temp[1]
                 accept_date = temp[0]
             lists = [url,name,isbns,eisbns,page,accept_date,meeting_name,meeting_place]
             self.write_info(lists,fw)
Example #13
    def extract(self, html):
        """Extract data from lethain.com."""

        selector = Selector(html)
        page_div = selector.xpath('//div[@class="page"]')
        text_div = selector.xpath('//div[@class="text"]')

        return {
            'titles': [page_div.xpath('string(.//h2)').extract_first()],
            'dates':
            [page_div.xpath('.//span[@class="date"]/text()').extract_first()],
            'descriptions':
            [' '.join(text_div.xpath('string(.//p)').extract())],
            'tags':
            page_div.xpath('.//span[@class="tag"]/a/text()').extract(),
            'images':
            text_div.xpath('.//img/@src').extract(),
        }
Example #14
def handler_detail_msm_sp(detail_urls_content, url):
    if '访问验证-安居客' not in detail_urls_content:
        # print(url)
        lat_lng = re.findall(r'lat: "(.*?)",.*?lng: "(.*?)"', detail_urls_content, re.S)
        real_lat_lng = lat_lng[0]
        xpath_css = Selector(text=detail_urls_content)
        house_facilities = xpath_css.xpath('//ul[@class="mod-peitao clearfix"]/li[not(contains(@class,"gray"))]')
        real_house_facilities = []
        for rs in house_facilities:
            one = rs.xpath('./p/text()').extract_first()
            real_house_facilities.append(one)
        sp_item = {}
        sp_houses = xpath_css.xpath('//*[@id="fy_info"]/ul/li')
        for house_msg in sp_houses:
            key1 = str(house_msg.xpath('./span[1]/text()').extract_first()).replace(':','')
            key = sp_house_config.get(house_msg.xpath('./span[1]/text()').extract_first().replace(':',''))
            print(str(house_msg.xpath('./span[2]').extract_first()).replace('\n','').replace(' ',''))
            sp_item[key1] = remove_tags(str(house_msg.xpath('./span[2]').extract_first()).replace('\n','').replace(' ',''))

        house_resources_l = xpath_css.xpath('//div[@class="itemCon clearfix"]/ul[@class="litem"]/li')
        for house_resource in house_resources_l:
            key1 = house_resource.xpath('./span[1]/text()').extract_first()
            key = sp_house_config.get(house_resource.xpath('./span[1]/text()').extract_first().replace(':',''))
            sp_item[key1] = remove_tags(str(house_resource.xpath('./span[2]').extract_first()).replace('\n','').replace(' ',''))
        house_resources_r = xpath_css.xpath('//div[@class="itemCon clearfix"]/ul[@class="ritem"]/li')
        for house_resource in house_resources_r:
            key1 = house_resource.xpath('./span[1]/text()').extract_first()
            key = sp_house_config.get(house_resource.xpath('./span[1]/text()').extract_first().replace(':',''))
            sp_item[key1] = remove_tags(str(house_resource.xpath('./span[2]').extract_first()))
        describes = xpath_css.xpath('//*[@id="xzl_desc"]/div').extract_first()
        real_describe = remove_tags(str(describes))
        shop_name = xpath_css.xpath('//div[@class="item-mod"]/h3/b/text()').extract_first().strip()
        print(shop_name)
        print(real_house_facilities)
        print(real_lat_lng)
        print(real_describe.strip())
        public_time = xpath_css.xpath('//*[@id="xzl_desc"]/h3/div/text()')[1].root
        house_number = xpath_css.xpath('//*[@id="xzl_desc"]/h3/div/text()')[2].root
        print(public_time, house_number)
        print(sp_item)
    else:
        print('有验证码')
Example #15
def getHTMLId(page):
    try:
        '''
        page: page number of the listing
        return: list of detail-page ids (a list of hrefs)
        '''
        # URL query parameters
        params = {'start': f'{25*page}'}
        r = requests.get(url, params=params, timeout=30)  # send the request
        r.raise_for_status()  # raise for a bad HTTP status
        # build an XPath-capable Selector from the response text
        selectors = Selector(r.text)
        # select the detail-page links with XPath; returns a list
        detail_urls = selectors.xpath(
            '//div[@class="hd"]/a/@href').getall()
        return list(set(detail_urls))  # deduplicate
    except:
        return ""
Example #16
    def thsmn_test(self):
        # 同花顺模拟
        thsmn_base_url = "http://t.10jqka.com.cn/trace/trade/getLastEnOrHold/?"
        search_people_url = "http://t.10jqka.com.cn/trace/?page={0}&order=weight&show=pic".format(
            1)
        thsmn_text = self.resp_text(url=search_people_url, url_name="同花顺模拟页")
        thsmn_se = Selector(thsmn_text)
        people_num = thsmn_se.xpath(
            "//div[@id='sortshowtable']/ul/li/@data-zid").getall()
        if not people_num:
            return "同花顺模拟获取用户账号出问题"
        data = {'zidStr': ','.join([each_num for each_num in people_num])}
        thsmn_url = thsmn_base_url + urlencode(data) + '.html'
        response_text = self.resp_text(url=thsmn_url, url_name="同花顺模拟url")
        json_moni = json.loads(response_text).get("isT")

        if json_moni != True:
            return "同花顺模拟出问题"
Example #17
def parse_content(node: Selector):
    s = ''
    for item in node.xpath('./node()'):
        if isinstance(item.root, str):
            s += item.root.strip()
        elif isinstance(item.root, lxml.html.HtmlElement):
            if item.root.tag == 'img':
                alt = item.root.attrib.get('alt', '')
                if bgm_face.match(alt):
                    s += alt
                    continue
            elif item.root.tag == 'div':
                s += unicodedata.normalize(
                    'NFKD',
                    item.get().strip(),
                ).replace('<br>\r\n', '\n')
            else:
                s += item.get().strip()
            # continue
            # if item.root.tag == 'br':
            #     s += '\n'
            # elif item.root.tag == 'img':
            #     alt = item.root.attrib.get('alt', '')
            #     if bgm_face.match(alt):
            #         s += alt
            #         continue
            #     link = item.root.attrib.get('src')
            #     s += f'[img]{link}[/img]'
            # elif item.root.tag == 'a':
            #     href = (item.root.attrib['href'])
            #     text = (item.xpath('./text()').extract_first())
            #     if href == text:
            #         s += f'[url]{href}[/url]'
            #     else:
            #         s += f'[url={href}]{text}[/url]'
            # elif item.root.tag == 'span':
            #     text = item.xpath('./text()').extract_first()
            #     for key, value in SPAN_BACK_MAP.items():
            #         if key in item.root.attrib.get('style'):
            #             s += f'[{value}]{text}[/{value}]'
            #             break
            # else:
            #     raise ValueError(item.root.tag + ' is not impl ed yet')
    return s
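parse_content above relies on ./node() yielding Selectors whose .root is either a plain string (for text nodes) or an lxml HtmlElement (for elements). A small sketch of that behaviour on invented markup:

from parsel import Selector

sel = Selector(text='<div>hello <b>world</b></div>')
for item in sel.xpath('//div/node()'):
    # text nodes come back as str subclasses, elements as lxml HtmlElement objects
    print(isinstance(item.root, str), repr(item.get()))
# True 'hello '
# False '<b>world</b>'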
Example #18
 def download(self, **kwargs):
     """
     Download and set image from wallhaven.cc
     :param position - position of image to choose from listed from 1 to 24,
     default is 0 = random.
     :param categories - categories to download from in 000 format, where every number
     represents binary for [general, anime, people] list.
     :param purity - purity of content in 000 format, where every number
     represents binary for [sfw, sketchy, _].
     :param sorting - sorting type from available see WallhavenDownloader.sorting_types .
     """
     # Make url from arguments
     order = 'desc'
     categories = kwargs.get('categories', '')
     purity = kwargs.get('purity', '')
     sorting = kwargs.get('sorting', '')
     page, position, rand = self._make_position(kwargs.get('position', 0))
     url = self.base_url
     for arg in ['categories', 'purity', 'sorting', 'order', 'page']:
         value = locals()[arg]
         if value:
             url = add_or_replace_parameter(url, arg, locals()[arg])
     # Download and parse items
     resp = requests.get(url)
     if resp.status_code != 200:
         self.logger.error('Failed to download image list {}'.format(resp.url))
         return
     list_sel = Selector(text=resp.text)
     items = list_sel.xpath("//section[@class='thumb-listing-page']//figure/a/@href").extract()
     item = random.choice(items) if rand else items[position - 1]
     resp = requests.get(item)
     if resp.status_code != 200:
         self.logger.error('Failed to download image page {}'.format(resp.url))
         return
     sel = Selector(text=resp.text)
     image_url = sel.xpath("//img[@id='wallpaper']/@src").extract_first()
     meta = {
         'id': sel.xpath("//img[@id='wallpaper']/@data-wallpaper-id").extract_first(),
         'tags': sel.xpath("//ul[@id='tags']//li/a/text()").extract(),
         'views': sel.xpath("//dt[contains(text(),'Views')]/following-sibling::dd[1]/text()").extract_first(),
         'favorites': sel.xpath("//dt[contains(text(),'Favorites')]"
                                "/following-sibling::dd[1]//text()").extract_first(),
         'res': sel.xpath("//h3/text()").extract_first(),
     }
     image = Image(image_url, meta)
     return self.process_url(image, kwargs)
Example #19
 def get_problem(self, remote_oj, remote_problem):
     url = 'http://acm.zucc.edu.cn/problem.php?id={}'.format(remote_problem)
     res = self.request.get(url=url)
     selector = Selector(res.text)
     title = selector.xpath(
         '/html/body/div[1]/div[2]/div[1]/center/h3/text()').get('').split(
             ':')[1].strip()
     data = {
         'time_limit':
         float(
             selector.xpath(
                 '/html/body/div[1]/div[2]/div[1]/center/span[2]/span/text()'
             ).get('').strip()),
         'memory_limit':
         float(
             selector.xpath(
                 '/html/body/div[1]/div[2]/div[1]/center/text()[2]').get(
                     '').replace('MB', '').strip()),
         'description':
         selector.xpath(
             '/html/body/div[1]/div[2]/div[2]/div[1]/div[2]/text()').get(
                 '').strip(),
         'input':
         selector.xpath(
             '/html/body/div[1]/div[2]/div[2]/div[2]/div[2]/text()').get(
                 '').strip(),
         'output':
         selector.xpath(
             '/html/body/div[1]/div[2]/div[2]/div[3]/div[2]/text()').get(
                 '').strip(),
         'sample_input':
         selector.xpath('//*[@id="sampleinput"]/text()').get('').replace(
             '\r', '').strip(),
         'sample_output':
         selector.xpath('//*[@id="sampleoutput"]/text()').get('').replace(
             '\r', '').strip()
     }
     return {
         'title': title,
         'description': self.problem_format.format(**data)
     }
    def THS_DATA(self) -> Dict:
        # date cutoff: today's date (10jqka / Tonghuashun)
        time_test = time.strftime("%Y-%m-%d", time.localtime())
        # dict mapping stock name -> net amount
        name_jinge_dict = {}

        ths_response_list = self.__r.spider_ths()
        for each_ths_response in ths_response_list:
            se = Selector(each_ths_response)
            for each_table in se.xpath("//div[@class='zdph']/table"):
                date = each_table.xpath("//td[1]/text()").get()
                stock_name = each_table.xpath("//td[2]/a/text()").get()
                jinge = each_table.xpath("//td[7]/text()").get()
                if date != time_test:
                    pass
                else:
                    name_jinge_dict[stock_name] = jinge

        return name_jinge_dict
Example #21
File: get_city.py, Project: qybing/PMI
def get_xinpan_detail(start_url_content):
    detail_urls_content = start_url_content
    if '访问验证-安居客' not in detail_urls_content:
        # lat_lng = re.findall(r'lat: "(.*?)",.*?lng: "(.*?)"', detail_urls_content, re.S)
        # real_lat_lng = lat_lng[0]
        xpath_css = Selector(text=detail_urls_content)
        item = {}
        # if 'zu' in url:
        house_msgs_l = xpath_css.xpath('//*[@id="container"]/div[1]/div[1]/div/div[2]/ul/li')[:-2]
        for house_msg in house_msgs_l:
            key1 = house_msg.xpath('./div[1]/text()').extract_first()
            if '楼盘特点' in key1:
                item[key1] = [ i for i in str(remove_tags(str(house_msg.xpath('./div[2]').extract_first()).replace('\n', ''))).strip().split(' ') if i]
            else:
            # key = house_config.get(house_msg.xpath('./span[1]/text()').extract_first())
                item[key1] = remove_tags(str(house_msg.xpath('./div[2]').extract_first()).replace('\n', '').replace(' ', ''))
        print(item)
    else:
        print('有验证码')
Example #22
async def get_urls_in_playlist(session, playlist_url=''):
    """get each url of videos in playlist"""
    try:
        payload = await get_post_args(session)
        payload['playlist'] = playlist_url
    except:
        raise ExtractException('failed: extract playlist')

    async with session.post(
            "http://www.downvids.net/videoflv.php",
            proxy='socks5://127.0.0.1:1080',
            data=payload,  # todo: may need data=json.dumps(payload) instead
    ) as res:
        text = await res.text()
        selector = Selector(text=text)
        video_urls = selector.xpath(
            "//span[@class='thumb vcard author']/a/@href").extract()
        for url in video_urls:
            yield url
Example #23
def extract_character_names(sel: Selector) -> Iterable[AnimeCharacter]:
    """Extact the names of the anime characters."""
    # pylint: disable=line-too-long
    maybe_name_anchors = \
        sel.xpath("(//h2[contains(., 'Characters')]/following-sibling::div)[1]//a[not(./img)]")
    for maybe_name_anchor in maybe_name_anchors:
        href = maybe_name_anchor.attrib["href"]
        match = re.search(r"/character/\d+/(?P<name>[^/]+)$", href)
        if not match:
            continue
        name = re.sub(r"_+|\s+", " ", match.group("name")).strip()
        # pylint: disable=line-too-long
        role = get_all_text(
            maybe_name_anchor.xpath("./following-sibling::div/small"))
        if re.search(r"main", role, flags=re.IGNORECASE):
            role = "main"
        elif re.search(r"support(ing)?|secondary", role, flags=re.IGNORECASE):
            role = "secondary"
        yield AnimeCharacter(name=name, url=href, role=role)
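Example #23 reads the href through Selector.attrib rather than an XPath @href step; a tiny sketch of that accessor on invented markup:

from parsel import Selector

sel = Selector(text='<a href="/character/1/Foo_Bar">Foo Bar</a>')
anchor = sel.xpath('//a')[0]
print(anchor.attrib["href"])   # '/character/1/Foo_Bar'
print(dict(anchor.attrib))     # {'href': '/character/1/Foo_Bar'}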
Example #24
 def _get_top_100(self):
     headers = {
         'User-Agent':
         'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0'
     }
     url = "http://www.gutenberg.org/browse/scores/top"
     response = requests.get(url, headers=headers)
     se = Selector(response.text)
     books = []
     for item in se.xpath(
             "//h2[@id='books-last1']/following-sibling::ol[1]/li"):
         href = item.xpath("./a/@href").extract_first()
         id = href.split("/")[-1]
         books.append({
             "Title": item.xpath("./a/text()").extract_first(),
             "Url": "http://www.gutenberg.org" + href,
             "GutenbergId": id
         })
     return books
    def _parse_spellcheck_items(cls, response):
        selector = Selector(text=response.text)
        content_part = selector.xpath(
            "/html/body/div[2]/div/div/div[2]/div[3]/div[1]/div[1]")

        return {
            'title':
            ''.join(content_part.css('h1 ::text').getall()),
            'description': [
                ''.join(p.css('::text').getall())
                for p in content_part.css('* > p')
            ],
            'recommend list': [{
                'word':
                ''.join(li.css('span ::text').getall()),
                'link':
                None if response.ok else li.xpath('a/@href').get()
            } for li in content_part.css('* > ul > li')]
        }
Example #26
async def detail(**kwargs):
    session = kwargs['session']
    next_url = kwargs['next_url']
    title = kwargs['title']
    print(next_url)
    print(title)
    async with session.get(
            url=next_url,
            headers=HEADERS,
            proxy=PROXY_STR,
    ) as response:
        text = await response.text()
        resp = Selector(text=text)
        nodes = resp.xpath('//div[@class="kl2-1"]//img/@src').extract()
        nodes = list(set(nodes))
        for img in nodes:
            # print(img)
            await download_img(session=session, url=img, title=title)
            print('next image')
Example #27
    def __init__(self, link:str):
        "ürün detayını trendyol'den dızlar."

        kaynak = "trendyol.com"

        if link.startswith('https://m.'):
            url = link.replace('https://m.', 'https://')
        elif link.startswith('https://ty.gl'):
            try:
                kisa_link_header = requests.get(link, headers=self.kimlik, allow_redirects=False).headers['location']
                url = self.ayristir("adjust_redirect=", "&adjust_t=", unquote(kisa_link_header))
            except KeyError:
                return None
        else:
            url = link

        try:
            istek = requests.get(url, headers=self.kimlik, allow_redirects=True)
        except requests.exceptions.ConnectionError:
            return None

        secici = Selector(istek.text)

        # affiliate = "https://tr.rdrtr.com/aff_c?offer_id=3107&aff_id=24172&url=" + quote(url) + "%26utm_source%3Daff_t%26utm_medium%3Dcps%26utm_campaign%3Dgelirortaklari%26utm_subaff%3D{aff_id}%26adjust_tracker%3D21ouxa_bfy1cc%26adjust_campaign%3Dperformics_tr%26adjust_adgroup%3D1%26adjust_label%3D{transaction_id}"

        try:
            trendyol_veri = {
                "link"       : url.split('?')[0],
                "marka"      : secici.xpath("//h1[@class='pr-new-br']/a/text()").get().strip() if secici.xpath("//h1[@class='pr-new-br']/a/text()").get() else secici.xpath("//h1[@class='pr-new-br']/text()").get().strip(),
                "baslik"     : secici.xpath("//h1[@class='pr-new-br']/span/text()").get().strip(),
                "resim"      : secici.xpath("//img[@class='ph-gl-img']/@src").get(),
                "gercek"     : secici.xpath("//span[@class='prc-org']/text()").get(),
                "indirimli"  : secici.xpath("//span[@class='prc-slg prc-slg-w-dsc']/text()").get() or secici.xpath("//span[@class='prc-slg']/text()").get(),
                "kampanya"   : secici.xpath("//div[@class='pr-bx-pr-dsc']/text()").get(),
                "son_fiyat"  : secici.xpath("//span[@class='prc-dsc']/text()").get(),
                "yorumlar"   : self.trendyol_yorum(url),
                # "link"       : self.link_kisalt.tinyurl.short(url.split('?')[0])
            }
        except AttributeError:
            trendyol_veri = None

        kekik_json = {"kaynak": kaynak, 'veri' : trendyol_veri}

        self.kekik_json  = kekik_json if kekik_json['veri'] != [] else None
        self.kaynak      = kaynak
Example #28
    def parse_interval(self, response):

        interval_rides = "jQuery('#interval-rides').html(\""

        lines = response.text.split('\n')
        for line in lines:
            if line.startswith(interval_rides):
                content = line[len(interval_rides):-3]
                content = content.replace("\\n", "")
                content = content.replace("\\'", "'")
                content = content.replace('\\"', '"')
                content = content.replace("\\\\&quot;", "'")

                selector = Selector(text=content)
                activities = selector.xpath(
                    '//div[@class="content react-feed-component"]//@data-react-props'
                ).extract()
                for activity in activities:
                    # Read the JSON representation of the activity
                    try:
                        activity = activity.replace("\\", "")
                        activity_json = json.loads(activity)
                        if "activity" in activity_json:
                            # Check if it's backcountry skiing and located in New England.
                            if activity_json["activity"][
                                    "type"] == "BackcountrySki":
                                activity_location = activity_json["activity"][
                                    "timeAndLocation"]["location"]
                                if any(state in activity_location
                                       for state in self.states):
                                    # Get the activity ID
                                    activity_id = activity_json["activity"][
                                        "id"]
                                    request = scrapy.Request(
                                        url=
                                        f"https://www.strava.com/activities/{activity_id}",
                                        dont_filter=True,
                                        callback=self.parse_activity)
                                    yield request

                    except:
                        self.logger.error(activity)
Example #29
 def parse_html(self, message):
     utils.printf('%s:解析起始页开始...' % self.provider)
     conn = utils.init_db('mysql', 'aiaajournal', 2)
     result = []
     stmt = 'insert ignore into journal(journal_name,url,eissn,cover_url,active) Values(%s,%s,%s,%s,%s)'
     active = 0
     cnt = 0
     for filename, fullname in utils.file_list(self.html_path):
         if filename == 'active.html':
             active = 1
         else:
             active = 0
         with open(fullname, encoding='utf8') as f:
             text = f.read()
         try:
             sel = Selector(text=text)
             for liTag in sel.xpath('//li[@class="search-item clearfix"]'):
                 journal_name = liTag.xpath(
                     './div/h4/a/text()').extract_first().strip()
                 url = liTag.xpath(
                     './div/h4/a/@href').extract_first().replace(
                         'journal', 'loi')
                 eissn = liTag.xpath(
                     './div/div/div/span[@class="meta__eissn"]/text()'
                 ).extract_first().replace('eISSN: ', '').strip()
                 cover_url = liTag.xpath(
                     './div/a/img/@src').extract_first().strip()
                 result.append(
                     (journal_name, url, eissn, cover_url, active))
             utils.printf(len(result))
         except:
             exMsg = '* ' + traceback.format_exc()
             print(exMsg)
             utils.logerror(exMsg)
             utils.logerror(fullname)
             return
     utils.parse_results_to_sql(conn, stmt, result)
     cnt += len(result)
     utils.printf(cnt)
     conn.close()
     utils.printf('%s:解析起始页完成...' % self.provider)
     self.senddistributefinish('startdown_index')
    def spider_thsmn(self) -> Any:
        base_url = "http://t.10jqka.com.cn/trace/trade/getLastEnOrHold/?"
        time_test = time.strftime("%Y%m%d", time.localtime())

        stock_name_list = []  # holds stock names
        stock_BS_list = []  # holds buy/sell flags
        stock_BS_times_list = []  # holds share counts

        # http://t.10jqka.com.cn/trace/trade/getLastEnOrHold/?zidStr=60503016,44983608,36010761,56395121,58626061,40787461,47391107,62884256,25869277,37401557
        # collect account ids
        for page in range(1, 4):
            search_people_url = "http://t.10jqka.com.cn/trace/?page={0}&order=weight&show=pic".format(
                page)
            text = self.resp_text(search_people_url)
            se = Selector(text)
            people_num = se.xpath(
                "//div[@id='sortshowtable']/ul/li/@data-zid").getall()
            data = {'zidStr': ','.join([each_num for each_num in people_num])}
            new_url = base_url + urlencode(data) + '.html'
            # make the second request
            response_text = self.resp_text(new_url)
            json_moni = json.loads(response_text)
            result_moni = json_moni.get("result")

            for each_count_num in people_num:
                each_stock_mesg = result_moni.get(each_count_num)
                if each_stock_mesg:
                    # order submission date
                    updata_time = each_stock_mesg['wtrq']
                    # stock name
                    stock_name = each_stock_mesg['zqmc']
                    # buy or sell
                    BS = each_stock_mesg['mmlb']
                    # number of shares
                    stock_times = each_stock_mesg['wtsl']
                    if updata_time == time_test:
                        stock_name_list.append(stock_name)
                        stock_BS_list.append(BS)
                        stock_BS_times_list.append(stock_times)
                    else:
                        pass
        return stock_name_list, stock_BS_list, stock_BS_times_list
Example #31
 def parse_html(self, message):
     utils.printf('%s:解析起始页开始...' % self.provider)
     conn = utils.init_db('mysql', 'pishuinfo', 4)
     result = []
     stmt = 'insert ignore into video(video_id,stat) Values(%s,%s)'
     cnt = 0
     for filename, fullname in utils.file_list(self.html_path):
         with open(fullname, encoding='utf8') as f:
             text = f.read()
         sel = Selector(text=text)
         for href in sel.xpath('//*[@id="TitleList"]/div/a/@href'):
             video_id = href.re('.*ID=(\d+)&isHost=.*')[0]
             result.append((video_id, 0))
             utils.printf(len(result))
     utils.parse_results_to_sql(conn, stmt, result)
     cnt += len(result)
     utils.printf(cnt)
     conn.close()
     utils.printf('%s:解析起始页完成...' % self.provider)
     self.senddistributefinish('startdown_index')                  
Example #32
def lhc_wiki_events():
    LHC_WIKI_EVENTS_URL = "https://lhc.net.br/wiki/Categoria:Eventos"
    response = requests.get(LHC_WIKI_EVENTS_URL)
    selector = Selector(text=response.text)
    raw_events = selector.xpath(
        "//script[contains(text(), 'window.eventCalendarData.push')]/text()"
    ).re_first(r"window.eventCalendarData.push\((.*)\)")
    events = json.loads(raw_events)

    lhc_events = []
    for event_data in events:
        event = Event(
            name=event_data.get("title"),
            begin=event_data.get("start"),
            end=event_data.get("end"),
            url=urljoin("https://lhc.net.br", event_data.get("url", "")),
            location="Laboratório Hacker de Campinas",
        )
        lhc_events.append(event)
    return lhc_events
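The interesting part above is pulling JSON out of an inline <script> with .re_first(). A minimal sketch of that step alone, on invented sample markup:

import json
from parsel import Selector

html = '<script>window.eventCalendarData.push([{"title": "Meetup"}])</script>'
sel = Selector(text=html)
# grab the script text, then capture the argument of the push(...) call
raw = sel.xpath('//script/text()').re_first(r'window.eventCalendarData.push\((.*)\)')
print(json.loads(raw))  # [{'title': 'Meetup'}]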
Example #33
 def getimagelist_href(link: str):
     __image_list = []
     try:
         print("Collecting images in the link: " + link)
         __response = requests.get(link, timeout=10)
         __selector = Selector(__response.text)
         if __response.status_code == 200:
             __image_list = __selector.xpath('//img/@src').getall()
             print("Images collected!")
     except Exception as exp:
         print("Error in the link")
     __new_list = []
     for i in __image_list:
         if not (i[0:1] == "/"):
             __new_list.append(link + "/" + i)
         else:
             __new_list.append(link + i)
     print("Done!")
     print(__new_list)
     return __new_list
Example #34
def get_global_data_BSV():
    """
    拿到bsv价格以及 每T/1天的收益
    """
    ##TODO:需要判断是否为btc...其他的币 需要别的获取方法...
    logger.info("爬取bsv每T每天的收益")
    url = "https://explorer.viawallet.com/bsv"
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
    }
    z = requests.get(url, headers=headers, timeout=60)
    sel = Selector(text=z.text)
    jscode = sel.xpath(
        '//script[contains(.,"coin_per_t_per_day")]/text()'
    ).extract_first()
    parse_js = js2xml.parse(jscode)
    mining_payoff_btc = float(
        parse_js.xpath('//*[@name="coin_per_t_per_day"]/string/text()')[0].strip()
    )
    return mining_payoff_btc
Example #35
File: scrapy.py, Project: Hope-wind/python
    async def parse_data(self, session, html):
        '''Process the page data.'''
        selector = Selector(html)
        result_list = selector.xpath('//a[@class="col-xs-6 col-sm-3"]')

        for result in result_list:
            img_url = result.xpath('./img/@data-original').extract_first()
            img_title = result.xpath('./img/@alt').extract_first()

            all_title = img_title + '.' + img_url.split('.')[-1]

            content = await self.fetch_img(session, img_url)

            try:
                with open(path + "\\" + all_title, mode='wb') as f:
                    print("下载完成:", all_title)
                    f.write(content)

            except Exception as e:
                print(e)
Example #36
def generate_sections(data):
    sections = []

    if len(data) > 0:
        sections.append(Section(index=0, title="Todos", subsections=[]))
        for index, element in enumerate(data):
            sel = Selector(text=element)
            # TODO: check whether this xpath returns the child; if it does not, use just "h4/text()"
            title_text = sel.xpath("//h4/text()").get()
            if title_text is not None and title_text != "":
                subsections = generate_subsections(element)
                section = Section(
                    index=index + 1,
                    title=title_text,
                    subsections=subsections,
                    source=data,
                )
                sections.append(section)

    return sections
Example #37
 def parse_oddsList(self, response):
     # #team_fight_table tr[class!=LotteryListTitle
     play_list = response.css(
         "#team_fight_table tr:not(.LotteryListTitle)").extract()
     for p in play_list:
         play_sel = Selector(text=p)
         # get the matchid
         matchid = play_sel.xpath("//@matchid").extract_first()
         # build the odds url
         odds_url = "/soccer/match/" + matchid + "/odds/"
         playInfo = copy.deepcopy(response.meta["playInfoObj"])
         playInfo["id"] = matchid
         playInfo["play_urls"] = odds_url
         yield scrapy.Request(url=self.base_url + odds_url,
                              headers=self.headers,
                              meta={
                                  'cookiejar': response.meta['cookiejar'],
                                  "playInfoObj": playInfo
                              },
                              callback=self.parse_playInfo)
Example #38
def scrap_profiles(driver):
    """
    select the profile from the webpage and scrap it.
    """
    try:
        sel = Selector(text=driver.page_source)
        root = driver.find_element_by_class_name("pv-top-card")
        name = root.find_elements_by_xpath(
            "//section/div/div/div/*/li")[0].text.strip()
        job_title = sel.xpath('//h2/text()').getall()[1]
        ln_url = driver.current_url

        # upsert to Employee Model
        Employee.objects.get_or_create(name=name,
                                       designation=job_title.strip(),
                                       company='Mambu')
        time.sleep(5)
    except:
        print('failed to scrape profile')
        pass
Example #39
def crawl_qoo10(keyword, num=10):
    url = 'https://www.qoo10.sg/s/' + keyword + '?keyword=' + keyword + '&keyword_auto_change='
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    parser = soup.find_all('div', attrs={"class": "bd_lst_item"})
    itemlist = parser[1].select('tr')
    itemlist = itemlist[2:-1]

    count = 0
    finalitemlist = []

    for item in itemlist:
        selector = Selector(str(item))
        href_links = selector.xpath('//a/@href').getall()
        for i in href_links:
            if str(i) != '#none' and str(i) != "#":
                link = str(i)
                break

        titles = str(item).split('title="')[1:]
        for j in titles:
            formatted = j.split('"')[0]
            if str(formatted) != 'Click to Play Video':
                title = str(formatted)
                break

        price = item.select('.prc')
        price = str(price).split('strong>')[1][:-2]
        price = price[2:]
        price = price.replace(',', '')

        finalitemlist.append((title, float(price), link))
        count += 1

        if count == num:
            break

    sorted_itemList = sorted(finalitemlist, key=lambda x: x[1])

    return sorted_itemList
Example #40
 def parse_type(self, response):
     # parse the match type
     logging.debug(response.url)
     sch_type = response.css("div#m_id").extract()
     type_len = len(sch_type)
     if type_len == 0:
         # no match type present
         scheduleInfo = copy.deepcopy(response.meta["scheduleInfoObj"])
         logging.debug(scheduleInfo)
         scheduleInfo["sch_type"] = "无"
         scheduleInfo["id"] = response.meta["scheduleInfoObj"]["id"] + "_0"
         yield scrapy.Request(url=buildRandomUrl(response.url),
                              headers=self.headers,
                              meta={
                                  'cookiejar': 1,
                                  "scheduleInfoObj": scheduleInfo
                              },
                              callback=self.parse_group)
     else:
         for type in sch_type:
             #
             scheduleInfo = copy.deepcopy(response.meta["scheduleInfoObj"])
             logging.debug(scheduleInfo)
             #
             logging.debug("sch_type html: " + type)
             sch_type_sel = Selector(text=type)
             name = sch_type_sel.css("a::text").extract_first()
             if name == None or name == "" or name == "null":
                 continue
             url = sch_type_sel.xpath("//a/@href").extract_first()
             scheduleInfo["sch_type"] = name
             scheduleInfo[
                 "id"] = response.meta["scheduleInfoObj"]["id"] + "_" + name
             #
             yield scrapy.Request(url=self.base_url + url,
                                  headers=self.headers,
                                  meta={
                                      'cookiejar': 1,
                                      "scheduleInfoObj": scheduleInfo
                                  },
                                  callback=self.parse_group)
Example #41
 def parse_html(self, message):
     utils.printf('%s:解析起始页开始...' % self.provider)
     conn = utils.init_db('mysql', 'ydylcnbook', 4)
     result = []
     stmt = 'insert ignore into book(bookid,cover_url) Values(%s,%s)'
     cnt = 0
     for filename, fullname in utils.file_list(self.html_path):
         with open(fullname, encoding='utf8') as f:
             text = f.read()
         sel = Selector(text=text)
         for aTag in sel.xpath('//ul[@class="list-book-1"]/li/a'):
             bookid = aTag.xpath('./@href').extract_first().split('=')[-1]
             cover_url = aTag.xpath('./div/div/img/@src').extract_first()
             result.append((bookid, cover_url))
             utils.printf(len(result))
     utils.parse_results_to_sql(conn, stmt, result)
     cnt += len(result)
     utils.printf(cnt)
     conn.close()
     utils.printf('%s:解析起始页完成...' % self.provider)
     self.senddistributefinish('startdown_list')
Example #42
async def main():
    async with aiohttp.ClientSession() as session:
        response = await session.get(
            'https://kartochki-domana.com.ua/ru/product-category/podarochnie-nabori/'
        )
        html = await response.text()

        sel = Selector(text=html)
        prod_urls = sel.xpath('//h3[@class="product_title"]/a/@href').getall()

        for prod_url in prod_urls:
            if prod_url not in added_prod_urls:
                await queue.put(prod_url)
                added_prod_urls.add(prod_url)

        print(queue.qsize())
        # pprint(added_prod_urls)

        tasks = []
        for _ in range(50):
            task = asyncio.Task(worker(session))
            tasks.append(task)
        await asyncio.gather(*tasks)
Example #43
 def _find_match(self, sel: Selector) -> Match:
     xpath = lambda x: sel.xpath(x).extract_first(default='').strip()
     item = Match()
     item['url'] = urljoin(self.url_base, xpath(".//a/@href"))
     item['id'] = (re.findall('matches/(\d+)', item['url']) or [None])[0]
     item['game'] = next((g for g in self.games if g in item['url'].lower()))
     item['time'] = xpath("td[@class='status']/span/text()")
     item['time_secs'] = time_to_seconds(item['time'])
     item['timestamp'] = int((datetime.now() + timedelta(item['time_secs'])).timestamp())
     item['t1'] = xpath(".//span[contains(@class,'opp1')]/span/text()")
     item['t1_country'] = xpath(".//span[contains(@class,'opp1')]/span[contains(@class,'flag')]/@title")
     item['t1_country_short'] = xpath(".//span[contains(@class,'opp1')]"
                                      "/span[contains(@class,'flag')]/@class").split()[-1]
     item['t2'] = xpath(".//span[contains(@class,'opp2')]/span/text()")
     item['t2_country'] = xpath(".//span[contains(@class,'opp2')]/span[contains(@class,'flag')]/@title")
     item['t2_country_short'] = xpath(".//span[contains(@class,'opp2')]"
                                      "/span[contains(@class,'flag')]/@class").split()[-1]
     scores = sel.css('.score::text').extract()
     item['t1_score'] = scores[0] if scores else None
     item['t2_score'] = scores[1] if len(scores) > 1 else None
     return item
if __name__ == '__main__':
    print("in main")
    totalWeeks = []
    totalPosts = []
    url = 'http://www.businessweek.com/archive/news.html#r=404'
    data = urllib.request.urlopen(url).read()
    data = data.decode("utf-8") 
    sel = Selector(text=data)
    months = sel.xpath('//ul/li/a').re('http://www.businessweek.com/archive/\\d+-\\d+/news.html')
    #admittMonths = 12*(2015-1991) + 8
    m=[]
    for i in months:
        m.append([i])
    totalWeeks = []
    pool = Pool(8)
    totalWeeks= pool.map(mon,m)
    totalWeeks = [ent for sublist in totalWeeks for ent in sublist]
    print (len(totalWeeks))
    #club = [ent for sublist in totalWeeks for ent in sublist]
    #print (len(club))
    club = [ent for sublist in totalWeeks for ent in sublist]
    print (len(club))
    d=[]
    for i in club:
         pass
     except urllib.error.HTTPError:
         pass
     except timeout:
         pass
 else:
     fail.append(s[i]) 
     print ("failed to retive info from ",s[i],i)
     flag = True
 if flag ==True:
     pass
 else:
     clap = response.read()
     clap = clap.decode("utf-8") 
     h = Selector(text=clap)
     date = h.xpath('//meta[@content][@name="pub_date"]/@content').extract()
     if date:
         pass
     else:
         date = h.xpath('//meta[@content][@name="parsely-pub-date"]/@content').extract()
     key = h.xpath('//meta[@content][@name="keywords"]/@content').extract() 
     info = h.xpath('//div[@id = "article_body"]/p//text()').extract()
     if not info:
         info = h.xpath('//div[@class = "article-body__content"]/p//text()').extract()
     if len(info)>1:
         info = ' '.join(str(r) for r in info)
         info = info.replace(u"\xa0", u" ")
     if "T" in date[0]:
         date,t = date[0].split('T')
     else:
         date = date[0]
Example #46
 def test_make_links_absolute(self):
     text = u'<a href="file.html">link to file</a>'
     sel = Selector(text=text, base_url='http://example.com')
     sel.root.make_links_absolute()
     self.assertEqual(u'http://example.com/file.html', sel.xpath('//a/@href').extract_first())
Example #47
 def find_history(self, sel: Selector) -> Generator[Match, None, None]:
     """
     Generator to find recent matches in parsel.Selector object
     :returns: Generator for Match objects
     """
     yield from self._find_matches(sel.xpath("//h2[contains(text(),'Recent')]/..//tr"))
Example #48
 def find_matches(self, sel: Selector) -> Generator[Match, None, None]:
     """
     Generator to find live and upcoming matches in parsel.Selector object
     :returns: Generator for Match objects
     """
     yield from self._find_matches(sel.xpath("//table[@id='gb-matches']//tr"))