Python Selector.replace示例，scrapy.selector.Selector.replace Python示例

示例#1

0

显示文件

    def parse(self, response):
        dirname = os.sep.join(['root'] + response.url.split('/')[2:-1])
        filename = os.sep.join([dirname, response.url.split('/')[-1]])
        article_text = Selector(response).xpath(
            '//div[@class="post"]').extract()[0]

        parser = Selector(text=article_text)

        article_title = parser.xpath(
            '//a[@id="cb_post_title_url"]/text()').extract()[0]
        title_link = parser.xpath(
            '//a[@id="cb_post_title_url"]/@href').extract()[0]

        article_text = article_text.replace(title_link, title_link[6:])

        item = ArticleItem()
        item['image_urls'] = [x for x in parser.xpath('//img/@src').extract()]
        item['image_names'] = [x.split('/')[-1] for x in item['image_urls']]

        # process image links.
        for url in item['image_urls']:
            article_text = article_text.replace(url, url[6:])

        if not os.path.exists(dirname):
            os.makedirs(dirname)
        with open(filename, 'wb') as fp:
            fp.write(self.html_start_l + article_title.encode('utf-8') +
                     self.html_start_r +
                     article_text.encode('utf-8', 'ignore') + self.html_end)

        return item

示例#2

0

显示文件

文件： article.py 项目： sighingnow/Spider-Utils

    def parse(self, response):
        dirname = os.sep.join(['root'] + response.url.split('/')[2:-1])
        filename = os.sep.join([dirname, response.url.split('/')[-1]])
        article_text = Selector(response).xpath('//div[@class="post"]').extract()[0]

        parser = Selector(text = article_text)

        article_title = parser.xpath('//a[@id="cb_post_title_url"]/text()').extract()[0]
        title_link = parser.xpath('//a[@id="cb_post_title_url"]/@href').extract()[0]

        article_text = article_text.replace(title_link, title_link[6:])

        item = ArticleItem()
        item['image_urls'] = [x for x in parser.xpath('//img/@src').extract()]
        item['image_names'] = [x.split('/')[-1] for x in item['image_urls']]

        # process image links.
        for url in item['image_urls']:
            article_text = article_text.replace(url, url[6:])

        if not os.path.exists(dirname):
            os.makedirs(dirname)
        with open(filename, 'wb') as fp:
            fp.write(self.html_start_l + article_title.encode('utf-8') + self.html_start_r + article_text.encode('utf-8', 'ignore') + self.html_end)

        return item

示例#3

0

显示文件

    def parse(self, response):
        print('################################')
        # print(response.url.split('=', 3)[1])
        # type
        type = '招标公告'
        if response.url.split('=', 3)[1] == 'W004_001&page':
            type = '中标公告'

        next_page = response.xpath(
            "/html/body/div[@class='cover']/div[@class='main']/div[@class='main-advert']/div[@class='main-cont']/div[@class='list_right']/div[@class='pager']/span/font/text()"
        ).extract()[0]
        print('################################page===' + next_page +
              '===Page################################')
        next_page = str(int(next_page) + 1)
        total_page = response.xpath(
            "/html/body/div[@class='cover']/div[@class='main']/div[@class='main-advert']/div[@class='main-cont']/div[@class='list_right']/div[@class='pager']/span/font/text()"
        ).extract()[1]
        next_url = 'http://www.tjgpc.gov.cn/webInfo/getWebInfoListForwebInfoClass.do?fkWebInfoclassId=W004_001&page=' + next_page + '&pagesize=10'
        # print(total_page)
        # print(next_url)
        for tr in response.xpath(
                "/html/body/div[@class='cover']/div[@class='main']/div[@class='main-advert']/div[@class='main-cont']/div[@class='list_right']/div[@class='cur']/table/tr"
        ).extract():
            title = Selector(
                text=tr).xpath('//td[2]/a[@class]/text()').extract()[0]
            category = Selector(
                text=tr).xpath('//td[2]/a[1]/text()').extract()[0]
            category = category.replace('[', '')
            category = category.replace(']', '')
            url = Selector(text=tr).xpath('//td[2]/a[2]/@href').extract()[0]
            # print(title)
            # print(tr)
            # print(category)
            # print(url)
            title = title.replace('成交结果公告', '')
            if len(title) > 3:
                temp = title[0:4]
                # print(temp)
                if (temp == '天津市'):
                    title = title.replace(temp, '')

            issue_at = Selector(text=tr).xpath('//td[3]/text()').extract()[0]
            issue_at = issue_at.replace('[', '')
            issue_at = issue_at.replace(']', '')
            # print('######'+'######'+title+'######'+category+'######'+url+'######'+time+'######')
            yield scrapy.Request(url,
                                 callback=self.parse_item,
                                 meta={
                                     "title": title,
                                     "type": type,
                                     "url": url,
                                     "issue_at": issue_at,
                                     "category": category
                                 })

        if (int(next_page) < int(total_page)):
            yield scrapy.Request(next_url,
                                 callback=self.parse,
                                 dont_filter=True)

示例#4

0

显示文件

    def parse(self, response):
        # get all the listing blocks
        listings = response.xpath('//a[@class="col-xs-12 profitem"]').getall()

        # within each listing block get the details
        for i in listings:
            # there is more than 1 heading or suburb, just get the first one
            suburb = Selector(text=i).xpath(
                '//h4[@class="mat-header"]/text()').get().strip()
            # new or updated listing
            status = Selector(text=i).xpath(
                '//span[@class="mat-text-span text-uppercase mat-new hidden-xs"]/text()'
            ).get()

            # price
            price = Selector(
                text=i).xpath('//h4[@class="mat-header mat-price"]').get()
            # some regex to extract the price
            loc = re.search("</sup>", price)
            price = price[loc.span()[1]:]
            price = price.replace('<sup>', '')
            price = price.replace('</sup>', '')
            price = price.replace('</h4>', '')
            price = re.sub('\xa0', ' ', price)
            price = price.strip()

            # get all feature details in a list
            details = Selector(text=i).xpath(
                '//ul[@class="mat-feture"]/li/div[@class="mat-fetaure-avl"]/text()'
            ).getall()
            # listing details
            home_type = details[0].strip()
            available = details[1].strip()
            occupants = details[2].strip()

            # get description
            desc = Selector(text=i).xpath(
                '//div[@class="col-sm-4 col-md-6 hidden-xs hidden-sm mathes-list"]/p/text()'
            ).get().strip()
            desc = desc.replace('\r', '')
            desc = desc.replace('\n', '')

            listing = {
                'suburb': suburb,
                'status': status,
                'price': price,
                'home_type': home_type,
                'available': available,
                'occupants': occupants,
                'description': desc,
            }
            yield (listing)

示例#5

0

显示文件

    def parse(self, response):
        dirname = os.sep.join(['root'] + response.url.split('/')[2:-1])
        filename = os.sep.join(
            [dirname, response.url.split('/')[-1] + '.html'])
        # parse artitle text.
        article_text = Selector(response).xpath(
            '//div[@id="article_details"]').extract()[0]

        parser = Selector(text=article_text)

        # parse artile title.
        article_title = parser.xpath(
            '//span[@class="link_title"]/a/text()').extract()[0]
        article_links = parser.xpath(
            '//a[re:test(@href, "[^/]+/article/details/\d+")]/@href').extract(
            )

        # replace links.
        article_text = article_text.replace(
            'http://static.blog.csdn.net/css/blog_detail.css',
            '/static.blog.csdn.net/css/blog_detail.css')
        for link in article_links:
            article_text = article_text.replace(
                link, '/blog.csdn.net' + link + '.html')

        item = ArticleItem()
        item['image_urls'] = [x for x in parser.xpath('//img/@src').extract()]
        # handle such image(with watermark) url:
        #    http://img.blog.csdn.net/20140917165912117?watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQvaWFpdGk=/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/SouthEast
        item['image_names'] = [
            (lambda k: k
             if '?' not in k else k.split('?')[0] + '.png')(x).split('/')[-1]
            for x in item['image_urls']
        ]

        # process image links.
        for url in item['image_urls']:
            article_text = article_text.replace(
                url, (lambda k: k
                      if '?' not in k else k.split('?')[0] + '.png')(url)[6:])

        if not os.path.exists(dirname):
            os.makedirs(dirname)
        with open(filename, 'wb') as fp:
            fp.write(self.html_start_l + article_title.encode('utf-8') +
                     self.html_start_r +
                     article_text.encode('utf-8', 'ignore') + self.html_end)

        return item

示例#6

0

显示文件

文件： Spider_Aiuw.py 项目： hkruni/Spider

    def parse(self,response):

        
        item = Aiuw_Item()
        
        item['site'] = response.url
        title = Selector(response).xpath("//span[@id='imgExplain']/text()").extract()
        if len(title) > 0 :
            item['title'] = title[0]
        else :
            return
        
        tags = Selector(response).xpath("//div[@class='tag']/a/text()").extract()
        if len(tags) > 0 :
            item['tag'] = ','.join(tags)
        else :
            item['tag'] = ''
            
        url = Selector(response).xpath("//div[@class='img_boxlist up userSelectNone']/img/@src").extract()[0]
        item['origin_url'] = url.replace("zip@q80", "zip@w400")
        item['new_url']=''
        item['mb']=''
        item['pixel']=''
        item['format']=url.split("/")[-1].split('.')[-1]
        return item;

示例#7

0

显示文件

文件： spider.py 项目： lixiang0/xwlb

    def parse(self, response):
        file_name = self.PATH + response.url.split(
            '/'
        )[-1] if 'index' not in response.url else self.PATH + response.url.split(
            '/')[-2] + '.shtml'
        with open(file_name, 'wb') as (writer):
            writer.write(response.body)
        html = response.xpath('//div[@id="contentELMT1368521805488378"]').get()
        lis = None
        if html is None:
            lis = response.xpath('//li/a').getall()
        else:
            lis = Selector(text=html).xpath('//li/a').getall()
        for li in [lis[0]]:
            try:
                url = Selector(text=li).xpath('//a[@href]').attrib['href']
                for url in [url]:
                    url = 'http://www.cctv.com' + url if url[0] == '/' else url
                    url = url.replace('news.cntv.cn', 'tv.cctv.com')
                    yield SplashRequest(url,
                                        callback=(self.parse_sumary),
                                        endpoint='render.html',
                                        args={
                                            'wait': 2,
                                            'http_method': 'GET'
                                        },
                                        headers=(self.headers))
            except Exception:
                continue

        for li in lis[1:]:
            try:
                url = Selector(text=li).xpath('//a[@href]').attrib['href']
                for url in [url]:
                    url = 'http://www.cctv.com' + url if url[0] == '/' else url
                    url = url.replace('news.cntv.cn', 'tv.cctv.com')
                    yield SplashRequest(url,
                                        callback=(self.parse_detail),
                                        endpoint='render.html',
                                        args={
                                            'wait': 2,
                                            'http_method': 'GET'
                                        },
                                        headers=(self.headers))
            except Exception as e:
                continue

示例#8

0

显示文件

文件： helper.py 项目： gtsueng/biothings.crawler

def pmid_to_citation(pmid):
    '''
    Use pmid to find citation string
    '''
    url = 'https://www.ncbi.nlm.nih.gov/sites/PubmedCitation?id=' + pmid
    body = requests.get(url, timeout=5).text
    citation = Selector(text=body).xpath('string(/)').get()
    return citation.replace(u'\xa0', u' ')

示例#9

0

显示文件

 def video_parse(self, response):
     item = VideoItem()
     user_id = response.meta['user_id']
     text = json.loads(response.text)
     html = text['data'].get("html", "")
     video_list = Selector(text=str(html)).xpath("//a/figure").extract()
     for v in video_list:
         data_date = Selector(text=v).xpath("//@data-date").extract()[0]
         data_url = Selector(text=v).xpath("//@data-url").extract()[0]
         data_title = Selector(text=v).xpath("//@data-title").extract()[0]
         watch_number = Selector(text=v).xpath("//p[@class='crumbs']//span[@class='nums']/text()").extract()[0]
         barrage_number = Selector(text=v).xpath("//p[@class='crumbs']//span[@class='nums']/text()").extract()[1]
         item['user_id'] = int(user_id)
         item['title'] = data_title
         item['watch_number'] = int(watch_number.replace(".", "").replace("万", ""))
         item['barrage_number'] = int(barrage_number.replace(".", "").replace("万", ""))
         item['time'] = datetime.datetime.strptime(data_date, "%Y/%m/%d")
         item['data_url'] = data_url
         yield item

示例#10

0

显示文件

 def parse_list(self , response):
     print('################################Page==='+re.search("\d+",response.url).group(0)+'===Page################################')
     
     for li in response.xpath("/html/body/div[@class='con_row']/div[@class='list_right f_l']/div[@class='search_list_con gg_list']/ul/li").extract():
         title = Selector(text=li).xpath('//a/text()').extract()[0]
         issue_at = Selector(text=li).xpath("//span[@class='search_list_time']/text()").extract()[0]
         url = Selector(text=li).xpath('//a/@href').extract()[0]
         url = 'http://www.hebeieb.com'+url
         category = Selector(text=li).xpath("//div[@class='search_list_biaoqian']/span[1]/text()").extract()[0]
         category = category.replace('行业：','')
         city = Selector(text=li).xpath("//div[@class='search_list_biaoqian']/span[2]/text()").extract()[0]
         city = city.replace('地区：','')
         type = '招标公告'
         # print(title)
         # print(issue_at)
         # print(url)
         # print(category)
         # print(city)
         yield scrapy.Request(url, callback=self.parse_item,meta={"title":title,"type":type,"url":url,"issue_at":issue_at,"city":city,"category":category})

示例#11

0

显示文件

 def article_content(self, response):
     item = response.meta['item']
     title = Selector(text=response.body).xpath('//h2[@id="activity-name"]/text()').extract()[0].encode(
         'utf-8').strip()
     content = Selector(text=response.body).xpath('//*[@id="js_article"]/div[@class="rich_media_inner"]').extract()[
         0].encode('utf-8')
     content = content.replace('\r\n', '').strip()
     item['title'] = title
     item['content'] = content
     return item  # 返回item，执行数据库操作

示例#12

0

显示文件

文件： com_zjpubservice_www.py 项目： baichuan-hailong/Html-JS

    def parse(self, response):
        for li in response.xpath("/html/body/form[@id='jyform']/div[@class='clearfix']/div[@id='jytypetext']/div[@class='clearfix  isshowdisplay']/div[@class='l']/div[@class='infor-bd clearfix']/ul[@class='infor-items']/div[@id='jyform:refreshData']/div[@id='jyform:refreshData_content']/table[@class='ui-datagrid-data']/tbody//tr[@class='ui-datagrid-row']/td[@class='ui-datagrid-column']/li[@class='notice-item infor-item clearfix']").extract():
            # print(li)
            title =  Selector(text=li).xpath("//div[@class='notice-block l']/a/text()").extract()[0]
            # \n\t\t
            title = title.replace('\n','')
            title = title.replace('\t','')

            type = '招标公告'
            city =  Selector(text=li).xpath("//span[@class='infro-span'][1]/text()").extract()[0]
            city = city.replace('【','')
            city = city.replace('】','')
            issue_at =  Selector(text=li).xpath("//span[@class='notice-date ']/text()").extract()[0]
            url =  Selector(text=li).xpath("//div[@class='notice-block l']/a/@href").extract()[0]
            url = 'http://www.zjpubservice.com'+url
            # print(str(title))
            # print(city)
            # print(issue_at)
            print(url)

示例#13

0

显示文件

文件： timberland.py 项目： radarlabs/alltheplaces

    def parse(self, response):
        xxs = Selector(response)

        pois = xxs.xpath('//poi').extract()

        for poi in pois:
            state = Selector(text=poi).xpath('//state/text()').get()
            if state == None:
                state = Selector(text=poi).xpath('//province/text()').get()

            addr = Selector(text=poi).xpath('//address1/text()').get()
            if addr == None:
                addr = Selector(text=poi).xpath('//address2/text()').get()
                if addr == None:
                    addr = Selector(text=poi).xpath('//dsply_adr/text()').get()

            name = Selector(text=poi).xpath('//name/text()').get()
            name = name.replace('<br>', '')
            name = name.replace('&reg', ' ')
            name = name.replace(';', '')
            name = name.replace('  ', ' ')

            properties = {
                'ref': Selector(text=poi).xpath('//clientkey/text()').get(),
                'name': name,
                'addr_full': addr,
                'city': Selector(text=poi).xpath('//city/text()').get(),
                'state': state,
                'postcode':
                Selector(text=poi).xpath('//postalcode/text()').get(),
                'country': Selector(text=poi).xpath('//country/text()').get(),
                'lat': Selector(text=poi).xpath('//latitude/text()').get(),
                'lon': Selector(text=poi).xpath('//longitude/text()').get(),
                'phone': Selector(text=poi).xpath('//phone/text()').get(),
                'extras': {
                    'brand': "Timberland"
                }
            }

            yield GeojsonPointItem(**properties)

示例#14

0

显示文件

def Vacancy_info(url):
    url = url.replace("/en/", "/ge/")
    print(url)
    page = requests.get(url)


    # Description
    try:
        description = Selector(response=page).xpath('//*[@id="job"]/table/tr[1]/td/table[2]').get()
        description = remove_tags(description)
        description = description.rstrip()
        description = description.lstrip()
        description = description.replace('*', "")
        description = re.sub(r"\s+", " ", description)
        print(description)
    except:
        description = ""
    if detect(description) == "ru":
        description_ru = description
        description_en = Translate(description)
        description_ka = ""
    elif detect(description) == "et":
        description_ru = ""
        try: 
            description_en = Translate(description)
        except:
            description_en = ""
        description_ka = description
    else:
        description_ru = ""
        description_en = description
        description_ka = ""

    # Email
    try:
        email = re.findall(r'[\w\.-]+@[\w\.-]+', description)
        email = email[0]
    except:
        email = ""

    data = {
        "description_ka" : description_ka,
        "description_ru" : description_ru,
        "description_en" : description_en,
        "email" : email
    }
    return data

# Vacancy_info("https://jobs.ge/en/?view=jobs&id=268715")

示例#15

0

显示文件

 def get_reviews(self, item):
     url = Selector(text=item).xpath('.//a/@href').extract()
     res = ''
     if len(url):
         url = url[0]
         url = url.replace('../../','http://www.comparometer.in/')
         #res = requests.get(url)
         #data = Selector(text=res.text).xpath('//div[@class="col-sm-12"]/div[@class="col s4 reviewrating"]/img/@src').extract()
         #data2 = Selector(text=res.text).xpath('//div[@class="col-sm-12"]/div[@class="col s4 reviewrating"]/span/text()').extract()
         #data3 = Selector(text=res.text).xpath('//div[@class="col-sm-12"]/div[@class="col s4 reviewrating"]/a/@href').extract()
         #addup = list(zip(data,data2,data3))
         #review = " ".join(str(x) for x in addup)
     else:
         review = "NA"
     return url

示例#16

0

显示文件

文件： crawl.py 项目： stray-leone/crawlers

 def parse_name_and_birth(name_and_birth):
     #<h4>강기윤</h4>
     #    <ul>
     #      <li class="photo">
     #           <img src="/photo/9770703.jpg" alt="강기윤 의원사진" />
     #      </li>
     #      <li>姜起潤</li>
     #      <li>KANG Gi Yun</li>
     #      <li>1960-06-04</li>
     #   </ul>
     profile = get_xpath_data(page,".//*/div[@class='profile']")
     name_kr = get_xpath_data(profile, ".//*/h4/text()")
     name_cn = Selector(text=profile).xpath('.//*/li/text()')[2].extract()
     name_en = Selector(text=profile).xpath('.//*/li/text()')[3].extract()
     birth = Selector(text=profile).xpath('.//*/li/text()')[4].extract()
     return [name_kr, name_cn, name_en, birth.replace('.','-')]

示例#17

0

显示文件

    def parse(self, response):
        rows = response.xpath('//div[@class="m_content"]/ul/li[not(@class)]').extract()
        for isi in rows:
            link_page = Selector(text=isi).xpath('//div[@class="desc_nhl"]/a/@href').extract_first()
            clean_date = Selector(text=isi).xpath('//div[@class="desc_nhl"]/span[@class="labdate f11"]/text()').extract_first()
            clean_date = clean_date.replace("DETIKNEWS | ", "")
            item = {
                'headline' : Selector(text=isi).xpath('//article/div[@class="desc_nhl"]/a[@data-category="WP Kanal Jawatimur"]/h2/text()').extract_first(),
                'main_headline' : Selector(text=isi).xpath('//div[@class="desc_nhl"]/text()[4]').extract_first().strip(),
                'date' : clean_date,
                'url' : link_page
            }

            request = scrapy.Request(link_page, callback=self.parse_page2)
            request.meta['item'] = item
            yield request

示例#18

0

显示文件

文件： main.py 项目： tdiffendal/WaterBill

 def parseWaterBill(self, response):
     #Check if we found the water bill if not then write to the failed CSV and return.
     if (len(
             response.xpath(
                 "//span[@id='ctl00_ctl00_rootMasterContent_LocalContentPlaceHolder_lblCurrentBalance']"
             )) == 0):
         print("Couldn't find a water bill for account " +
               response.meta['account_or_address'])
         self.writeFailedCSV(response.meta['account_or_address'])
         return None
     #I use the item feature in scrapy to store the items.
     wateritem = WaterbillItem()
     wateritem['Searched_Address'] = response.meta[
         'search_type']  #This is a relic of when I searched by addresses.
     table = response.xpath('//table[@class="dataTable"]//tr')
     headers = [
         'Account Number', 'Service Address', 'Current Read Date',
         'Current Bill Date', 'Penalty Date', 'Current Bill Amount',
         'Previous Balance', 'Current Balance', 'Previous Read Date',
         'Last Pay Date', 'Last Pay Amount', 'TimeStamp'
     ]
     #I can't determine if this actually works because I can't find an address with a shut off notice.
     if (len(
             response.xpath(
                 "//span[@id='ctl00_ctl00_rootMasterContent_LocalContentPlaceHolder_lblTurnOffDate']"
             )) != 0):
         wateritem['TurnOffDate'] = "Yes"
         #wateritem['TurnOffDate'] = Selector(text=row.extract()).xpath("//span[@id='ctl00_ctl00_rootMasterContent_LocalContentPlaceHolder_lblTurnOffDate']").extract_first()
     else:
         wateritem['TurnOffDate'] = 'No'
     for row in table:
         header = Selector(
             text=row.extract()).xpath('//th/text()').extract_first()
         value = Selector(text=row.extract()).xpath(
             '//td/descendant::*/text()').extract_first()
         if value == None:
             value = ''  #So it populates the excel sheet with a blank spot
         if (header != None and header.strip().replace(':', "") in headers):
             value = value.replace('$', '').replace(",", '')
             if ("Date" in header and value != ''):
                 #Convert to SQL Datetime Format
                 value = datetime.strptime(value.strip(),
                                           '%m/%d/%Y').strftime('%Y-%m-%d')
             wateritem[header.strip().replace(':', "").replace(
                 ' ', '_')] = value.strip()
     wateritem['Timestamp'] = datetime.today().strftime('%Y-%m-%d')
     return wateritem

示例#19

0

显示文件

文件： foody_spider.py 项目： baopanda/Crawl_Foody_VN_News-..

    def parse_item(self, response):
        # body = response.body
        page = response.url.split("/")[-2]
        # sel = Selector(response)
        # hxs = HtmlXPathSelector(response)
        list_it = response.xpath(
            '//li[contains(@class,"review-item")]').extract()
        list_item = []
        for it in list_it:
            # item = FoodyItem()
            # content = response.xpath('//div[contains(@class,"review-des")]/div[contains(@class,"rd-des")]/span/text()').extract_first()
            content = Selector(text=it).xpath(
                '//div[contains(@class,"review-des")]/div[contains(@class,"rd-des")]/span/text()'
            ).extract_first()
            if content != None:
                content = content.replace("\n", " ")

            # point =response.xpath('//div[contains(@class,"review-des")]//div[contains(@class,"review-points")]/span/text()').extract_first()
            point = Selector(text=it).xpath(
                '//div[contains(@class,"review-des")]//div[contains(@class,"review-points")]/span/text()'
            ).extract_first()
            # print("############################################################")
            # # print(it)
            # print(content)
            # print("############################################################")
            # print(point)

            # list_item.append(point + content)

            if point != None and content != None:
                # list_item.append(point+ "\t" + content)
                list_item.append(point + "\t" + content)
                # yield self.parse_detail_item(it)

            yield scrapy.Request(it, self.parse_detail_item)

            # yield self.parse_detail_item(it)
            # yield SplashRequest(it, self.parse_item, endpoint='execute',
            #                     args={'lua_source': script_per_page})

        for a in list_item:
            print("###############################")
            print(a)

        with open("data2/{}.txt".format(page), "w", encoding='utf-8') as file:
            for i in list_item:
                file.write(i + "\n")

示例#20

0

显示文件

    def parse(self, response):
        # print('-----------------------------------------------')
        # print(response.url.split('=', 1)[1])
         # type
        type = '招标公告'
        if response.url.split('=', 1)[1]=='67':
            type = '中标公告'
        # print(type)

        category = response.xpath("/html/body/table[1]/tr/td[2]/table/tr[1]/td[@class='c_pt']/table/tr[1]/td[3]/span[@class='zt1']/text()").extract()[0]
        category = category.split('--', 3)[3]
        # print(category)


        currentPage  = response.xpath("/html/body/table[1]/tr/td[2]/table/tr[1]/td[@class='c_pt']/table/tr[2]/td/div[@class='zt3']/div[@class='pager']/strong/font/text()").extract()[0]
        nextPage     = int(currentPage)+1
        # print(currentPage)
        # print(nextPage)
        print('Nav'+response.url.split('=', 1)[1]+'###############################Page==='+currentPage+'===Page################################')

        totalPage    = response.xpath("/html/body/table[1]/tr/td[2]/table/tr[1]/td[@class='c_pt']/table/tr[2]/td/div[@class='zt3']/div[@class='pager']/a[text()='最后一页 »']/@href").extract()[0]
        totalPage    = totalPage.split('=', 2)[2]
        # print('total---'+totalPage)

        # /html/body/table[1]/tbody/tr/td[2]/table/tbody/tr[1]/td[@class='c_pt']/table/tbody/tr[2]/td/div[@class='zt3']/table[@id='node_list']/tbody/tr[@class='odd'][1]/td[1]/a
        # print('tr-------------------------------------->')
        for tr in response.xpath("/html/body/table[1]/tr/td[2]/table/tr[1]/td[@class='c_pt']/table/tr[2]/td/div[@class='zt3']/table[@id='node_list']/tbody/tr").extract():
            # print(tr)
            title = Selector(text=tr).xpath('//td[1]/a/text()').extract()[0]

            url = Selector(text=tr).xpath('//td[1]/a/@href').extract()[0]
            url = 'http://www.sxzfcg.cn/'+url
            issue_at = Selector(text=tr).xpath('//td[2]/text()').extract()[0]
            issue_at = issue_at.replace('[','')
            issue_at = issue_at.replace(']','')
            city = '山西'
            # print(title)
            # print(url)
            # print(issue_at)
            yield scrapy.Request(url, callback=self.parse_item,meta={"title":title,"type":type,"url":url,"issue_at":issue_at,"city":city,"category":category},dont_filter=True)

        next_url = response.url.split('=',2)[0]+'='+response.url.split('=',2)[1]+'&page='+str(nextPage)
        next_url = 'http://www.sxzfcg.cn/view.php?nav=61&page='+str(nextPage)

示例#21

0

显示文件

文件： judge.py 项目： mentu/juven_python

    def parse(self, response):

        docID = response.url[response.url.find("=") + 1:]
        docURL = '/content/content?DocID=' + docID
        if self.collection.find_one({"链接": response.url}) is not None:
            print "<<<<<<<<<<<Catch Duplicate<<<<<<<<<"
            return

        print "----------------------------- Cur -----------------------"
        self.cur += 1
        print response.url
        print self.cur

        item = Judge()
        item['case_name'] = ''.join(
            Selector(response).xpath(
                "//input[@id='hidCaseName']/@value").extract())
        item['case_num'] = ''.join(
            Selector(response).xpath(
                "//input[@id='hidCaseNumber']/@value").extract())
        item['url'] = response.url
        dic_string_unmod = Selector(response).xpath(
            "//input[@id='hidCaseInfo']/@value").extract()[0]
        dic_string = ''.join(dic_string_unmod.replace('null', 'None').split())
        item['case_info'] = eval(dic_string)['诉讼记录段原文']
        item['procedure'] = eval(dic_string)['审判程序']
        item['court'] = ''.join(
            Selector(response).xpath(
                "//input[@id='hidCourt']/@value").extract())
        item['company'] = self.advanced_filter[0]
        docID = Selector(response).xpath(
            "//input[@id='hidDocID']/@value").extract()
        doc_text_url = "http://wenshu.court.gov.cn/CreateContentJS/CreateContentJS.aspx?DocID=" + ''.join(
            docID)
        doc_url = "http://wenshu.court.gov.cn/content/content?DocID=" + ''.join(
            docID)
        req = scrapy.Request(
            doc_text_url,
            callback=self.parse_doc,
            dont_filter=True,
            errback=lambda x: self.download_errback(x, doc_url))
        req.meta['foo'] = item
        yield req

示例#22

0

显示文件

文件： lottery.py 项目： wangjunping0938/hunter

    def parse(self, response):
        rules = Rules().parse
        block = response.xpath(rules['block']).extract()
        for b in block:
            item = LotteryItem()
            red_number = Selector(text=b).xpath(rules['red_number']).extract()
            blue_number = Selector(text=b).xpath(
                rules['blue_number']).extract()
            phase_number = Selector(text=b).xpath(
                rules['phase_number']).extract_first()
            note_number = Selector(text=b).xpath(
                rules['note_number']).extract_first()
            bonus = Selector(text=b).xpath(rules['bonus']).extract_first()

            item['red_number'] = ','.join(map(str, map(int, red_number)))
            item['blue_number'] = ','.join(map(str, map(int, blue_number)))
            item['phase_number'] = int(phase_number)
            item['note_number'] = int(note_number)
            item['bonus'] = int(float(bonus.replace(',', '')))
            yield item

示例#23

0

显示文件

文件： vacancy.py 项目： Caravan2/scripts

def Vacancy(link):
    print("request sent for Vacancy succesfully")
    url = link
    print(url)
    # headers = {"Accept-Language": "en-US,en;q=0.5"}
    page = requests.get(url)  #headers=headers)

    # Published
    try:
        published = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/ul/li[2]/span/text()[2]'
        ).get()
        published = published.strip().split(" ")
        publish_day = int(published[0].split("/")[0])
        publish_month = int(published[0].split("/")[1])
        publish_year = int(published[0].split("/")[2])
    except Exception as e:
        publish_day = 0
        publish_month = 0
        publish_year = 0
    if yesterday_day != publish_day or yesterday_month != publish_month:
        print("Not published yesterday")
        return

    # Location #
    try:
        location = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/ul/li[1]/text()'
        ).get()
        location = location.strip()
        location_id = []
        location = {"city": f"{location}", "id": f"{Geonames(location)}"}
        location_id.append(location)
    except:
        location_id = [{'city': 'Yerevan', 'id': '616052'}]

    # Posted by
    try:
        posted_by = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/div[1]/div[2]/p[1]/text()'
        ).get()
        posted_by = posted_by.strip()
    except:
        posted_by = ""

    # Email
    try:
        email = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/div[1]/div[2]/p[2]/text()'
        ).get()
        email = email.strip()
        if email == "":
            email = []
        else:
            email = [email]
    except:
        email = []

    # Workspace
    try:
        workspace = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[2]/div[2]/div[2]/p/text()'
        ).get()
        workspace = workspace.strip()
    except:
        workspace = ""

    # Job_type
    try:
        job_type = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[3]/div[2]/div[2]/p/text()'
        ).get()
        job_type = job_type.strip()
    except:
        job_type = ""

    # Salary
    try:
        salary = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[4]/div[2]/div[2]/p/text()'
        ).get()
        salary = salary.strip().replace("Until ", "")
        if "-" in salary:
            salary = salary.split("-")
            min_salary = int(salary[0].strip())
            max_salary = int(salary[1].strip())
        elif "-" not in salary and salary != '':
            min_salary = int(salary)
            max_salary = int(salary)
        else:
            min_salary = 0
            max_salary = 0
    except:
        min_salary = 0
        max_salary = 0

    # Education
    try:
        education = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[5]/div[2]/div[2]/p/text()'
        ).get()
        education = education.strip()
    except:
        education = ""

    # Experience
    try:
        experience = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[6]/div[2]/div[2]/p/text()'
        ).get()
        experience = experience.strip()
    except:
        experience = ""

    # Gender
    try:
        gender = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[7]/div[2]/div[2]/p/i/@class'
        ).get()
        if "female" in gender:
            gender = "female"
        elif "male" in gender:
            gender = "male"
        else:
            gender = ''
    except:
        gender = ""

    # Age
    try:
        age = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[8]/div[2]/div[2]/p/text()'
        ).get()
        age = age.strip()
    except:
        age = ""

    print(1)

    # Description
    try:
        description = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[10]/div[2]/div/p/text()'
        ).get()
        description = description.strip()
    except:
        description = ""
    description_en = ""
    description_am = ""
    try:
        if detect(description) == "et":
            try:
                description_en = Translate(description)
            except:
                description_en = ""
            description_am = description
        else:
            description_en = description
            description_am = ""
    except:
        description_en = ""
        description_am = ""

    # Phone
    try:
        phone = Selector(response=page).css(
            '#sidebar-border > div.detailed-info-block.form-inline.clearfix > div.clearfix > div > div.user-details'
        ).extract()
        phones = []
        for phone in phone:
            phone = remove_tags(phone).strip()
            area_code = "374"
            number = phone.replace(" ", "")
            number = number.replace("-", "")
            number = number.replace("(", "")
            number = number.replace(")", "")
            phones.append({'country_code': area_code, "number": number})
    except:
        phone = []

    # Username
    try:
        username = Selector(response=page).xpath(
            '//*[@id="sidebar-border"]/div[1]/div[1]/div/div[1]/div[2]/div[1]/div[2]/h6/a/text()'
        ).get()
        username = username.strip()
    except:
        username = ""

    data = {
        "publish_day": publish_day,
        "publish_month": publish_month,
        "publish_year": publish_year,
        "location_id": location_id,
        "posted_by": posted_by,
        "email": email,
        "workspace": workspace,
        "job_type": job_type,
        "min_salary": min_salary,
        "max_salary": max_salary,
        "education": education,
        "experience": experience,
        "gender": gender,
        "age": age,
        "description_am": description_am,
        "description_en": description_en,
        "phone": phones,
        "username": username
    }

    print(data)
    return data


# Vacancy("https://full.am/en/job/public/view/1163")

# https://full.am/en/job/public/view/12067
# https://full.am/en/job/public/view/1163

示例#24

0

显示文件

文件： app.py 项目： mrvaghani/customer_feedback_crawler

    def parse(self, response):
        detailed_review_object_list = []
        review_selector_list = response.xpath(
            '//div[@id="reviews-container"]//div[@class="js-paginator-data"]'
        ).xpath('//div[@class="rvw js-rvw"]')
        for _review_selector in review_selector_list:
            _current_review_selector_body = _review_selector.get()
            # _review_rating = _review_selector.xpath('//div[@class="rvw__hdr-stat"]//img/@data-rating').get()  # '5.0'
            _review_rating = Selector(
                text=_current_review_selector_body).xpath(
                    '//div[@class="rvw__hdr-stat"]//img/@data-rating').get()

            # _author_info = _review_selector.xpath('//div[@class="rvw-aut__inf"]/strong/text()').get()  # 'Julie of Ceres,, CA'
            _author_info = Selector(text=_current_review_selector_body).xpath(
                '//div[@class="rvw-aut__inf"]/strong/text()').get()

            _author_state: str = _author_info.split(',')[-1]  # 'CA'

            # _review_date_text = _review_selector.xpath('//div[@class="rvw-bd ca-txt-bd-2"]/span/text()').get()  #'Original review: March 18, 2019'
            _review_date_text = Selector(
                text=_current_review_selector_body).xpath(
                    '//div[@class="rvw-bd ca-txt-bd-2"]/span/text()').get(
                    ).split(':')[-1]

            # Let's remove whitespace to make it easier to convert to datetime object
            _review_date_text = _review_date_text.replace(' ', '')
            # _review_date_text = 'March18,2019'
            _review_date_text = _review_date_text[-4:]
            # _date_pattern = '%b.%d,%Y'  # 'Oct.21,2019'
            _date_pattern = '%Y'  # '2019'
            _struct_time_format = (time.strptime(_review_date_text,
                                                 _date_pattern))
            _date_time_format = datetime.datetime(*_struct_time_format[:6])
            eastern = pytz.timezone('US/Eastern')
            utc = pytz.utc
            aware_date_time = eastern.localize(_date_time_format)
            utc_review_date_time = aware_date_time.astimezone(utc).timestamp()

            # This will be the list of all paragraphs that we find in a review that we will be using to process.
            _review_description_paragraph_list: list = Selector(
                text=_current_review_selector_body).xpath(
                    '//div[@class="rvw-bd ca-txt-bd-2"]/p').getall()
            _clean_review_description_list: list = []

            # Let's check if there is a collapsed div that we need to process.
            if Selector(text=_current_review_selector_body).xpath(
                    '//div[@class="rvw-bd ca-txt-bd-2"]/div[@class="js-collapsed"]'
            ).get() is not None:

                # We need to get all the paragraphs in the collapsed div that we found
                _collapsed_paragraph_list = Selector(
                    text=_current_review_selector_body
                ).xpath(
                    '//div[@class="rvw-bd ca-txt-bd-2"]/div[@class="js-collapsed"]/p'
                ).getall()

                # Let's add these new paragraphs to our original list for processing
                _review_description_paragraph_list.extend(
                    _collapsed_paragraph_list)

            for para in _review_description_paragraph_list:
                if Selector(text=para).xpath('//p/text()').get(
                ) is not None:  # If the paragraph is not empty
                    _clean_review_description_list.append(
                        Selector(text=para).xpath('//p/text()').get())

            _clean_review_description = ''.join(_clean_review_description_list)
            _num_found_useful_text: str = Selector(
                text=_current_review_selector_body
            ).xpath(
                '//div[@class="rvw-foot"]/span[@class="rvw-foot__helpful-count js-helpful-count ca-txt--clr-gray"]/strong/text()'
            ).get()

            # We need to extract the number from the text we get from _num_found_useful_text --> E.g. '97 people'
            _num_found_useful: str = _num_found_useful_text.split(' ')[0]

            detailed_review_object = {
                'ratings': _review_rating,
                'reviewer_location': _author_state,
                'review_time_utc': str(utc_review_date_time),
                'review_description': _clean_review_description,
                'num_found_useful': _num_found_useful
            }
            detailed_review_object_list.append(detailed_review_object)

        _return_data = {'reviews': detailed_review_object_list}
        return _return_data

示例#25

0

显示文件

文件： app.py 项目： Caravan2/scripts

                # Company
                try:
                    company = Selector(response=page).xpath(
                        f'//*[@id="affiliations-list"]/tbody/tr[{tr}]/td[1]/a/text()'
                    ).get()
                    company = company.strip()
                except Exception as e:
                    company = e

                # Role
                try:
                    role = Selector(response=page).xpath(
                        f'//*[@id="affiliations-list"]/tbody/tr[{tr}]/td[2]/text()'
                    ).get()
                    role = role.strip()
                    role = role.replace("\n", "")
                    role = re.sub(' +', ' ', role)
                except:
                    role = ""

                # Date
                try:
                    starting_from = Selector(response=page).xpath(
                        f'//*[@id="affiliations-list"]/tbody/tr[{tr}]/td[3]/text()'
                    ).get()
                    starting_from = starting_from.strip()
                except:
                    starting_from = ""

                # Documentation
                try:

示例#26

0

显示文件

            publish_day = int(published[0].split(" ")[1])
            publish_month = int(months[published[0].split(" ")[0]])
        except:
            publish_year = 0
            publish_day = 0
            publish_month = 0
        if yesterday_day != publish_day or yesterday_month != publish_month:
            print("Not published yesterday")
            continue

        # Ends
        try:
            ends = Selector(response=page).xpath(
                f'/html/body/div[3]/div[1]/div/div/div[1]/div/div[2]/div/article[{div}]/div/div[2]/p/span[4]/time/span[2]/text()'
            ).get()
            ends = ends.replace("-", "").strip()
            ends = ends.strip().split(",")
            deadline_year = int(ends[1].strip())
            deadline_day = int(ends[0].split(" ")[1])
            deadline_month = int(months[ends[0].split(" ")[0]])
        except:
            deadline_year = 0
            deadline_day = 0
            deadline_month = 0

        # Logo
        try:
            logo = Selector(response=page).xpath(
                f'/html/body/div[3]/div[1]/div/div/div[1]/div/div[2]/div/article[{div}]/div/div[1]/a/img/@src'
            ).get()
        except:

示例#27

0

显示文件

文件： manager.py 项目： ildap/parser

    def kinopars(self, name, year, content):
        content = content.decode('cp1251')
        content = content.encode('utf8')
        content = self.resub.sub('', content)

        def check(obj):
            if obj:
                return obj[0]
            else:
                return ''

        xpath = '//link[@rel="canonical"]/@href'
        id = Selector(text=content).xpath(xpath).extract()
        if id:
            id = self.renum.findall(id[0])[0]
        xpath = '//div[@class="brand_words"][@itemprop="description"]/text()'
        text = check(Selector(text=content).xpath(xpath).extract())
        text = self.req.sub('', text)

        xpath = '//span[@class="rating_ball"]/text()'
        rating = check(Selector(text=content).xpath(xpath).extract())
        if text and rating and id:
            print 'was found on kinopoisk.ru/film', id
            xpath = '//a[@class="popupBigImage"]/img/@src'
            poster = check(Selector(text=content).xpath(xpath).extract())
            if poster == 'http://st.kp.yandex.net/images/movies/poster_none.png':
                poster = u'false'
            else:
                poster = u'true'

            xpath = '//span[@class="ratingCount"]/text()'
            count = Selector(text=content).xpath(xpath).extract()

            if count:
                count = count[0]
                count = count.replace(u'\xa0', u'')
            else:
                count = 0;
            print 'rating', rating, count,
            xpath = '//td[@class="time"]/text()'
            time = Selector(text=content).xpath(xpath).extract()
            nulltime = '0:0'
            if len(time) > 1:
                time = self.retime.findall(time[1])
                if len(time) >= 1:
                    time = time[0]
                else:
                    time = nulltime

            elif len(time) == 1:
                time = self.renum.findall(time[0])
                if len(time) >= 1:
                    time = int(time[0])
                    th = time / 60
                    tm = time - (th * 60)
                    time = str(th) + ':' + str(tm)
                else:
                    time = nulltime
            else:
                time = nulltime

            print 'time', time,

            xpath = '//div[@id="block_rating"]/div[1]/div[2]/text()'
            imdb = check(Selector(text=content).xpath(xpath).extract())

            if imdb:
                imdb = float(self.rescfl.findall(imdb)[0])
            else:
                imdb = 0;
            print 'imdb:', imdb
            head = '(name,year,text,rating,count,imdb,time,kinopoiskid,poster)'
            values = (name.encode('utf8'), year, text.encode('utf8'), rating, count, imdb, time, id, poster)
            fid = self.db.insert('ruparser_film', head, values)
            return fid

示例#28

0

显示文件

文件： vacancy.py 项目： Caravan2/scripts

def Vacancy_info(url):
    print(url)
    page = requests.get(url)

    # Description
    try:
        description = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[1]/div[2]/div[4]').get()
        description = remove_tags(description)
        description = description.rstrip()
        description = description.lstrip()
        description = re.sub(r"\s+", " ", description)
        print(description)
    except:
        description = ""
    if detect(description) == "ru":
        description_ru = description
        description_en = Translate(description)
        description_ka = ""
    elif detect(description) == "et":
        description_ru = ""
        try:
            description_en = Translate(description)
        except:
            description_en = ""
        description_ka = description
    else:
        description_ru = ""
        description_en = description
        description_ka = ""

    # Email
    try:
        email = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[1]/div[2]/div[2]/div[2]/div/div/a/@href'
        ).get()
        email = email.replace("mailto:", "")
    except:
        email = ""

    # Location
    try:
        location = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[1]/div[2]/div[3]/div[2]/div[1]/div[2]/span/text()'
        ).get()
        location_id = []
        try:
            location_id.append({
                "city": f"{location}",
                "id": f"{Geonames(location)}"
            })
        except:
            location_id.append({"city": f"{location}", "id": "611717"})
    except:
        location_id = [{"city": "Tbilisi", "id": "611717"}]

    # Category
    try:
        category = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[1]/div[2]/div[3]/div[2]/div[2]/div[2]/span[1]/text()'
        ).get()
    except:
        category = ""

    # Stack
    try:
        stack = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[1]/div[2]/div[3]/div[2]/div[4]/div[2]/text()'
        ).get()
        if "სრული განაკვეთი" in stack:
            stack = "Full-Stack"
    except:
        stack = ""

    data = {
        "description_en": description_en,
        "description_ka": description_ka,
        "description_ru": description_ru,
        "email": email,
        "location": location_id,
        "category": category,
        "stack": stack
    }

    print("Vacancy Scraped Succesfully")
    return data

示例#29

0

显示文件

文件： news_spider.py 项目： chaman1avnish/hackernews_clone

    def parse(self, response):
        description = response.xpath(
            "//table[@class='itemlist']/tr[not(re:test(@class, "
            "'(spacer)'))]").extract()
        row = self.get_default_row_dict()
        # print description
        for i, v in enumerate(description):
            index = i
            if not row['rank']:
                value = Selector(text=v).xpath(
                    '//td[1]/span[@class="rank"]/text()').extract_first()
                row['rank'] = int(value.replace('.', '')) if value else 0

            if not row['story_text']:
                value = Selector(text=v).xpath(
                    '//td[3]/a[@class="storylink"]/text()').extract_first()
                row['story_text'] = value.encode("utf8") if value else ''

            if not row['link_href']:
                value = Selector(text=v).xpath(
                    '//td[3]/a[@class="storylink"]/@href').extract_first()
                # print value
                row['link_href'] = value if value else ''

            if not row['hn_user']:
                value = Selector(text=v).xpath(
                    '//a[@class="hnuser"]/text()').extract_first()
                row['hn_user'] = value.encode("utf8") if value else ''

            if not row['age']:
                value = Selector(text=v).xpath(
                    '//span[@class="age"]/a/text()').extract_first()
                row['age'] = int(value.split(' ')[0]) if value else 0

            if not row['total_comments']:
                value = Selector(text=v).xpath(
                    '//td[@class="subtext"]/a[contains(@href, "item?id=")]/text()'
                ).extract_first()
                if value:
                    value = value.encode('ascii', 'ignore').replace(
                        'comments', '') if value else ''
                    value = value.encode('ascii', 'ignore').replace(
                        'comment', '') if value else ''
                    row['total_comments'] = int(value) if represents_int(
                        value) else 0

            if not row['score']:
                value = Selector(text=v).xpath(
                    '//span[@class="score"]/text()').extract_first()
                row['score'] = int(value.split(' ')[0]) if value else 0

            if not row['hn_id_code']:
                value = Selector(
                    text=v).xpath('//tr[@class="athing"]/@id').extract_first()
                row['hn_id_code'] = int(value) if represents_int(value) else 0

            if all([None for i, v in row.items() if v == None]):
                print 'Go for save >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>'
                data = row.copy()
                row = self.get_default_row_dict()
                self.comment_url.append(
                    'https://news.ycombinator.com/item?id=15318440')
                news_id = data['hn_id_code']
                item = NewsBotItem(data)
                yield item
                request = scrapy.Request(
                    url='https://news.ycombinator.com/item?id=' + str(news_id),
                    callback=self.parse_comment)
                request.meta['item'] = item
                request.meta['news_id'] = int(news_id)
                yield request

            if index % 2:
                row = self.get_default_row_dict()

示例#30

0

显示文件

文件： myjob.py 项目： Caravan2/scripts

                        f'//*[@id="MainContentPlaceHolder_jobPageContainer"]/a[{div}]/div/div[2]/div/text()'
                    ).get()
                    location = location.split(",")[0]
                    location_id = [{
                        "city": f"{location}",
                        "id": f"{Geonames(location)}"
                    }]
                except:
                    location_id = [{'city': 'Yerevan', 'id': '616052'}]

                # Publication
                try:
                    published = Selector(response=page).xpath(
                        f'//*[@id="MainContentPlaceHolder_jobPageContainer"]/a[{div}]/div/div[1]/div[3]/text()'
                    ).get()
                    published = published.replace("Published on ",
                                                  "").split("/")
                    publish_day = int(published[0])
                    publish_month = int(published[1])
                    publish_year = int(published[2])
                except:
                    publish_day = 0
                    publish_month = 0
                    publish_year = 0
                if publish_day != yesterday_day:
                    continue

# //*[@id="MainContentPlaceHolder_jobPageContainer"]/a[1]/div/div[1]/div[2]
# //*[@id="MainContentPlaceHolder_jobPageContainer"]/a[2]/div/div[1]/div[2]

                data = {
                    "company": company,

示例#31

0

显示文件

文件： bia.py 项目： Caravan2/scripts

def BiaFunction(company):
    driver.get(f"https://www.bia.ge/EN")

    driver.find_element_by_xpath('//*[@id="Filter_Query"]').send_keys(
        f"{company}")
    time.sleep(3)
    try:
        link = driver.find_element_by_xpath(
            '/html/body/div[8]/div[2]').get_attribute('data-url')

        page = requests.get(link)

        # Company name
        name = Selector(response=page).xpath(
            '//*[@id="TrademarksListBox"]/li/text()').get()

        # Vat number
        vat_number = Selector(response=page).xpath(
            '//*[@id="tpAboutCompany"]/table/tbody/tr[2]/td[2]/span[2]/text()'
        ).get()

        # Address
        try:
            address = Selector(response=page).xpath(
                '//*[@id="tpAboutCompany"]/table/tbody/tr[4]/td[2]/span[2]/text()'
            ).get()
            raw = address.split(",")
            postal_code = raw[0]
            location = raw[1]
            location = location.lstrip()
            region = raw[2]
            appartment = raw[3]
            city_id = Geonames(location)
            address = {
                "location": {
                    "country": "GE",
                    "city": {
                        "id": f"{city_id}",
                        "city": location
                    }
                },
                "postal_code": postal_code,
                "appartament": appartment,
                "region": region
            }
        except Exception as e:
            print(e)
            address = {}

        # Working hours
        try:
            working_hours = Selector(response=page).xpath(
                '//*[@id="tpAboutCompany"]/table/tbody/tr[5]/td[2]/ul/li/text()'
            ).get()
            raw = working_hours.split(":", 1)
            days = raw[0].split("-")
            till = days[1].lstrip().lower()
            days = []
            for day in weekdays:
                if day != till:
                    days.append(day)
                else:
                    days.append(day)
                    break

            hourfrom = raw[1].split("-")[0]
            hourfrom = hourfrom.lstrip()
            hourfrom = hourfrom.rstrip()

            hourto = raw[1].split("-")[1]
            hourto = hourto.lstrip()
            hourto = hourto.rstrip()
            business_hours = {
                "week_days": days,
                "hour_from": hourfrom,
                "hour_to": hourto
            }
        except:
            business_hours = {}

        # Foundation Date
        foundation_date = Selector(response=page).xpath(
            '//*[@id="tpAboutCompany"]/table/tbody/tr[3]/td[2]/span[2]/text()'
        ).get()

        # Phone
        try:
            phone = Selector(response=page).xpath(
                '//*[@id="ContactsBox"]/table/tbody/tr[2]/td[2]/span').get()
            phone = remove_tags(phone)
            if "," in phone:
                array = phone.split(",")
                phone = []
                for each in array:
                    each = each.lstrip()
                    each = each.rstrip()
                    each = each.split(" ", 1)
                    code = each[0]
                    code = code.replace("+", "")
                    number = each[1]
                    number = number.replace(" ", "")
                    phone.append({"country_code": code, "number": number})
            else:
                phone = phone.lstrip()
                add = phone.rstrip()
                add = add.split(" ", 1)
                code = add[0]
                code = code.replace("+", "")
                number = add[1]
                number = number.replace(" ", "")
                phone = [{"country_code": code, "number": number}]
        except:
            phone = []

        # Web
        try:
            web = Selector(response=page).xpath(
                '//*[@id="ContactsBox"]/table/tbody/tr[3]/td[2]/span').get()
            web = remove_tags(web)
            if "," in web:
                array = web.split(",")
                web = []
                for each in array:
                    each = each.lstrip()
                    each = each.rstrip()
                    web.append(each)
            else:
                web = web.lstrip()
                add = web.rstrip()
                web = [add]
        except:
            web = []

        # Email
        try:
            email = Selector(
                response=page).xpath('//*[@id="TabPanelBox"]').get()
            email = email.replace("*****@*****.**", "")
            email = re.findall(r'[\w\.-]+@[\w\.-]+', email)
        except:
            email = []

        info = {
            "name": name,
            "vat": vat_number,
            "addresses": address,
            "business_hours": business_hours,
            "phones": phone,
            "websites": web,
            "emails": email,
            "foundation_date": foundation_date
        }
        print("Bia Scraped Successfully")
        # print(info)
        return info
    except:
        print("No info")
        return "No info"

    # driver.find_element_by_xpath('//*[@id="Filter_Query"]').send_keys(Keys.RETURN)

    # try:
    #     logo = driver.find_element_by_id('LogoImageUploaderBox').get_attribute("style")
    # except:
    #     logo = ""
    # print(logo)

    # try:
    #     name = driver.find_element_by_id('CompanyNameBox').text
    # except:
    #     name = ""
    # print(name)

    # try:
    #     trademarks = driver.find_element_by_xpath('//*[@id="TrademarksListBox"]/li').text
    # except:
    #     trademarks = ""
    # print(trademarks)

    # try:
    #     legal_form = driver.find_element_by_xpath('//*[@id="tpAboutCompany"]/table/tbody/tr[2]/td[1]/span[2]').text
    # except:
    #     legal_form = ""
    # print(legal_form)

    # try:
    #     registration_number = driver.find_element_by_xpath('//*[@id="tpAboutCompany"]/table/tbody/tr[3]/td[1]/span[2]').text
    # except:
    #     registration_number = ""
    # print(registration_number)

    # try:
    #     registration_authority = driver.find_element_by_xpath('//*[@id="tpAboutCompany"]/table/tbody/tr[4]/td[1]/span[2]').text
    # except:
    #     registration_authority = ""
    # print(registration_authority)

    # try:
    #     status = driver.find_element_by_xpath('//*[@id="tpAboutCompany"]/table/tbody/tr[5]/td[1]/span[2]').text
    # except:
    #     status = ""
    # print(status)

    # try:
    #     brands = driver.find_element_by_xpath('//*[@id="tpAboutCompany"]/table/tbody/tr[1]/td[2]/span[2]').text
    # except:
    #     brands = ""
    # print(brands)

    # try:
    #     vat_number = driver.find_element_by_xpath('//*[@id="tpAboutCompany"]/table/tbody/tr[2]/td[2]/span[2]').text
    # except:
    #     vat_number = ""
    # print(vat_number)

    # try:
    #     registration_date = driver.find_element_by_xpath('//*[@id="tpAboutCompany"]/table/tbody/tr[3]/td[2]/span[2]').text
    # except:
    #     registration_date = ""
    # print(registration_date)

    # try:
    #     legal_address = driver.find_element_by_xpath('//*[@id="tpAboutCompany"]/table/tbody/tr[4]/td[2]/span[2]').text
    # except:
    #     legal_address = ""
    # print(legal_address)

    # try:
    #     working_hours = driver.find_element_by_xpath('//*[@id="tpAboutCompany"]/table/tbody/tr[5]/td[2]/ul/li').text
    # except:
    #     working_hours = ""
    # print(working_hours)

    # try:
    #     phone = driver.find_element_by_xpath('//*[@id="ContactsBox"]/table/tbody/tr[2]/td[2]/span').text
    # except:
    #     phone = ""
    # print(phone)

    # try:
    #     website = driver.find_element_by_xpath('//*[@id="ContactsBox"]/table/tbody/tr[3]/td[2]/span').text
    # except:
    #     website = ""
    # print(website)

    # x = mycol.insert_one({
    #     "Name": name,
    #     "Logo": logo,
    #     "Trademarks": trademarks,
    #     "Legal_Form": legal_form,
    #     "Registration_Number": registration_number,
    #     "Registration_Authority": registration_authority,
    #     "Status": status,
    #     "Brands": brands,
    #     "VAT_Number": vat_number,
    #     "Registration_Date": registration_date,
    #     "Legal_Address": legal_address,
    #     "Working_Hours": working_hours,
    #     "Phone": phone,
    #     "Website": website
    # })

    # driver.find_element_by_xpath('').text


# driver.find_element_by_xpath('').text

# //*[@id="ContactsBox"]/table/tbody/tr[2]/td[2]/span/a
# //*[@id="ContactsBox"]/table/tbody/tr[2]/td[2]/span

示例#32

0

显示文件

文件： app.py 项目： Caravan2/scripts

                    for each in array:
                        each = each.lstrip()
                        each = each.rstrip()
                        web.append(each)
                else:
                    web = web.lstrip()
                    add = web.rstrip()
                    web = [add]
            except:
                web = []

            # Email
            try:
                email = Selector(
                    response=page).xpath('//*[@id="TabPanelBox"]').get()
                email = email.replace("*****@*****.**", "")
                email = re.findall(r'[\w\.-]+@[\w\.-]+', email)
            except:
                email = []

            # Logo
            try:
                logo = Selector(response=page).xpath(
                    '//*[@id="LogoImageUploaderBox"]').get()
                logo = logo.split("url(\'")
                logo = logo[1].split("')")
                logo = logo[0]
            except:
                logo = ""

            info = {

示例#33

0

显示文件

    def parse(self, response):

        productList = Selector(text=response.body).xpath(
            '//li[contains(@class, "gl-item")]').extract()

        # $object = UPLOAD_PATH.$new_path.md5(time().mt_rand(100, 999999999)).
        # '.'.pathinfo($file->getInfo('name'), PATHINFO_EXTENSION);
        # $new_path = 'goods'.date('Y').'/'.date('m-d').'/';

        Class = Selector(text=response.body).xpath(
            '//div[contains(@class, "p-name p-name-type-2")]//em[not(i)]'
        ).extract()
        print(Class)

        for item in productList:
            if self.num > self.getNum:
                break
            name = Selector(text=item).xpath(
                '//div[contains(@class, "p-name")]/a/em').extract()[0]
            name = filterStr.filter_tags(name)
            skuid = Selector(text=item).xpath('//li/@data-sku').extract()[0]
            price = Selector(text=item).xpath(
                '//div[contains(@class, "p-price")]/strong/i').extract()[0]
            price = filterStr.filter_tags(price)
            imgsrc = Selector(text=item).xpath(
                '//li[contains(@class, "gl-item")]//img/@src').extract()[0]
            imgsrc = imgsrc.replace('//', '')

            # 去除京东超市
            # '京东超市金龙鱼 食用油 葵花籽清香型 食用植物调和油5L（新老包装随机发货）'
            name = name.replace("京东超市", "")
            name = name.replace("（京东定制）", "")
            name = name.replace("（京东定制装）", "")
            name = name.replace("京东自营", "")
            name = name.replace("（新老包装随机发货）", "")
            name = name.replace("新旧包装随机配送", "")
            name = name.replace("新老包装随机发放", "")
            name = name.replace("（新老包装随机发放，数量有限，赠完为止）", "")
            name = name.replace("中粮出品", "")
            name = name.replace("（中粮出品）", "")
            if "【沃尔玛】" in name:
                continue
            name = name.replace("【沃尔玛】", "")
            self.item['name'] = name.strip()
            self.item['price'] = price
            self.item['skuid'] = skuid
            # self.item['Class'] = Class
            self.item['imgsrc'] = imgsrc
            self.item['sourceType'] = SOURCE_TYPE_JD
            self.item['goods_id'] = self.insertGoods(self.item)
            self.num = self.num + 1

            yield self.item

示例#34

0

显示文件

def Vacancy(link):
    url = link
    headers = {
        "User-Agent":
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9,ru;q=0.8"
    }
    page = requests.get(url, headers=headers)

    # Company
    try:
        company = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[1]/h4/text()').get()
    except:
        company = ""

    # position
    try:
        position = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[2]/div/div[1]/h4/text()').get()
    except:
        position = ""

    # logo
    try:
        logo = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[1]/img/@src').get()
    except:
        logo = ""

    # Job_type
    try:
        job_type = Selector(response=page).xpath(
            '/html/body/div[3]/div/div[1]/div[2]/div[1]/div[2]/div[1]/div[1]//text()[2]'
        ).get()
        job_type = job_type.strip()
    except:
        job_type = ""

    # Contact Person
    try:
        person = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[2]/div/text()[2]').get(
            )
        person = person.strip()
    except:
        person = ""

    # Email
    try:
        email = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[2]/div/text()[3]').get(
            )
        email = email.strip()
        email = [email]
    except:
        email = []

    # Phone
    try:
        phone = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[2]/div/text()[4]').get(
            )
        phone = phone.strip()
        if "," in phone:
            phones = phone.split(",")
            phone = []
            for each in phones:
                each = each.strip()
                if "+" in each and " " in each:
                    number = each.split(" ",
                                        1)[1].replace('-',
                                                      "").replace(" ", "")
                    country_code = each.split(" ", 1)[0].replace('+', "")
                    phone.append({
                        "country_code": country_code,
                        "number": number
                    })
                elif "+" in each and " " not in each:
                    if "+374" in each:
                        country_code = "374"
                        number = each.replace("+374", "")
                        phone.append({
                            "country_code": country_code,
                            "number": number
                        })
                    elif "+1" in each:
                        country_code = "1"
                        number = each.replace("+1", "")
                        phone.append({
                            "country_code": country_code,
                            "number": number
                        })
                    else:
                        country_code = "374"
                        number = each
                        phone.append({
                            "country_code": country_code,
                            "number": number
                        })
                elif "+" not in each:
                    number = each.replace('-', "").replace(" ", "")
                    country_code = "374"
                    phone.append({
                        "country_code": country_code,
                        "number": number
                    })
        else:
            if "+" in phone and " " in phone:
                number = phone.split(" ", 1)[1].replace('-',
                                                        "").replace(" ", "")
                country_code = phone.split(" ", 1)[0].replace('+', "")
                phone = [{"country_code": country_code, "number": number}]
            elif "+" in phone and " " not in phone:
                if "+374" in phone:
                    country_code = "374"
                    number = phone.replace("+374", "")
                    phone = [{"country_code": country_code, "number": number}]
                elif "+1" in phone:
                    country_code = "1"
                    number = phone.replace("+1", "")
                    phone = [{"country_code": country_code, "number": number}]
                else:
                    country_code = "374"
                    number = phone
                    phone = [{"country_code": country_code, "number": number}]
            elif "+" not in phone:
                number = phone.replace('-', "").replace(" ", "")
                country_code = "374"
                phone = [{"country_code": country_code, "number": number}]

    except Exception as e:
        phone = []

    # Website
    try:
        website = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[2]/div/text()[5]').get(
            )
        website = website.strip()
        if "not" in website:
            website = []
        else:
            website = [website]
    except:
        website = []

    # Published
    try:
        published = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/text()[2]').get()
        published = published.strip()
        publish_day = int(published.split("-")[2])
        publish_month = int(published.split("-")[1])
        publish_year = int(published.split("-")[0])
    except:
        publish_day = 0
        publish_month = 0
        publish_year = 0

    # Ends
    try:
        ends = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/text()[5]').get()
        ends = ends.strip()
        deadline_day = int(ends.split("-")[2])
        deadline_month = int(ends.split("-")[1])
        deadline_year = int(ends.split("-")[0])
    except:
        deadline_day = 0
        deadline_month = 0
        deadline_year = 0

    # Career Level
    try:
        career_level = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/span[1]/text()').get(
            )
        if career_level == None:
            career_level = ""
    except:
        career_level = ""

    # Education
    try:
        education = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/span[2]/text()').get(
            )
        if education == None:
            education = ""
    except:
        education = ""

    # Experience
    try:
        experience = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/span[3]/text()').get(
            )
        if experience == None:
            experience = ""
    except:
        experience = ""

    # Salary
    try:
        salary = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/strong/text()').get(
            )
        if "-" in salary:
            salary = salary.split("-")
            min_salary = salary[0].strip()
            min_salary = int(min_salary.replace(".", ""))
            max_salary = salary[1].strip()
            max_salary = int(max_salary.replace('.', ""))
        elif "-" not in salary and salary != "N/A":
            min_salary = int(salary.replace("."))
            max_salary = int(salary.replace("."))
        else:
            min_salary = 0
            max_salary = 0
    except:
        min_salary = 0
        max_salary = 0

    # Vacancy Description
    try:
        v_description = Selector(
            response=page).xpath('//*[@id="loyal"]/div[2]/div/div[1]').get()
        v_description = remove_tags(v_description).strip()
        v_description = v_description.replace('\xa0', " ")
    except:
        v_description = ""
    try:
        if detect(v_description) == "et":
            try:
                v_description_en = Translate(v_description)
            except:
                v_description_en = " "
            v_description_am = v_description
        else:
            v_description_en = v_description
            v_description_am = ""
    except:
        v_description_am = ""
        v_description_en = ""

    # Company Description
    try:
        c_description = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[1]/p/text()').get()
        c_description = c_description.strip()
    except:
        c_description = ""
    try:
        if detect(c_description) == "et":
            try:
                c_description_en = Translate(c_description)
            except:
                c_description_en = " "
            c_description_am = c_description
        else:
            c_description_en = c_description
            c_description_am = ""
    except:
        c_description_am = ""
        c_description_en = ""
# c_descrip ; //*[@id="loyal"]/div[1]/div[2]/div[2]/div[1]/p/text()

    data = {
        "company": company,
        "position": position,
        "logo": logo,
        "person": person,
        "job_type": job_type,
        "email": email,
        "phone": phone,
        "website": website,
        "publish_day": publish_day,
        "publish_month": publish_month,
        "publish_year": publish_year,
        "deadline_day": deadline_day,
        "deadline_month": deadline_month,
        "deadline_year": deadline_year,
        "career_level": career_level,
        "education": education,
        "experience": experience,
        "min_salary": min_salary,
        "max_salary": max_salary,
        "v_description_am": v_description_am,
        "v_description_en": v_description_en,
        "c_description_am": c_description_am,
        "c_description_en": c_description_en,
    }

    print(data)
    return data


# Vacancy("https://rezume.am/job/2184")