def parse_modal_infor(self,response):
    """Turn the ``spec`` HTML fragment of a JSON response into a dict.

    The response body is parsed as JSON and its ``'spec'`` entry is used as
    the HTML source.  ``<li><label>...</label></li>`` rows are removed first.
    Keys come from the fragment with ``div`` elements dropped, values from
    the fragment with ``span`` elements dropped; in both cases the remaining
    tags are replaced by ``|`` and the text is split on runs of pipes.  The
    paired keys/values are stored under ``'data'`` on the item taken from
    ``response.meta`` and that item is yielded.
    """
    raw_body = response.text
    item = response.meta.get("item")
    spec_html = json.loads(raw_body)['spec']

    header_pattern = r'<li><label>([a-zA-Z_ÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚĂĐĨŨƠàáâãèéêìíòóôõùúăđĩũơƯĂẠẢẤẦẨẪẬẮẰẲẴẶẸẺẼỀỀỂưăạảấầẩẫậắằẳẵặẹẻẽềếểỄỆỈỊỌỎỐỒỔỖỘỚỜỞỠỢỤỦỨỪễệỉịọỏốồổỗộớờởỡợụủứừỬỮỰỲỴÝỶỸửữựỳýỵỷỹ\s\.&,.-])+<\/label><\/li>'
    body_html = re.sub(header_pattern, "", spec_html)

    # Keys: drop the <div> elements, turn every remaining tag into a pipe,
    # then split on the four-pipe separator and clean leftover double pipes.
    keys_markup = remove_tags_with_content(body_html, which_ones=('div',))
    keys_piped = replace_tags(keys_markup, '|', 'utf-8')
    keys = [chunk.replace("||", "") for chunk in keys_piped.split("||||")]

    # Values: drop the <span> elements, unwrap <a>/<li>, turn every
    # remaining tag into a pipe, then split on double pipes.
    values_markup = remove_tags_with_content(body_html, which_ones=('span',))
    values_markup = remove_tags(values_markup, which_ones=('a', 'li',))
    values_piped = replace_tags(values_markup, '|', 'utf-8')
    values = [chunk.replace("|", "") for chunk in values_piped.split("||")]

    item['data'] = dict(zip(keys, values))
    yield item
Пример #2
0
 def test_replace_tags(self):
     """Tags are dropped by default, or swapped for the given token."""
     without_tags = replace_tags("This text contains <a>some tag</a>")
     self.assertEqual(without_tags, "This text contains some tag")

     # bytes input with an explicit replacement token
     with_spaces = replace_tags(b"This text is very im<b>port</b>ant", " ")
     self.assertEqual(with_spaces, "This text is very im port ant")
Пример #3
0
    def test_replace_tags(self):
        """Check the return type, default removal, token replacement and
        multiline tags for ``replace_tags``."""
        # make sure it always returns unicode; assertIsInstance (unlike a
        # bare assert) still runs under ``python -O`` and reports the type
        self.assertIsInstance(replace_tags("no entities"), unicode)

        self.assertEqual(replace_tags(u"This text contains <a>some tag</a>"), u"This text contains some tag")

        self.assertEqual(replace_tags("This text is very im<b>port</b>ant", " "), u"This text is very im port ant")

        # multiline tags
        self.assertEqual(replace_tags('Click <a class="one"\r\n href="url">here</a>'), u"Click here")
Пример #4
0
    def test_replace_tags(self):
        """Check the return type, default removal, token replacement and
        multiline tags for ``replace_tags``."""
        # make sure it always returns unicode; assertIsInstance (unlike a
        # bare assert) still runs under ``python -O`` and reports the type
        self.assertIsInstance(replace_tags('no entities'), unicode)

        self.assertEqual(replace_tags(u'This text contains <a>some tag</a>'),
                         u'This text contains some tag')

        self.assertEqual(
            replace_tags('This text is very im<b>port</b>ant', ' '),
            u'This text is very im port ant')

        # multiline tags
        self.assertEqual(
            replace_tags('Click <a class="one"\r\n href="url">here</a>'),
            u'Click here')
Пример #5
0
def normalize_web_content(x, keep=('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'strong'),
                          token='____SECTION____'):
    """Clean web content and yield its non-empty sections.

    Parameters
    ----------
    x
        Content to normalize.
        NOTE(review): presumably an HTML string -- confirm against callers.
    keep : tuple
        HTML tags to keep.
    token : str or None
        Token to use for replacing kept HTML tags.
        No replacement is done when the token is falsy (e.g. `None`).

    Yields
    ------
    The non-empty parts obtained by splitting the cleaned content with
    `_rx_web_sectionize`.
    """
    steps = [
        strip_html5_whitespace,
        remove_comments,
        lambda text: remove_tags(text, keep=keep),
    ]
    if token:
        steps.append(lambda text: replace_tags(text, token=token))
    steps.extend([replace_entities, replace_escape_chars])

    # Apply the steps one at a time so that, when one of them fails, `x`
    # keeps the result of the last step that succeeded.
    try:
        for step in steps:
            x = step(x)
    except (TypeError, AttributeError):
        pass

    for section in _rx_web_sectionize.split(x):
        if not section:
            continue
        yield section
Пример #6
0
    def parse(self, response):
        """Yield one dict per ``<article>`` on the page, then follow the
        "next" pagination link if there is one.

        Each yielded dict has the keys ``Title``, ``Published On``,
        ``Content`` and ``Link``.  Articles whose class contains
        ``category-video`` take their content from the first paragraph of
        ``.entry-content``; all others use the whole ``entry-content``
        element with tags replaced by two spaces and the "(more…)" /
        "Read More" markers removed.
        """
        for article in response.css('article'):
            title = article.css('.entry-title > a::text').extract_first()
            date_published = article.css(
                '.entry-date.published::text').extract_first()
            link = article.xpath(
                './/*[@class="more-link"]/@href').extract_first()

            # .get() so that an <article> without a class attribute is
            # treated as a regular (non-video) article instead of raising.
            if 'category-video' in article.attrib.get('class', ''):
                content_dirty = article.css('.entry-content p').extract_first()
                # extract_first() returns None when nothing matched; fall
                # back to an empty string so the cleaners get text.
                content = remove_tags(content_dirty or '').replace(
                    '|', ' | ').replace('\n', ' ')
            else:
                content_dirty = article.xpath(
                    './/*[@class="entry-content"]').extract_first()
                content = replace_tags(
                    replace_escape_chars(content_dirty or ''),
                    '  ').replace('(more…)', '').replace(
                        'Read More', '').strip()

            yield {
                'Title': title,
                'Published On': date_published,
                'Content': content,
                'Link': link,
            }

        next_page = response.xpath(
            './/a[contains(@class, "next")]/@href').extract_first()
        # The last page has no "next" link; Request(None) would raise.
        if next_page:
            # urljoin leaves absolute URLs untouched and resolves relative
            # hrefs against the current page.
            yield Request(response.urljoin(next_page))
    def process_item(self, item, spider):
        """Print a human-readable summary of ``item`` and pass it on.

        Reads ``user_name``, ``user_gender``, ``user_age``, ``god_comment``
        and ``content`` from the item, strips HTML tags from the content and
        prints the result to stdout.

        Returns the unchanged ``item`` so that any pipeline running after
        this one still receives it (previously ``None`` was returned).
        """
        user_name = item.get('user_name', '')
        user_gender = item.get('user_gender', '')
        user_age = item.get('user_age', 0)
        god_comment = item.get('god_comment', '')
        content = item.get('content', '')

        # Change how the gender is displayed: 'M' -> male, anything else
        # (including a missing value) -> female.
        user_gender = '男' if user_gender == 'M' else '女'
        # Strip HTML tags, turning each tag into a newline.
        content = replace_tags(content, token='\n').strip()

        print('#' * 30)
        if user_name != '匿名用户':
            print('{} {} {}:'.format(user_name, user_gender, user_age))
        else:
            print('匿名用户:')
        print('正文: {}'.format(content))
        if god_comment != '':
            print('神评: {}'.format(god_comment))
        print('#' * 30 + '\n')

        return item
Пример #8
0
    def _parse_rooms_by_js(self, rooms_in_script):
        """Convert the room entries found in the page script into room dicts.

        Parameters
        ----------
        rooms_in_script : iterable of dict
            Room entries; the keys read here are ``additionalInfo``,
            ``bedTypeAndOccupancy``, ``images``, ``maxOccupancy``, ``name``
            and ``ratePlans``.

        Returns
        -------
        list of dict
            One dict per room with the keys ``name``, ``size``,
            ``occupancy``, ``bed_types``, ``bed_metrics``, ``bed_types_str``,
            ``bed_extra_types``, ``description``, ``amenities``, ``images``
            and ``room_type_code``.
        """
        strong_open = '<strong>'
        rooms = []
        for room_item in rooms_in_script:
            additional_info = room_item.get('additionalInfo', {})
            # Read the description once with .get() so that a room without
            # one no longer raises KeyError.
            raw_description = additional_info.get('description', '')

            # The size is the text of a leading <strong>...</strong> element
            # in the description, when there is one.
            size = ''
            if raw_description.startswith(strong_open):
                size = raw_description.split('</strong>', 1)[0]
                size = size[len(strong_open):]

            bed_type_and_occupancy = room_item.get('bedTypeAndOccupancy', {})  # noqa
            bed_types = bed_type_and_occupancy.get('bedTypes', [])
            bed_extra_types = bed_type_and_occupancy.get('extraBeds', [])
            images = [
                {'type': image.get('caption'), 'url': image.get('fullSizeUrl')}
                for image in room_item.get('images', [])
            ]

            # Keep only <br> tags, turn them into newlines and drop the
            # empty / non-breaking-space-only lines.
            description_lines = replace_tags(
                remove_tags(raw_description, keep=('br',)),
                '\n'
            ).split('\n')
            description = [
                line for line in description_lines
                if line != '' and line != '&nbsp;'
            ]

            occupancy = room_item.get('maxOccupancy')
            room = {
                'name': remove_tags(room_item['name']),
                # `size` is a string here; the former `size[0]` (a leftover
                # from when it was a list) stored only its first character.
                'size': size or None,
                'occupancy': ''.join([occupancy.get('messageTotal'), occupancy.get('messageChildren')]),   # noqa
                'bed_types': bed_types,
                'bed_metrics': bed_type_and_occupancy.get('bedTypesTooltipMessage'),   # noqa
                'bed_types_str': bed_type_and_occupancy.get('localisedName'),
                'bed_extra_types': bed_extra_types,
                'description': description,
                'amenities': list(map(remove_tags, additional_info.get('details', {}).get('amenities', []))),  # noqa
                'images': images,
                'room_type_code': room_item['ratePlans'][0]['payment']['book']['bookingParamsMixedRatePlan']['roomTypeCode']  # noqa
            }

            rooms.append(room)
        return rooms
Пример #9
0
 def on_update(self, status):
     """A new status has appeared! 'status' is the parsed JSON dictionary
     describing the status.

     For statuses whose ``language`` is one of the accepted codes, the HTML
     content is reduced to plain text, printed, and sent as UTF-8 to
     ``("localhost", 9009)`` through ``tcp_connection``.  Any ordinary error
     is reported on stdout instead of being raised; KeyboardInterrupt and
     SystemExit are allowed to propagate.
     """
     try:
         if status['language'] in ['en', 'fr', 'None', 'es', 'de']:
             toot_text = replace_entities(replace_tags(status['content']))
             print("Toot Text: " + toot_text)
             print("------------------------------------------")
             message = toot_text + '\n'
             # NOTE(review): tcp_connection is not defined in this method;
             # presumably a module-level socket -- confirm.
             tcp_connection.sendto(message.encode('utf-8'),("localhost", 9009))
     except Exception as exc:
         # `Exception` rather than a bare except, so Ctrl-C / interpreter
         # shutdown are not swallowed.  Prints the exception class, as before.
         print("Error: %s" % type(exc))
Пример #10
0
    def on_update(self, status):
        """A new status has appeared! 'status' is the parsed JSON dictionary
        describing the status.

        For statuses whose ``language`` is one of the accepted codes, the
        HTML content is reduced to plain text and printed together with its
        space-separated words that are not English, French, Spanish or
        German stop words.  Any ordinary error is reported on stdout instead
        of being raised; KeyboardInterrupt and SystemExit are allowed to
        propagate.
        """
        try:
            if status['language'] in ['en', 'fr', 'None', 'es', 'de']:
                toot_text = replace_entities(replace_tags(status['content']))
                # A set gives constant-time membership tests for each word.
                stopwords_combined = set(
                    stopwords.words('english') + stopwords.words('french')
                    + stopwords.words('spanish') + stopwords.words('german')
                )
                wordsc = [w for w in toot_text.split(" ") if w.lower() not in stopwords_combined]
                print("Toot Text: " + toot_text)
                print(wordsc)
                print("------------------------------------------")
        except Exception as exc:
            # `Exception` rather than a bare except, so Ctrl-C / interpreter
            # shutdown are not swallowed.  Prints the exception class, as
            # before.
            print("Error: %s" % type(exc))
Пример #11
0
 def format_bedsize(self, bedsize):
     """Return ``bedsize`` with every tag replaced by a newline, or an
     empty string when ``bedsize`` is falsy."""
     return replace_tags(bedsize, '\n') if bedsize else ''
Пример #12
0
 def test_replace_tags_multiline(self):
     """A tag whose attributes span two lines is still removed."""
     markup = b'Click <a class="one"\r\n href="url">here</a>'
     self.assertEqual(replace_tags(markup), u'Click here')
Пример #13
0
 def test_replace_tags(self):
     """Tags are dropped by default, or swapped for the given token."""
     without_tags = replace_tags(u'This text contains <a>some tag</a>')
     self.assertEqual(without_tags, u'This text contains some tag')

     # bytes input with an explicit replacement token
     with_spaces = replace_tags(b'This text is very im<b>port</b>ant', ' ')
     self.assertEqual(with_spaces, u'This text is very im port ant')
Пример #14
0
 def test_returns_unicode(self):
     """``replace_tags`` returns text for both bytes and str input."""
     # make sure it always returns unicode
     for raw in (b'no entities', 'no entities'):
         assert isinstance(replace_tags(raw), six.text_type)
Пример #15
0
 def format_bedsize(self, bedsize):
     """Return ``bedsize`` with every tag replaced by a newline, or an
     empty string when ``bedsize`` is falsy."""
     return replace_tags(bedsize, "\n") if bedsize else ""
Пример #16
0
 def process_value(self, value):
     """Strip the tags from ``value`` and collapse every run of whitespace
     into a single space."""
     words = replace_tags(value).strip().split()
     return " ".join(words)
Пример #17
0
 def test_replace_tags_multiline(self):
     """A tag whose attributes span two lines is still removed."""
     markup = b'Click <a class="one"\r\n href="url">here</a>'
     self.assertEqual(replace_tags(markup), 'Click here')
Пример #18
0
 def test_returns_unicode(self):
     """``replace_tags`` returns ``str`` for both bytes and str input."""
     # make sure it always returns unicode
     for raw in (b'no entities', 'no entities'):
         assert isinstance(replace_tags(raw), str)
Пример #19
0
 def process_value(self, value):
     """Strip the tags from ``value`` and collapse every run of whitespace
     into a single space."""
     words = replace_tags(value).strip().split()
     return " ".join(words)
Пример #20
0
    def _parse(self):  # pylint: disable=R0912,R0915
        """Fill ``self._order`` with the fields extracted from ``self._etree``.

        Every field is located with an XPath expression from ``marriott_xp``.
        A missing field is reported through ``self.logger`` (error or
        warning) and skipped, so the returned dict only holds the fields
        that were found.

        Returns
        -------
        The ``self._order`` mapping, updated in place.
        """
        result = self._order
        hotel_name = strip_str(take_first(self._etree, marriott_xp.HOTEL_NAME))
        if hotel_name:
            result['hotel_name'] = hotel_name
        else:
            self.logger.error('hotel_name is empty %s', self._message_id)

        address = strip_str(take_first(self._etree, marriott_xp.ADDRESS))
        if address:
            result['address'] = address
        else:
            self.logger.error('address is empty %s', self._message_id)

        tel = take_first(self._etree, marriott_xp.PHONE)
        if tel:
            result['telephone'] = tel
        else:
            self.logger.error('telephone is empty %s', self._message_id)

        confirm_num = take_first(self._etree, marriott_xp.CONFIRM_NUM)
        if confirm_num:
            # Keep only the part after the last ": ".
            confirm_num = confirm_num.split(': ')[-1]
            result['confirm_code'] = confirm_num
        else:
            self.logger.error('confirm_number is empty %s', self._message_id)

        guest = take_first(self._etree, marriott_xp.GUEST)
        if guest:
            # Keep only the part after the last "For ".
            guest = guest.split('For ')[-1]
            result['guest_name'] = guest
        else:
            self.logger.error('guest_name is empty %s', self._message_id)

        check_in_out_time = strip_list(
            self._etree.xpath(marriott_xp.CHECK_IN_OUT_TIME))
        if len(check_in_out_time) == 2:
            result['check_in_time'] = check_in_out_time[0]
            result['check_out_time'] = check_in_out_time[1]
        else:
            self.logger.error('check_in_date and check_out_time is empty %s',
                              self._message_id)
        check_in_out_date = strip_list(
            self._etree.xpath(marriott_xp.CHECK_IN_OUT_DATE))
        check_in_date, check_out_date = unpack(check_in_out_date)
        if check_in_date and check_out_date:
            result['check_in_date'] = check_in_date
            result['check_out_date'] = check_out_date
            tz = to_timezone(address)
            check_in_date_formatted = \
                DateTime(check_in_date, 'MMMM DD, YYYY').tz_to_datetime(tz)
            if check_in_date_formatted:
                result['check_in_date_formatted'] = check_in_date_formatted
            else:
                self.logger.error('check_in_date_formatted is empty %s',
                                  self._message_id)
            check_out_date_formatted = \
                DateTime(check_out_date, 'MMMM DD, YYYY').tz_to_datetime(tz)
            if check_out_date_formatted:
                result['check_out_date_formatted'] = check_out_date_formatted
            else:
                self.logger.error('check_out_date_formatted is empty %s',
                                  self._message_id)
        else:
            self.logger.error('check_in_date and check_out_date is empty %s',
                              self._message_id)

        related_links = self._etree.xpath(marriott_xp.RELATED_LINK)
        related_text = self._etree.xpath(marriott_xp.RELATED_TEXT)
        related_links = to_dict(related_text, related_links)
        if related_links:
            related = []
            # Collect the links to keep in a new list.  Removing entries
            # from `related_links` while iterating over it made the loop
            # skip the entry following each removed one.
            remaining_links = []
            for i in related_links:
                link_name = i.get('name') or ''
                if 'Hotel Website' in link_name:
                    result['hotel_link'] = i.get('value')
                elif 'Map & Directions' in link_name:
                    result['map_link'] = i.get('value')
                elif 'Cancel' in link_name:
                    # The cancel link gets its own field and also stays in
                    # the related links, as before.
                    result['cancellation_link'] = i.get('value')
                    remaining_links.append(i)
                else:
                    related.append(i)
                    remaining_links.append(i)
            # Only stored when at least one link is not hotel/map/cancel.
            if related:
                result['related_links'] = remaining_links
        else:
            self.logger.error('related_links is empty %s', self._message_id)

        room_type = take_first(self._etree, marriott_xp.ROOM_TYPE)
        room_type_value = take_first(self._etree, marriott_xp.ROOM_TYPE_VALUE)
        if room_type and room_type_value:
            result['room_type'] = room_type_value
        else:
            self.logger.error('room_type is empty %s', self._message_id)
        room_num_guest = strip_list(
            self._etree.xpath(marriott_xp.ROOM_NUM_GUEST))
        room_num_guest_name, room_num_guest_value = group(room_num_guest)
        if room_num_guest_name and room_num_guest_value:
            for i, j in zip(room_num_guest_name, room_num_guest_value):
                if 'NUMBER OF ROOMS' in i:
                    result['number_of_rooms'] = j
                elif 'GUESTS PER ROOM' in i:
                    continue
                else:
                    self.logger.warning('%s is %s %s', i, j, self._message_id)
        else:
            self.logger.error('room number is empty %s', self._message_id)

        guarantee = strip_list(self._etree.xpath(
            marriott_xp.GUARANTEED_METHOD))
        if guarantee:
            result['guarantee_policies'] = [guarantee[-1]]
        else:
            self.logger.error('guarantee is empty %s', self._message_id)
        price_des = strip_str(
            take_first(self._etree, marriott_xp.CHARGE_DESCRIPTION))
        if price_des:
            result['price_tips'] = [price_des]
        else:
            self.logger.warning('price_description is empty %s',
                                self._message_id)

        notice = strip_list(self._etree.xpath(marriott_xp.HOTEL_ALERT))
        if notice:
            result['notice'] = notice
        else:
            self.logger.error('notice is empty %s', self._message_id)

        rates = strip_list(self._etree.xpath(marriott_xp.RATES))
        if rates:
            rates_type = rates.pop()
            if 'Best Available rate' in rates_type:
                # Every third entry starting at index 1 begins with a night
                # count, every third starting at index 2 with an amount;
                # the stored price is the average per night.
                nights = sum([int(i.split(' ')[0]) for i in rates[1::3]])
                price = sum([float(i.split(' ')[0]) for i in rates[2::3]])
                currency = ' ' + rates[2].split(' ')[-1]
                result['price'] = str(round(price / nights, 2)) + currency
            else:
                result['price'] = rates[-1]
                self.logger.error('price is empty %s', self._message_id)
        else:
            self.logger.error('rates is empty %s', self._message_id)
        taxes = strip_list(self._etree.xpath(marriott_xp.TAXES))
        name, value = unpack(taxes)
        if name and value:
            if 'TAXES & FEES' in name:
                result['taxes_fee'] = taxes[-1]
            else:
                self.logger.error('%s is %s %s', name, value, self._message_id)
        else:
            self.logger.error('taxes is empty %s', self._message_id)
        total = strip_list(self._etree.xpath(marriott_xp.TOTAL))
        name, value = unpack(total)
        if total:
            if 'Total' in name:
                result['total_cost'] = total[-1]
            else:
                self.logger.error('%s is %s %s', name, value, self._message_id)
        else:
            self.logger.error('total_price is empty %s', self._message_id)
        other_charge = strip_list(self._etree.xpath(marriott_xp.OTHER_CHARGE))
        # Drop the bullet characters and the section heading.
        other_charge = [
            i for i in other_charge if i != '\u2022' and i != 'Other Charges'
        ]
        if other_charge:
            result['other_charges'] = other_charge
        else:
            self.logger.warning('other_charge is empty %s', self._message_id)

        cancellation = take_first(self._etree,
                                  marriott_xp.RATE_CANCELLATION_DETAILS)
        if cancellation is not None:
            cancellation = replace_tags(etree.tostring(cancellation))
            cancellation = cancellation.split('&#8226; \n')
            result['cancellation_policies'] = strip_list(cancellation)
        else:
            self.logger.error('cancellation_policy is empty %s',
                              self._message_id)

        rate_guarantee_title = take_first(self._etree,
                                          marriott_xp.RATE_GUARANTEE_TITLE)
        rate_guarantee = strip_list(
            self._etree.xpath(marriott_xp.RATE_GUARANTEE))
        rate_guarantee = [i for i in rate_guarantee if i != '\u2022']
        # `or ''` keeps a missing title from raising TypeError on `in`.
        if rate_guarantee and 'GUARANTEE' in (rate_guarantee_title or ''):
            guarantee = result.get('guarantee_policies')
            if guarantee:
                result['guarantee_policies'].extend(rate_guarantee)
            else:
                result['guarantee_policies'] = rate_guarantee
        else:
            self.logger.error('rate guarantee is empty %s', self._message_id)

        addition_title = take_first(self._etree,
                                    marriott_xp.ADDITION_INFO_TITLE)
        addition_link = strip_list(
            self._etree.xpath(marriott_xp.ADDITION_INFO_LINK))
        addition_text = strip_list(
            self._etree.xpath(marriott_xp.ADDITION_INFO_TEXT))
        # `or ''` keeps a missing title from raising TypeError on `in`.
        if addition_text and 'ADDITIONAL' in (addition_title or ''):
            result['additional_information'] = to_dict(addition_text,
                                                       addition_link)
        else:
            self.logger.error('additional information is empty %s',
                              self._message_id)

        contact_links = self._etree.xpath(marriott_xp.CONTACT_LINK)
        contact_texts = strip_list(self._etree.xpath(marriott_xp.CONTACT_TEXT))
        contact = strip_list(self._etree.xpath(marriott_xp.CONTACT_1))
        contact_1 = [{'name': i, 'value': i} for i in contact]
        contact = chain(to_dict(contact_texts, contact_links), contact_1)
        contact = filter_dict_value(contact)
        if contact:
            result['contact_information'] = contact
        return result
Пример #21
0
 def test_replace_tags(self):
     """Tags are dropped by default, or swapped for the given token."""
     without_tags = replace_tags('This text contains <a>some tag</a>')
     self.assertEqual(without_tags, 'This text contains some tag')

     # bytes input with an explicit replacement token
     with_spaces = replace_tags(b'This text is very im<b>port</b>ant', ' ')
     self.assertEqual(with_spaces, 'This text is very im port ant')
Пример #22
0
def _cleanup(value):
    """Strip tags from ``value``, pass the result through
    ``replace_entities`` and collapse whitespace runs into single spaces."""
    text = replace_entities(replace_tags(value))
    words = text.strip().split()
    return " ".join(words)
Пример #23
0
def get_financial_blob(sel):
    """Return the markup of the ``financial-details-wrapper`` div selected
    from ``sel``, with every tag replaced by a newline."""
    wrapper_query = sel.xpath('//div[@id="financial-details-wrapper"]')
    wrapper_html = wrapper_query.get()
    return replace_tags(wrapper_html, "\n")
Пример #24
0
def _cleanup(value):
    """Strip tags from ``value``, pass the result through
    ``replace_entities`` and collapse whitespace runs into single spaces."""
    text = replace_entities(replace_tags(value))
    words = text.strip().split()
    return " ".join(words)