def _split_entry_text(self, text):
    """Yield ``(title, dates)`` pairs parsed from the main entry text.

    If the text contains newlines, the first line is the title and every
    following line produces its own pair. Otherwise the text is split
    once with ``self.entry_split_re`` into the title and the remainder.

    In both cases only the part preceding the first
    ``self.entry_split_price_re`` match is kept as the dates part, and
    both values are passed through ``clean_whitespace``.
    """
    if '\n' not in text:
        # Single-line entry: title and info share one line.
        heading, remainder = self.entry_split_re.split(text, maxsplit=1)
        info_lines = [remainder]
    else:
        # Multi-line entry: one title, then one info line per result.
        lines = text.split('\n')
        heading = lines[0]
        info_lines = lines[1:]

    for line in info_lines:
        when = self.entry_split_price_re.split(line, maxsplit=1)[0]
        yield clean_whitespace(heading), clean_whitespace(when)
def _extract_entry_text(self, entry):
    """Collect the title and details text belonging to a film entry.

    The entry is given as a STRONG element. A single film entry may be
    spread over several adjacent STRONG elements (a side effect of making
    text bold piece by piece in a WYSIWYG editor), so the text of its
    previous and next siblings is glued around the element's own text.

    Returns a ``(title, details)`` tuple: the title is stripped of
    surrounding whitespace, the details come from the entry's tail text
    and are passed through ``clean_whitespace``.
    """
    before = self._extract_entry_siblings_text(entry, 'previous')
    # Keep line breaks in the element's own text; fall back to '' when
    # there is no text at all.
    own = entry.text_content(whitespace=True) or ''
    after = self._extract_entry_siblings_text(entry, 'next')
    tail = self._extract_entry_tail_text(entry)

    title = (before + own + after).strip()
    return title, clean_whitespace(tail)
def text_content(self, whitespace=False):
    """Return the text content of the element.

    By default the text is passed through ``clean_whitespace``. With
    ``whitespace=True`` the raw text is returned instead, with a newline
    inserted after every ``<br>`` descendant so that line breaks survive
    the text extraction.

    The ``<br>`` tails are modified only for the duration of the call;
    they are restored to their exact original values (``None`` stays
    ``None``) even if the text extraction raises.
    """
    if not whitespace:
        text = super(HTMLElement, self).text_content()
        return clean_whitespace(text)

    # Remember each <br> together with its original tail so the tree can
    # be put back exactly as it was found.
    saved_tails = []
    try:
        for br in self.xpath('.//br'):
            saved_tails.append((br, br.tail))
            # Prefix the tail with a newline; a missing tail becomes '\n'.
            br.tail = '\n' + (br.tail or '')
        return super(HTMLElement, self).text_content()
    finally:
        # Undo the temporary newlines regardless of how we leave the block.
        for br, original_tail in saved_tails:
            br.tail = original_tail
def text_content(self, whitespace=False):
    """Return the text content of the element.

    The text is passed through ``clean_whitespace`` unless ``whitespace``
    is true, in which case it is returned exactly as produced by the
    parent class.
    """
    raw = super(HTMLElement, self).text_content()
    return raw if whitespace else clean_whitespace(raw)