def paragraphs(self):
    """Return the cleaned text of every <p> tag in the soup.

    Only usable in debugging mode; otherwise a warning is issued and
    None is returned. Each matched tag's string is replaced with its
    cleaned text before being collected.
    """
    if not self.debugging:
        warnings.warn('Debugging mode has to be True when call the class')
        return None
    collected = []
    for tag in self.soup.find_all(name='p'):
        cleaned = tl.convert_to_text(tag.get_text())
        if cleaned:
            # Normalize the tag in place, then record its (now clean) text.
            tag.string = cleaned
            collected.append(tag.get_text())
    return collected
def get_keywords(self, rules):
    """Collect the text of every tag matched by *rules*.

    Each matched tag is removed from the soup after its text is taken.
    Returns a list of cleaned keyword strings.
    """
    found = []
    for rule in rules:
        for tag in self.soup.find_all(**rule):
            found.append(tl.convert_to_text(tag.get_text()))
            tag.extract()
    return found
def get_first_title(self, rules):
    """Return the text of the first tag matched by any rule, or None.

    The matched tag is removed from the soup. Rules are tried in order;
    only the very first match is used.
    """
    for rule in rules:
        matches = self.soup.find_all(**rule)
        if matches:
            first = matches[0]
            text = tl.convert_to_text(first.get_text())
            first.extract()
            return text
    return None
def get(self, rules):
    """Extract the text of every tag matched by the given rules.

    Matched tags are removed from the soup as they are consumed.
    Returns the cleaned texts in match order.
    """
    texts = []
    for rule in rules:
        for match in self.soup.find_all(**rule):
            texts.append(tl.convert_to_text(match.get_text()))
            match.extract()
    return texts
def headings(self):
    """Return the cleaned text of every <h1>..<h6> tag.

    Only usable in debugging mode; otherwise warns and returns None.
    """
    if not self.debugging:
        warnings.warn('Debugging mode has to be True when call the class')
        return None
    heading_tags = self.soup.find_all(name=re.compile('^h[1-6]$'))
    return [tl.convert_to_text(tag.get_text()) for tag in heading_tags]
def _deal_default(self):
    """Fallback content handler.

    Counts the <p> descendants of the current element, cleans its text
    (dropping newlines not preceded by a period), and appends the result
    to the current section — creating a placeholder section first when
    none exists yet.
    """
    ParserSections.number_paragraphs += len(self.content.find_all('p'))
    text = tl.convert_to_text(
        re.sub('(?<!\.)\\n', '', self.content.get_text()))
    if self.content_section is None:
        self._create_section()
        warnings.warn(" Section with no name - _deal_default " +
                      "the name was defined as no_name_section")
    if text != '':
        self.content_section[self.i]['content'].append(text)
def _deal_para(self):
    """Handle a paragraph element.

    Cleans the element's text (dropping newlines not preceded by a
    period) and appends it to the current section's content, creating a
    placeholder section first when none exists yet. Also increments the
    class-wide paragraph counter.

    Bug fix: the previous guard ``txt_paragraph != '' or txt_paragraph
    is None`` was always true for ``None`` (``None != ''`` holds), so
    ``None`` values could be appended to the section content. Only
    non-empty text is stored now.
    """
    if self.content_section is None:
        self._create_section()
        warnings.warn(" Section with no name - deal_para " +
                      "the name was defined as no_name_section")
    ParserSections.number_paragraphs += 1
    txt_paragraph = tl.convert_to_text(
        re.sub('(?<!\.)\\n', '', self.content.get_text()))
    # Store only real, non-empty text (skips both '' and None).
    if txt_paragraph:
        self.content_section[self.i]['content'].append(txt_paragraph)
def span(self):
    """Return the text of span/p-like tags, headings removed first.

    Only usable in debugging mode; otherwise warns and returns None.
    Works on a shallow copy of the soup so the original is untouched by
    the heading removal, then repeatedly consumes the first remaining
    tag whose name matches the 'span|p' pattern.

    NOTE(review): the 'span|p' pattern is unanchored, so any tag name
    containing 'span' or 'p' matches — presumably intentional; confirm.
    """
    import copy
    if not self.debugging:
        warnings.warn('Debugging mode has to be True when call the class')
        return None
    working = copy.copy(self.soup)
    # Strip headings so they never show up as paragraph text.
    for heading in working.find_all(name=re.compile('^h[1-6]$')):
        heading.extract()
    texts = []
    while True:
        hit = working.find_all(name=re.compile('span|p'), limit=1)
        if not hit:
            break
        tag = hit[0]
        cleaned = tl.convert_to_text(tag.get_text())
        if tag.name is not None and len(cleaned) != 0:
            texts.append(cleaned)
        tag.extract()
    return texts
def __init__(self, soup, parameters, debugging=False, parser_type='lxml', new=False):
    """Parse *soup* into named sections plus a trailing lost-content bucket.

    Re-parses the given soup with *parser_type*, walks the elements
    matched by ``find_all(**parameters)``, creating one section dict
    (``type``/``name``/``content``) per element whose name contains
    'section_h', and dispatching each child to ``_create_sub_division``
    or ``_deal``. Any text still left in the soup afterwards is gathered
    into a final 'lost_content' section.

    NOTE(review): this file was recovered from a flattened dump; the
    exact nesting of the contents loop relative to the section-heading
    check is reconstructed — verify against the original history.
    NOTE(review): assigning ``self.paragraphs = list()`` shadows any
    ``paragraphs`` method on the same class — confirm these belong to
    different classes.
    """
    # parser_types = ['xml.parser', 'lxml', 'xml5lib', 'lxml-xml']
    self.parser_type = parser_type
    # Re-parse from repr() so we work on a private copy of the markup.
    self.soup = bs4.BeautifulSoup(repr(soup), parser_type)
    self.soup1 = list(self.soup.children)
    self.new = new
    if len(self.soup1) != 1:
        # A well-formed input is expected to have exactly one root child.
        #self.save_soup_to_file('some_thing_wrong_children.xml')
        warnings.warn(' Something is wrong in children!=1')
        exit()
    self.soup1 = self.soup1[0]
    self.parameters = parameters
    if debugging:
        self.save_soup_to_file('ParseXML_initial.xml', prettify=True)
    self.sub_section_name = 'section_h'
    self.paragraphs = list()
    self.content_section = []
    self.data = list()
    # self.i indexes the section currently being filled by _deal/_deal_para.
    self.i = 0
    for i, item in enumerate(self.soup1.find_all(**parameters)):
        if item.name is not None:
            if self.sub_section_name in item.name:
                # New top-level section: record its tag type and title text.
                self.content_section.append({
                    'type': item.name,
                    'name': re.sub('(?<!\.)\\n', '',
                                   item.section_title.get_text()),
                    'content': []
                })
            for content in item.contents:
                self.content = content
                if self.content.name is not None:  # skip bare strings
                    if self.sub_section_name in self.content.name:
                        # Nested <section_h#>: recurse into a sub-division.
                        self._create_sub_division()
                        content.extract()
                    else:
                        # Any other element: generic content handler.
                        self._deal()
                        content.extract()
            self.i += 1
    self.data = self.content_section
    # Sweep up any text the section walk did not consume.
    lost_section = {
        'type': 'lost_content',
        'name': 'lost_content',
        'content': []
    }
    save_lost = False
    tags_lost = self.soup.find_all()
    for tag in tags_lost:
        text1 = tl.convert_to_text(re.sub('(?<!\.)\\n', '', tag.get_text()))
        if len(text1) > 0:
            save_lost = True
            lost_section['content'].append(text1)
        tag.extract()
    # Also check the (now mostly emptied) root for stray text.
    text1 = tl.convert_to_text(
        re.sub('(?<!\.)\\n', '', self.soup1.get_text()))
    if len(text1) > 0:
        save_lost = True
        lost_section['content'].append(text1)
    if save_lost:
        self.data.append(lost_section)