import re
from copy import deepcopy

from lxml import etree
from lxml.etree import _ElementTree, tostring
from lxml.html.clean import Cleaner

# SimpleHTMLCleaner, TextChunk, CSSStyle, HTMLDocument, ParsedHTMLDocument,
# HTMLSequenceWrapperRegion, RRSPublication and ListedDict come from other
# modules of this package and are assumed to be imported alongside.


class HTMLSequenceWrapperRecord(object):

    def __init__(self, element, url, mintextlen=10):
        self.cleaner = SimpleHTMLCleaner()
        self.mintextlen = mintextlen
        self.elem = element
        self.url = url
        # the whole text
        self.text = self.elem.text_content()
        self.text = self.cleaner.clean(self.text)
        self.chunks = []
        self.__extract_chunks(self.elem)

    def has_value(self):
        if not self.cleaner.contains_text(self.text):
            return False
        return len(self.text) > self.mintextlen

    def get_chunks(self):
        return self.chunks

    def get_text(self):
        return self.text

    def _handle_elem(self, elem):
        if elem.text is None:
            return None
        if not self.cleaner.contains_text(elem):
            return None
        # new chunk
        chunk = TextChunk()
        # extracting links
        if elem.get('href') is not None:
            chunk.set_link(elem.get('href'))
        # extracting the 'title' attribute of an anchor
        if elem.tag == 'a' and elem.get('title') is not None:
            chunk.set_comment(elem.get('title'))
        # extracting text
        txt = elem.text_content()
        chunk.set_text(self.cleaner.clean(txt))
        # setting style
        fs = elem.style
        chunk.set_style(fs)
        chunk.set_tag(elem.tag)
        return chunk

    def __extract_chunks(self, elem):
        thischunk = self._handle_elem(elem)
        if thischunk is not None:
            self.chunks.append(thischunk)
        for child in elem.iterchildren():
            self.__extract_chunks(child)

    def __str__(self):
        return "<" + __modulename__ + ".HTMLSequenceWrapperRecord instance " + self.text + " >"
def extract_data(self, etree, url):
    """
    Extract all possible data about the publication from the web page.
    @param etree - parsed DOM tree of the web page
                   (has to be an instance of lxml.etree._ElementTree)
    @param url - url of the web page
    @return RRSPublication object containing the extracted data
    """
    assert isinstance(url, basestring)
    assert isinstance(etree, _ElementTree)
    #c = Cleaner(scripts=True, javascript=True, comments=True, style=False,
    #            meta=False, page_structure=False, processing_instructions=True,
    #            embedded=True, frames=False, forms=True, annoying_tags=False,
    #            add_nofollow=False, remove_unknown_tags=False)
    #etree = c.clean_html(etree)
    self.url = url
    self.domain = re.sub("http://(www)?", "", self.url).split(".")[0]
    self._storage = {}
    self._publ = RRSPublication()
    cleaned_etree = SimpleHTMLCleaner.clean_html(etree)
    page = HTMLDocument(cleaned_etree, url)
    self.pagetext = page.get_etree().getroot().text_content()
    # parse CSS and metadata on the page
    page.parse_document()
    # get data from <meta> tags and convert them to the RRS format
    self._parse_meta(page)
    # get data on the basis of text visibility and recognized headers
    self._parse_visibility(page)
    # and now guess :)
    self._find_unbound_entities(page)
    # and parse BibTeX
    self._parse_bibtex(page)
    return self._publ
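# Hedged usage sketch for extract_data(). The enclosing extractor class is
# not part of this listing, so the `extractor` instance below stands in for
# it (hypothetical); the rest follows from the method's own asserts, since
# lxml.html.parse() yields the required lxml.etree._ElementTree:
#
#     from lxml import html
#     url = "http://example.org/paper.html"     # hypothetical URL
#     tree = html.parse(url)
#     publ = extractor.extract_data(tree, url)  # -> RRSPublication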
def _find_abstract(self, etree):
    c = Cleaner(scripts=True, javascript=True, comments=True, style=True,
                meta=True, page_structure=False, processing_instructions=True,
                embedded=True, frames=False, forms=True, annoying_tags=True,
                add_nofollow=False, remove_unknown_tags=False)
    etree_copy = deepcopy(etree)
    etree_copy = c.clean_html(etree_copy)
    html = tostring(etree_copy.getroot())
    # XXX it is probably useful to delete all <p> tags here...
    html = re.sub("</?p[^>]*>", " ", html)
    possible = []
    txts = re.findall("(?<=\>)[^>]+(?=\<)", html, re.U)
    for txt in txts:
        txt = SimpleHTMLCleaner.clean(txt)
        if len(txt) > 200:
            do_not_append = False
            for bl in self._abstract_blacklist:
                if txt.startswith(bl):
                    do_not_append = True
                    break
            if not do_not_append:
                possible.append(txt)
            continue
        for st in self._abstract_startswith:
            if txt.startswith(st):
                possible.append(txt)
                break
    return self._get_longest_string(possible)
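# Hedged, self-contained sketch of the harvesting step in _find_abstract():
# strip <p> tags, then pull the inter-tag text with the same lookaround
# regex. The sample markup and the helper name are illustrative only.
def _demo_intertag_text():
    sample = "<div><p>First paragraph.</p><p>Second one.</p></div>"
    sample = re.sub("</?p[^>]*>", " ", sample)  # delete all <p> tags, as above
    return re.findall("(?<=\>)[^>]+(?=\<)", sample, re.U)
    # -> [' First paragraph.  Second one. ']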
def generalize(self, term):
    # preprocessing
    term = term.lower()
    term = re.sub("[\"\'0-9]+", "", term)
    term = re.sub("[_:\-\.\,]+", " ", term)
    term = SimpleHTMLCleaner.clean(term)
    # if it is a lemma, it's OK
    if self.is_lemma(term):
        return term
    # if it isn't a lemma, do a lookup
    try:
        index = self.term2lemma[term]
        return self.lemmas[index]
    except KeyError:
        return None
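# Hedged, self-contained sketch of the preprocessing inside generalize():
# lowercase, drop quotes/digits, turn separators into spaces. The final
# join/split stands in for SimpleHTMLCleaner.clean(), and the helper name
# is illustrative only; the lemma lookup itself needs the enclosing class.
def _demo_generalize_preprocessing(term='"Key-words:" 2004'):
    term = term.lower()
    term = re.sub("[\"\'0-9]+", "", term)    # drop quotes and digits
    term = re.sub("[_:\-\.\,]+", " ", term)  # separators -> spaces
    return " ".join(term.split())            # -> 'key words'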
class HTMLSequenceWrapper(object):
    """
    HTMLSequenceWrapper is an intelligent system for pattern and repeating
    sequence recognition on web pages. The input of this algorithm is an
    element tree object (lxml.etree._ElementTree) and the output is an
    instance of ParsedHTMLDocument. The sequence wrapper parses the element
    tree to get the most valuable repeating sequence, which is supposed to
    be a data record. It also finds regions.
    """
    # list of important terms used to recognize the menu
    _menu = ('[CK]onta[ck]t', 'Publi[ck]', 'Blog', 'Links', 'About', 'Home',
             'News?', 'Event', 'Research', 'Index', 'FAQ', 'People',
             'Overview', 'Profile', 'Community', 'Download')

    # this list probably shouldn't be here, but in some higher class which
    # uses HTMLSequenceWrapper to get page structure and semantics
    _semantic_tags = {
        'dfn': 'Definition Term',             # <dfn>
        'address': 'Address',                 # <address>
        'em': 'Emphasis',                     # <em>
        'strong': 'Strong Text',              # <strong>
        'ins': 'Inserted',                    # <ins>
        'del': 'Delete',                      # <del>
        'cite': 'Citation',                   # <cite>
        'code': 'Computer code text',         # <code>
        'samp': 'Sample computer code text',  # <samp>
        'kbd': 'Keyboard text',               # <kbd>
        'var': 'Variable'                     # <var>
    }

    def __init__(self, childcoef=7.0, headercoef=4.0, mintextlen=10,
                 omitted_tags=('option', 'br', 'select', 'form')):
        self.sequences = {}
        self.childcoef = childcoef
        self.headercoef = headercoef
        self.mintextlen = mintextlen
        self.omitted_tags = omitted_tags
        self.records = []
        self.cleaner = SimpleHTMLCleaner()

    def _append(self, elem, depth):
        # skip comment nodes (their .tag is the built-in Comment function)
        if str(elem.tag) == '<built-in function Comment>':
            return
        key = elem.tag + "_" + str(depth)
        if key not in self.sequences:
            self.sequences[key] = [elem]
        else:
            self.sequences[key].append(elem)

    def _recurse(self, elem, depth):
        self._append(elem, depth)
        for child in elem.iterchildren():
            self._recurse(child, depth + 1)

    def _get_most_freq(self, seqdict, position=1):
        reversed_entries = {}
        for k in seqdict:
            reversed_entries[len(seqdict[k])] = seqdict[k]
        ordered = sorted(reversed_entries.keys(), reverse=True)
        # FILTERING TAGS: filter non-usable tags like <option>, <br> or <form>
        for i in range(len(ordered)):
            mf = reversed_entries[ordered[(position - 1) + i]]
            if mf[0].tag not in self.omitted_tags:
                break
        return mf

    def _find_nearest_parent(self, elems):
        parents = {}
        for elem in elems:
            parent = elem.getparent()
            if parent is None:
                continue
            if parent.tag not in parents:
                parents[parent.tag] = [parent]
            elif parent not in parents[parent.tag]:
                parents[parent.tag].append(parent)
        return self._get_most_freq(parents)

    def _isbodyelem(self, elem):
        return elem.tag is not None and elem.tag == 'body'

    def _sift(self, elems):
        sift = True
        while sift:
            parents = self._find_nearest_parent(elems)
            if self._isbodyelem(parents[0]):
                break
            sift = len(elems) < self.childcoef * len(parents)
            if sift:
                elems = parents
        self.sifted_first = elems[0]
        # improve speed by converting the list to a set
        try:
            return set(elems)
        except MemoryError:
            return elems

    def _find_regions(self):
        # delete previously found data
        self.regions = []
        area = HTMLSequenceWrapperRegion()
        for elem in self.elemtree.getroot().iterdescendants():
            _style = elem.style
            if _style is None:
                _style = CSSStyle()
            # we consider it a header if its visibility >= self.headercoef
            if _style.get_visibility() >= self.headercoef:
                if not area.is_empty():
                    self.regions.append(area)
                area = HTMLSequenceWrapperRegion()
                area.set_name(self.cleaner.clean(elem.text))
                area.set_header_style(_style)
            if elem in self.found_entries:
                rec = HTMLSequenceWrapperRecord(elem, self.url, self.mintextlen)
                if not rec.has_value():
                    continue
                area.add_record(rec)
        if not area.is_empty():
            self.regions.append(area)

    def _find_menu(self, elemtree):
        _anchors = self.elemtree.findall('.//a[@href]')
        menuanchors = []
        for a in _anchors:
            if a.text is not None:
                for menuitem in HTMLSequenceWrapper._menu:
                    if re.search(menuitem, a.text, re.I):
                        menuanchors.append(a)
                        break
        if not menuanchors:
            return
        # sift the menu with a different child coefficient
        coef = self.childcoef
        self.childcoef = 3.0
        _menuitems = self._sift(menuanchors)
        self.childcoef = coef
        # get the closest parent of all navigation items
        menu_reg = self._find_nearest_parent(_menuitems)
        _links = menu_reg[0].findall('.//a[@href]')
        for tag in _links:
            if tag is None:
                continue
            text = self.cleaner.clean(tag.text)
            if text is None:
                continue
            # bad heuristics, isn't it?
            if len(text) > 50:
                self.menu = {}
                return
            self.doc.add_menu_item(text, tag.get('href'))

    #---------------------------------------------------------------------
    # checks of unbalanced results, used by wrap_h()
    #---------------------------------------------------------------------
    def _unbalanced_chunk_to_record_ratio(self):
        chunks, records = 0, 0
        for reg in self.regions:
            for rec in reg._manual_process_page():
                records += 1
                chunks += len(rec.get_chunks())
        try:
            return float(chunks) / float(records) < self.unbalanced_chunk_ratio
        except ZeroDivisionError:
            return True

    def _unbalanced_record_to_region_ratio(self):
        try:
            return (float(sum([len(reg._manual_process_page()) for reg in self.regions])) /
                    float(len(self.regions))) < self.unbalanced_record_ratio
        except ZeroDivisionError:
            return True

    def _high_variability_of_chunk_count(self):
        chunks = []
        for reg in self.regions:
            for rec in reg._manual_process_page():
                if sum(chunks) == 0 or len(chunks) == 0 or \
                   len(rec.get_chunks()) > 3 * (float(sum(chunks)) / float(len(chunks))):
                    chunks.append(len(rec.get_chunks()))
        aver = float(sum(chunks)) / float(len(chunks))
        # variance of the chunk counts
        base = 0
        for x in chunks:
            base += (x - aver) ** 2
        base /= len(chunks)
        return base > aver

    #---------------------------------------------------------------------
    # Public methods
    #---------------------------------------------------------------------
    def wrap_h(self, elemtree, url):
        """
        Heuristic version of the wrap() method.
        Warning: this method does not produce a 100% correct result! It also
        runs longer than wrap() because it parses the sequences repeatedly.
        TODO: consider clustering methods.
        """
        if not isinstance(elemtree, etree._ElementTree):
            raise TypeError("ElementTree has to be of type lxml.etree._ElementTree")
        self.url = url
        self.doc = ParsedHTMLDocument(elemtree, url)
        # parse the html document and the css on the page and in external
        # *.css files; this also makes all links absolute
        self.doc.parse_document()
        # store the element tree
        self.elemtree = self.doc.get_etree()
        # recurse over the tree
        self._recurse(self.elemtree.getroot(), 1)
        # get the most frequent tag
        mf = self._get_most_freq(self.sequences)
        # learn
        satisfying_result_found = False
        # set up average values of the coefficients
        self.childcoef = 7.0
        self.headercoef = 4.0
        self.mintextlen = 40
        self.unbalanced_chunk_ratio = 2.0
        self.unbalanced_record_ratio = 3.0
        iterations = 0
        while not satisfying_result_found:
            iterations += 1
            if iterations > 100:
                break
            # push it up to get parent tags, they could be record-keepers
            self.found_entries = self._sift(mf)
            # find data regions
            self._find_regions()
            # if we found only one region with one record, it's probably a
            # mistake, so we have to decrease childcoef
            if len(self.regions) == 1 and len(self.regions[0].records) == 1:
                self.childcoef -= 1.5
                self.headercoef -= 1.0
            elif self._unbalanced_chunk_to_record_ratio():
                self.childcoef += 2.0
                self.headercoef += 0.5
                self.unbalanced_chunk_ratio -= 0.2
            elif self._unbalanced_record_to_region_ratio():
                self.headercoef += 1.0
                self.childcoef -= 0.5
                self.unbalanced_record_ratio -= 0.4
            elif self._high_variability_of_chunk_count():
                self.childcoef += 1.0
                self.mintextlen += 10
            else:
                satisfying_result_found = True
        # find the navigation on the page
        self._find_menu(self.elemtree)
        for reg in self.regions:
            self.doc.add_region(reg)
        # remember the last url
        self.last_url = url
        # return the parsed document
        return self.doc

    def wrap(self, elemtree, url):
        """
        Main method. Parses the html page and searches for repeated sequences
        in the element tree. Returns an instance of ParsedHTMLDocument.
        """
        if not isinstance(elemtree, etree._ElementTree):
            raise TypeError("ElementTree has to be of type lxml.etree._ElementTree")
        self.url = url
        self.doc = ParsedHTMLDocument(elemtree, url)
        # parse the html document and the css on the page and in external
        # *.css files; this also makes all links absolute
        self.doc.parse_document()
        # store the element tree
        self.elemtree = self.doc.get_etree()
        # recurse over the tree
        self._recurse(self.elemtree.getroot(), 1)
        # get the most frequent tag
        mf = self._get_most_freq(self.sequences)
        # push it up to get parent tags, they could be record-keepers
        self.found_entries = self._sift(mf)
        # find data regions
        self._find_regions()
        # find the navigation on the page
        self._find_menu(self.elemtree)
        for reg in self.regions:
            self.doc.add_region(reg)
        try:
            self.doc.set_name(self.elemtree.find('.//title').text)
        except AttributeError:
            pass
        # remember the last url
        self.last_url = url
        # return the parsed document
        return self.doc

    def _make_xml(self):
        """
        Constructs the xml tree containing the result of the wrapping.
        """
        self.xmldocument = etree.Element("document")
        self.xmldocument.set("base", str(self.doc.get_url()))
        self.xmldocument.set("title", unicode(self.doc.get_name()))
        # add the menu if available
        self.xmlmenu = etree.SubElement(self.xmldocument, "menu")
        navigation = self.doc.get_menu()
        for menuitem in navigation:
            menuitemxml = etree.SubElement(self.xmlmenu, "menuitem")
            menuitemxml.text = unicode(menuitem)
            menuitemxml.set("link", unicode(str(navigation[menuitem])))
        # add data regions
        for reg in self.regions:
            self.xmlsequence = etree.SubElement(self.xmldocument, "sequence-area")
            header = etree.SubElement(self.xmlsequence, "header")
            if reg.get_header_style() is not None:
                header.set("visibility",
                           unicode(str(reg.get_header_style().get_visibility())))
            header.text = unicode(reg.get_name())
            # add the records of the region
            for r in reg._manual_process_page():
                item = etree.SubElement(self.xmlsequence, "entry")
                textxml = etree.SubElement(item, "text")
                textxml.text = unicode(r.get_text())
                chunksxml = etree.SubElement(item, "chunks")
                # add chunks
                for chunk in r.get_chunks():
                    chxml = etree.SubElement(chunksxml, "chunk")
                    chxml.text = unicode(chunk.get_text())
                    # show visibility
                    if chunk.get_style() is not None:
                        chxml.set("visibility",
                                  unicode(str(chunk.get_style().get_visibility())))
                    if chunk.get_link() is not None:
                        chxml.set("link", unicode(chunk.get_link()))
                    # handle the tag
                    if chunk.get_tag() is not None:
                        tg = chunk.get_tag()
                        if tg in HTMLSequenceWrapper._semantic_tags:
                            tg = HTMLSequenceWrapper._semantic_tags[tg]
                        chxml.set("logical", unicode(str(tg)))
                    # handle comments
                    if chunk.get_comment() is not None:
                        try:
                            chxml.set("comment",
                                      unicode(str(chunk.get_comment()), encoding='utf-8'))
                        except UnicodeEncodeError:
                            try:
                                chxml.set("comment",
                                          unicode(chunk.get_comment(), encoding='utf-8'))
                            except TypeError:
                                chxml.set("comment", chunk.get_comment())

    def get_xml(self):
        """
        Returns the xml result as a string.
        """
        self._make_xml()
        # return the whole xml tree in string format
        return etree.tostring(self.xmldocument, xml_declaration=True,
                              pretty_print=True, encoding='utf-8')

    def get_etree(self):
        """
        Returns the xml as an lxml.etree Element object.
        """
        self._make_xml()
        return self.xmldocument
def _parse_visibility(self, document):
    vis_map = self._get_visibility2elem_map(document.get_etree())
    if len(vis_map) < 2:
        return
    sorted_vis = sorted(vis_map.keys(), reverse=True)
    if len(sorted_vis) < 2:
        return
    to_be_processed = None
    while 42:  # :)
        to_be_processed = []
        for i in xrange(0, len(sorted_vis)):
            if sorted_vis[i] < self.headercoef:
                continue
            to_be_processed.extend(vis_map[sorted_vis[i]])
        if len(to_be_processed) < 2:
            self.headercoef -= 0.5
        else:
            break
    # storage for possible titles
    possible_titles = ListedDict()
    # loop over all headers (elements containing very visible texts)
    for elem in to_be_processed:
        # get the cleaned text content of the tag
        txt = SimpleHTMLCleaner.clean(elem.text_content())
        # generalize: maybe it is something useful
        hdrtext = self.generalizer.generalize(txt)
        # generalization recognized the header -> the data are below it
        if hdrtext is not None:
            # found some useful header, try to get the data below it;
            # what is below? probably sibling tags and their descendants
            self._get_data_below_header(elem, hdrtext, to_be_processed)
        # generalization wasn't successful -> maybe the header contains data
        else:
            # date?
            d = self.ee.find_published_date(txt)
            if d[0]:
                rrsdate = d[0][0]
                for attr in ('year', 'month'):
                    if rrsdate.get(attr) is not None:
                        self._publ[attr] = rrsdate.get(attr)
                txt = d[1]
            # maybe a title
            if len(txt.split(" ")) > 3:  # probably more than three words
                # is there a domain name in the title? then it is probably
                # the general name of the website
                if len(self.domain) > 6 and re.search(re.escape(self.domain), txt, re.I):
                    continue
                # preprocessing - remove standalone brackets
                txt = re.sub("[\(\[][^\)\]]*[\)\]]+", "", txt).strip()
                if document.name is not None and re.search(re.escape(txt), document.name, re.I):
                    possible_titles[int(self._classify_publ_title(txt, init=100))] = txt
                elif len(txt.split(" ")) > 5:
                    possible_titles[int(self._classify_publ_title(txt, init=60))] = txt
    if possible_titles:
        titles = possible_titles[max(possible_titles)]
        if len(titles) > 1:
            title = self._get_longest_string(titles)
        else:
            title = titles[0]
        self._publ['title'] = title
        self._publ['credibility'] = max(possible_titles)
    else:
        self._publ['credibility'] = 0
    # store all new properties and their values
    for prop in self._storage:
        self._add_property(prop, self._storage[prop])
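# Hedged, self-contained sketch of the threshold relaxation at the top of
# _parse_visibility(): lower the header coefficient in 0.5 steps until at
# least two candidate header elements pass. The numbers and the helper name
# are illustrative only.
def _demo_relax_threshold(headercoef=4.0):
    vis_map = {5.0: ['h1'], 3.5: ['h2'], 1.0: ['p']}
    while True:
        picked = []
        for vis in sorted(vis_map.keys(), reverse=True):
            if vis >= headercoef:
                picked.extend(vis_map[vis])
        if len(picked) < 2:
            headercoef -= 0.5
        else:
            return picked, headercoef  # -> (['h1', 'h2'], 3.5)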
def _get_data_below_header(self, elem, hdrtext, to_be_processed):
    # Try to iterate over the siblings of the header element and get the text
    siblings = [sib.tag for sib in elem.itersiblings()]
    # the header is an abstract
    if hdrtext == 'abstract':
        txts = {}
        paragraphs = []
        par_stop = False
        for sib in elem.itersiblings():
            content = sib.text_content()
            if sib in to_be_processed:
                par_stop = True
            if sib.tag == 'p' and len(content) > 50 and not par_stop:
                paragraphs.append(content)
            # score every sibling as a fallback candidate
            chunk = content[0:20].lower()
            score = 1.0
            for st in self._abstract_startswith:
                if chunk.startswith(st):
                    score *= 5.0
            score *= len(content)
            txts[score] = SimpleHTMLCleaner.clean(content)
        if paragraphs:
            self._storage[hdrtext] = [SimpleHTMLCleaner.clean(" ".join(paragraphs))]
        else:
            self._storage[hdrtext] = [txts[max(txts.keys())]]
    # related publications
    elif hdrtext == 'related':
        list_tags = ('ul', 'ol', 'dl')
        return  # TODO (the code below is not reached yet)
        for ltag in list_tags:
            if ltag in siblings:
                for sib in elem.itersiblings():
                    pass
    # keywords
    elif hdrtext == 'keywords':
        # predicate matching elements that contain possible keywords
        is_keyword = lambda kw: re.search("^(([a-z]{3,}( |,)){1,3} ?)+([a-z]{3,} ?){1,3}$",
                                          kw.text_content(), re.I) \
                                and not re.search("[@#\$%\^&\*\(\)]", kw.text_content())
        # iterate over the siblings of the header and try to get keywords
        # from their children
        likelihood_to_keyword_tags = ListedDict()
        for s in elem.itersiblings():
            (kw_elems, likelihood) = self._find_local_sequence(s, is_keyword)
            if kw_elems is None:
                continue
            likelihood_to_keyword_tags[likelihood] = kw_elems
        if not likelihood_to_keyword_tags:
            return
        # if some keywords were found, store them
        self._storage[hdrtext] = [kw.text_content() for kw in
            likelihood_to_keyword_tags[max(likelihood_to_keyword_tags.keys())][0]]
    # references
    elif hdrtext == 'references':
        pass  # TODO
    # chapters??
    elif hdrtext == 'chapters':
        pass  # TODO
    # reviews?
    elif hdrtext == 'reviews':
        if hdrtext in self._storage:
            return
        # predicate matching elements that contain possible reviews
        is_review = lambda r: (len(r.text_content()) > 100) or r.tag == 'blockquote'
        probability = ListedDict()
        # iterate over the siblings of the header and try to get reviews
        # from their children
        for s in elem.itersiblings():
            (elems, prob) = self._find_local_sequence(s, is_review)
            if elems is None:
                continue
            probability[prob] = elems
        review_texts = []
        if not probability:
            return
        for e in probability[max(probability.keys())][0]:
            review_texts.append(SimpleHTMLCleaner.clean(e.text_content()))
            # mark all the elements as "processed" to avoid further processing
            for d in e.iter():
                d.processed = True
        self._storage[hdrtext] = review_texts
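# Hedged, self-contained sketch of the keyword test used in the 'keywords'
# branch above: the same regexes applied to plain strings instead of element
# text content. The sample phrases and the helper name are illustrative only.
def _demo_looks_like_keyword(text="information extraction"):
    ok = re.search("^(([a-z]{3,}( |,)){1,3} ?)+([a-z]{3,} ?){1,3}$", text, re.I)
    bad = re.search("[@#\$%\^&\*\(\)]", text)
    return bool(ok) and not bad  # True here; False for e.g. 'admin@site.org'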