def __init__(self, verbose=False, debug=False):
    """Set up the crawler, sequence wrapper and helper objects used to
    extract deliverable records from project pages.

    :param verbose: print progress messages when True
    :param debug: print debug messages when True
    """
    self.__dbg__ = debug
    # NOTE(review): attribute name looks like a typo of "verbose"; kept
    # because other code may read it under this name.
    self.__verbos = verbose
    self._crawler = Crawler()
    # Spoof a regular desktop browser so sites serve their normal HTML.
    self._crawler.set_headers((
        ('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.19) Gecko/2010040116 Ubuntu/9.04 (jaunty) Firefox/3.0.19'),
        ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
    ))
    # Wrapper that turns a fetched page into an element tree of entries.
    self._wraper = HTMLSequenceWrapper(childcoef=5.0, headercoef=3.0, mintextlen=20)
    # Anchor captions that are link labels, not real document titles.
    self._unwanted_titles = ['Download here', 'PDF format']
    self._records = []  # resulting publication records
    ################################
    # manual processing
    self.agent = GetHTMLAndParse()
    # to get region where to search for records
    self.regionHandler = GetDeliverableRegion()
    # init text formatter (encoding, erasing white chars etc.)
    self.formatter = TextFormatUtils()
    # tags ignored when building tag sequences
    self._omitted_tags = ('br', 'img', 'html', 'body')
    # tag tolerance
    self.tagtol = 1
def __init__(self, url, verbose=False, debug=False, addkeyw=None):
    """Prepare the deliverable-page search starting from *url*.

    :param url: base URL of the project site to search
    :param verbose: print progress messages when True
    :param debug: print debug messages when True
    :param addkeyw: optional extra keyword regex appended to the search list
    :raises ValueError: if *verbose* is not a boolean
    """
    # Keyword regexes used to rank anchors that may lead to a documents page.
    self._sigwords = [
        "d((eliverables?)|[0-9])",
        "documents?",
        "reports?",
        "public(ation)?s?",
        "results?",
        "presentations?",
        "library",
        #"projects?",
        "outcomes?",  # FIX: was "outocomes?" — the typo could never match "outcomes"
        "downloads?",
        "outputs?",
    ]
    if addkeyw is not None:  # FIX: identity comparison for None
        self._sigwords.append(addkeyw)
    # Associative array containing links with their flags:
    #   { url : [Index/NoIndex/Frame, Visit/Visited, Rank] }
    # index = 0, noindex = 1, frame = 2, unvisited = 0, visited = 1
    self._link_stack = {url: [0, 0, 0]}
    self.base_url = url  # save base (input) url
    # Open a parsing agent to get needed data from pages.
    self.agent = GetHTMLAndParse()
    self._current_url = url
    # A constant used to set rank in order of importance of the expression
    # being tested (self._sigwords).
    self.rank_const = len(self._sigwords)
    # A few constants naming the positions in the _link_stack value lists.
    self.IND_FR = 0  # index/noindex/frame/special
    self.VISIT = 1   # unvisited/visited
    self.RANK = 2    # value of rank
    # set verbose flag
    self.__verbose__ = verbose
    # set debug flag
    self.__dbg__ = debug
    # checking data types (FIX: isinstance instead of type(...) == bool)
    if not isinstance(self.__verbose__, bool):
        raise ValueError("Verbose flag has to be boolean.")
def __init__(self, verbose=False, debug=False):
    """Set up the crawler, sequence wrapper and helper objects used to
    extract deliverable records from project pages.

    :param verbose: print progress messages when True
    :param debug: print debug messages when True
    """
    self.__dbg__ = debug
    # NOTE(review): attribute name looks like a typo of "verbose"; kept
    # because other code may read it under this name.
    self.__verbos = verbose
    self._crawler = Crawler()
    # Spoof a regular desktop browser so sites serve their normal HTML.
    self._crawler.set_headers(
        (
            (
                "User-Agent",
                "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.19) Gecko/2010040116 Ubuntu/9.04 (jaunty) Firefox/3.0.19",
            ),
            ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"),
        )
    )
    # Wrapper that turns a fetched page into an element tree of entries.
    self._wraper = HTMLSequenceWrapper(childcoef=5.0, headercoef=3.0, mintextlen=20)
    # Anchor captions that are link labels, not real document titles.
    self._unwanted_titles = ["Download here", "PDF format"]
    self._records = []  # resulting publication records
    ################################
    # manual processing
    self.agent = GetHTMLAndParse()
    # to get region where to search for records
    self.regionHandler = GetDeliverableRegion()
    # init text formatter (encoding, erasing white chars etc.)
    self.formatter = TextFormatUtils()
    # tags ignored when building tag sequences
    self._omitted_tags = ("br", "img", "html", "body")
    # tag tolerance
    self.tagtol = 1
def __init__(self, verbose=False, debug=False):
    """Wire up the helpers needed to pull deliverable records out of
    HTML tables.

    :param verbose: forwarded to the debugger helper
    :param debug: forwarded to the debugger helper
    """
    # Debug/verbose output is delegated to a dedicated debugger object.
    self.debugger = DeliverableDebugger(verbose=verbose, debug=debug)
    self.__verbose = self.debugger.verbose
    self.__debug = self.debugger.debug
    # Tag-sequence matching configuration: ignored tags and tolerance.
    self._omitted_tags = ('br', 'img', 'html', 'body')
    self.tagtol = 1
    # HTML fetching/parsing, region detection and text normalization helpers.
    self.htmlHandler = GetHTMLAndParse()
    self.regionHandler = GetDeliverableRegion()
    self.formatter = TextFormatUtils()
    # Words that may appear in a deliverable table's header row.
    self.table_sem_words = [
        'deliverable', 'description', 'name', 'date',
        'dissemination', 'no.', 'wp', 'delivery',
        'particip', 'title', 'nature',
    ]
def __init__(self, url, verbose=False, debug=False, addkeyw=None):
    """Prepare the deliverable-page search starting from *url*.

    :param url: base URL of the project site to search
    :param verbose: print progress messages when True
    :param debug: print debug messages when True
    :param addkeyw: optional extra keyword regex appended to the search list
    :raises ValueError: if *verbose* is not a boolean
    """
    # Keyword regexes used to rank anchors that may lead to a documents page.
    self._sigwords = [
        "d((eliverables?)|[0-9])",
        "documents?",
        "reports?",
        "public(ation)?s?",
        "results?",
        "presentations?",
        "library",
        # "projects?",
        "outocomes?",  # NOTE(review): likely a typo of "outcomes?" — never matches
        "downloads?",
        "outputs?",
    ]
    if addkeyw != None:
        self._sigwords.append(addkeyw)
    """
    Associative array containing links with their flags
    { url : [Index/NoIndex/Frame, Visit/Visited, Rank] }
    index = 0, noindex = 1, frame = 2, unvisited = 0, visited = 1
    """
    self._link_stack = {url: [0, 0, 0]}
    self.base_url = url  # save base (input) url
    # Open a parsing agent to get needed data from pages.
    self.agent = GetHTMLAndParse()
    self._current_url = url
    # a constant used to set rank in order of importance of the expression
    # being tested (self._sigwords)
    self.rank_const = len(self._sigwords)
    # a few constants for the dictionary - positions in the value lists
    self.IND_FR = 0  # index/noindex/frame/special
    self.VISIT = 1  # unvisited/visited
    self.RANK = 2  # value of rank
    # set verbose flag
    self.__verbose__ = verbose
    # set debug flag
    self.__dbg__ = debug
    # checking data types
    if not type(self.__verbose__) == bool:
        raise ValueError("Verbose flag has to be boolean.")
def __init__(self, verbose=False, debug=False):
    """Wire up the helpers needed to pull deliverable records out of
    HTML tables.

    :param verbose: forwarded to the debugger helper
    :param debug: forwarded to the debugger helper
    """
    # init agent for parsing html
    self.htmlHandler = GetHTMLAndParse()
    # to get region where to search for records
    self.regionHandler = GetDeliverableRegion()
    # init text formatter (encoding, erasing white chars etc.)
    self.formatter = TextFormatUtils()
    # list of acceptable words in title (header) of table
    self.table_sem_words = ['deliverable', 'description', 'name', 'date',
                            'dissemination', 'no.', 'wp', 'delivery',
                            'particip', 'title', 'nature']
    # tags ignored when building tag sequences
    self._omitted_tags = ('br', 'img', 'html', 'body')
    # tag tolerance
    self.tagtol = 1
    # verbose and debug flags are delegated to the debugger helper
    self.debugger = DeliverableDebugger(verbose=verbose, debug=debug)
    self.__verbose = self.debugger.verbose
    self.__debug = self.debugger.debug
def __init__(self, options=opt, url=None):
    """Assemble the extraction pipeline: page search, crawling and
    record extraction.

    :param options: options object providing url/verbose/debug/regexp
        attributes. NOTE: the default ``opt`` is a module-level object
        captured once at definition time; kept for backward compatibility.
    :param url: explicit start URL; overrides ``options.url`` when given
    """
    # get options
    self.opt = options
    # FIX: identity comparison for None; conditional expression for clarity
    self.opt_url = url if url is not None else self.opt.url
    # initialize main html handler and parser
    self.htmlhandler = GetHTMLAndParse()
    # searching deliverable page
    self.pagesearch = GetDelivPage(self.opt_url,
                                   verbose=self.opt.verbose,
                                   debug=self.opt.debug,
                                   addkeyw=self.opt.regexp)
    # extracting informations from page
    self.recordhandler = GetDelivRecords(debug=self.opt.debug)
class GetDelivRecords:
    """Extracts deliverable records (title, description, document link) from
    project web pages and stores them as RRSPublication objects.

    Pages are first run through HTMLSequenceWrapper; if that yields too few
    records, a manual fallback (_manual_processing) analyses the data region
    directly, either as a table or as a repeating tag sequence.

    NOTE(review): this class mixes Python 2 idioms (`basestring`,
    `d.keys()[0]`, `len(filter(...))`, `StringIO.StringIO`, `has_key`-era
    style) with Python 3 `print(...)` calls — it will not run unmodified on
    Python 3. `s` is presumably the `string` module imported as `s` —
    confirm against the file header.
    """

    def __init__(self, verbose=False, debug=False):
        """Set up crawler, sequence wrapper and helper objects.

        :param verbose: print progress messages when True
        :param debug: print debug messages when True
        """
        self.__dbg__ = debug
        # NOTE(review): attribute name looks like a typo of "verbose".
        self.__verbos = verbose
        self._crawler = Crawler()
        # Spoof a regular desktop browser so sites serve their normal HTML.
        self._crawler.set_headers((
            ('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.19) Gecko/2010040116 Ubuntu/9.04 (jaunty) Firefox/3.0.19'),
            ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
        ))
        self._wraper = HTMLSequenceWrapper(childcoef=5.0, headercoef=3.0, mintextlen=20)
        # Anchor captions that are link labels, not real document titles.
        self._unwanted_titles = ['Download here', 'PDF format']
        self._records = []  # resulting RRSPublication objects
        ################################
        # manual processing
        self.agent = GetHTMLAndParse()
        # to get region where to search for records
        self.regionHandler = GetDeliverableRegion()
        # init text formatter (encoding, erasing white chars etc.)
        self.formatter = TextFormatUtils()
        self._omitted_tags = ('br', 'img', 'html', 'body')
        # tag tolerance
        self.tagtol = 1

    def __debug(self, msg):
        """Print *msg* when the debug flag is set; tolerate undecodable text."""
        _err = "cannot decode debug info."
        if self.__dbg__ == True:
            try:
                print("Debug message: " + str(msg))
            except UnicodeError:
                print(_err)

    def __verbose(self, msg):
        """Print *msg* when the verbose flag is set.

        NOTE(review): `self.__verbose` here is this bound method itself, not
        the flag stored in __init__ (that one is `self.__verbos`), so the
        comparison with True is always False and verbose output is dead code.
        """
        _err = "cannot decode debug info."
        if self.__verbose == True:
            try:
                print("Verbose: " + str(msg))
            except UnicodeError:
                print(_err)

    ######################## Processing sequencewrapper output #################

    def _make_deliv_record(self, entry):
        """Try to build a deliverable record from one sequence-wrapper entry.

        Harvests all texts and wanted-MIME links from the entry, then tries
        text-based recognition first, link-based recognition second.
        Returns True if a record was recognized.
        """
        text = []
        links = []
        # harvest links and text from entry
        for e in entry.iter():
            if e.text != None:
                text.append(e.text)
            if e.attrib.get("link") != None:
                if self.agent.is_wanted_mime(e.attrib.get(
                        "link")) and e.attrib.get("link") not in links:
                    links.append(e.attrib.get("link"))
        res = self._deliv_in_text(text, links)
        if type(res) == RRSPublication:
            self._entriesFoundInText.append(res)
            self.__debug("Record found cause of text")
            return True
        elif type(res) == list:
            # a list result signals "probably more records in one entry"
            res = self._more_entry_in_record(entry)
            # NOTE(review): _more_entry_in_record returns None, so this
            # comparison is never True; records are still collected into
            # self._entriesFoundInLinks as a side effect.
            if (res == True):
                self.__debug("")
                return True
            else:
                return False
        res = self._deliv_in_link(text, links, entry)
        if type(res) == RRSPublication:
            self._entriesFoundInLinks.append(res)
            self.__debug("Record found cause of link")
            return True
        return False

    def _deliv_in_text(self, text, links):
        """Look for a deliverable keyword in the entry's texts.

        :param text: list of text fragments from the entry
        :param links: a link string or list of candidate document links
        :returns: RRSPublication on success, False when no title found,
            or a ['-3', ...] error list when the entry probably holds
            several records.
        """
        _title = False
        _description = ""
        # Title pattern: the word DELIVERABLE(S) or a D<number>(.<number>) id.
        pattern = re.compile("(DELIVERABLES?)|(D[0-9][0-9]*(.[0-9][0-9]*)?)", re.I)
        # loop through text in entry looking for title and description
        for t in text:
            if _title == False:
                if pattern.search(t):
                    _title = t
            # set the longest string as description of deliverable
            if len(_description) < len(t):
                _description = t
        if _title == _description:
            _description = ""
        _link = False
        if type(links) == str:
            if self.agent.is_wanted_mime(links):
                _link = links
        elif type(links) == list:
            for l in links:
                if self.agent.is_wanted_mime(l):
                    if _link == False:
                        _link = l
                    else:
                        # if there was already a link found: identical up to
                        # the file extension means the same document in two
                        # formats, otherwise it is probably several records.
                        # NOTE(review): `s.rfind(_link, '.')` is the old
                        # string-module API — confirm `s` is the string module.
                        if _link[:s.rfind(_link, '.')] == l[:s.rfind(l, '.')]:
                            break
                        else:
                            return ['-3', 'Probably more records in one entry']
        # create object
        if _title:
            pub = RRSPublication(title=_title, abstract=_description)
            _typ = RRSPublication_type(type='techreport')
            pub['type'] = _typ
            self.__debug("*" * 40)
            self.__debug("Title: " + _title)
            self.__debug("Description: " + _description)
            if _link:
                self.__debug("Link: " + _link)
                l = RRSUrl(link=_link)
                pl_rel = RRSRelationshipPublicationUrl()
                pl_rel.set_entity(l)
                pub['url'] = pl_rel
            return pub
        else:
            # this entry is probably not a deliverable
            return False

    def _deliv_in_link(self, text, links, entry=False):
        """Look for a deliverable keyword in the entry's links.

        :param text: list of text fragments (used for title/description)
        :param links: list of candidate links
        :param entry: optional lxml entry, used to repair a bad title
        :returns: RRSPublication on success, False otherwise, or a
            ['-3', ...] error list when several keyword links are present.
        """
        _title = False
        _description = ""
        pattern = re.compile("(DELIVERABLES?)|(D[0-9][0-9]*(.[0-9][0-9]*)?)", re.I)
        _link = False
        for l in links:
            if pattern.search(l):
                if _link == False:
                    _link = l
                else:
                    return ['-3', 'Probably more records in one entry']
        # loop through text in entry looking for title and description
        for t in text:
            if _title == False:
                if len(t) > 10:
                    _title = t
            # set the longest string as description of deliverable
            if len(_description) < len(t):
                _description = t
        if _title == _description:
            _description = ""
        # if chosen title is not valid try to find better in parent entry
        if _title and not self._check_title(_title) and entry != False:
            _title = self._repair_title(entry)
        # create object
        if _link:
            pub = RRSPublication(title=_title, abstract=_description)
            typ = RRSPublication_type(type='techreport')
            pub['type'] = typ
            self.__debug("*" * 40)
            self.__debug("Title: " + _title)
            self.__debug("Description: " + _description)
            self.__debug("Link: " + _link)
            l = RRSUrl(link=_link)
            pl_rel = RRSRelationshipPublicationUrl()
            pl_rel.set_entity(l)
            pub['url'] = pl_rel
            return pub
        else:
            # this entry is probably not a deliverable
            return False

    def _check_title(self, title, tolerance=10):
        """Check whether *title* is just an unwanted caption (with some
        length tolerance). Return True when the title is acceptable."""
        for t in self._unwanted_titles:
            if (s.find(s.lower(title), s.lower(t))) != -1:
                if (len(t) + tolerance) > len(title):
                    return False
        return True

    def _repair_title(self, entry):
        """Look for the element with the highest 'visibility' rank in the
        entry's parent and use its text as the title; False if none found."""
        parent = entry.getparent()
        visibility = 0
        title = ""
        for i in parent.iter():
            try:
                if i.attrib.get('visibility') > visibility:
                    visibility = i.attrib.get('visibility')
                    title = i.text
            except AttributeError:
                pass
        if title != "":
            return title
        else:
            return False

    def _more_entry_in_record(self, entry):
        """Split one entry into several deliverables: every chunk with text
        and a wanted-MIME link becomes its own record (appended to
        self._entriesFoundInLinks). Returns None."""
        for ch in entry.iter('chunk'):
            if ch.text != None and ch.attrib.get("link") != None:
                if self.agent.is_wanted_mime(ch.attrib.get("link")):
                    _pub = RRSPublication(title=ch.text)
                    typ = RRSPublication_type(type='techreport')
                    _pub['type'] = typ
                    _l = RRSUrl(link=ch.attrib.get("link"))
                    _rel = RRSRelationshipPublicationUrl()
                    _rel.set_entity(_l)
                    _pub['url'] = _rel
                    self._entriesFoundInLinks.append(_pub)

    def process_pages(self, pages):
        """Process pages defined by *pages* (list of URLs): crawl them, run
        the sequence wrapper and collect deliverable records. Falls back to
        manual processing when fewer than 4 records are found either way."""
        self._entriesFoundInText = []
        self._entriesFoundInLinks = []
        self._urls = pages
        self._pages = self._crawler.start(pages)
        # creates RRSPublication objects with information about deliverables
        for u in self._urls:
            self._wraper.wrap(self._pages[u], u)
            self._tree = self._wraper.get_etree()
            for entry in self._tree.iter("entry"):
                self._make_deliv_record(entry)
        if len(self._entriesFoundInText) > 3:
            self.__debug("Deliverbles descriptions content keywords")
            self.__debug("Found " + "{0}".format(len(self._entriesFoundInText)) + " deliv records")
            self._records = self._entriesFoundInText
        elif len(self._entriesFoundInLinks) > 3:
            self.__debug("Deliverbles links content keywords")
            self.__debug("Found " + "{0}".format(len(self._entriesFoundInLinks)) + " deliv records")
            self._records = self._entriesFoundInLinks
        else:
            self._manual_processing()

    def _manual_processing(self):
        """Fallback used when the sequence wrapper produced no usable
        records; re-analyses the pages' data regions directly."""
        self._entriesFoundInLinks = []
        self._entriesFoundInText = []
        self._manual_process_page(self._urls, urlsplit(self._urls[0])[1])
        if len(self._entriesFoundInText) > 0:
            self.__debug("Deliverbles descriptions content keywords")
            self.__debug("Found " + "{0}".format(len(self._entriesFoundInText)) + " deliv records")
            self._records = self._entriesFoundInText
        elif len(self._entriesFoundInLinks) > 0:
            self.__debug("Deliverbles links content keywords")
            self.__debug("Found " + "{0}".format(len(self._entriesFoundInLinks)) + " deliv records")
            self._records = self._entriesFoundInLinks

    ########################### TABLE HANDLING METHODS ############################

    def _get_descendats_texts(self, elem, string=True):
        """Get texts from *elem* and its descendants.

        :param elem: lxml element
        :param string: when True, return one space-joined string;
            otherwise return the list of fragments
        """
        texts = []
        for child in elem.iter():
            # isinstance check skips comments/PIs whose .tag is not a string
            # NOTE(review): `basestring` is Python-2-only.
            if child.text and isinstance(child.tag, basestring):
                if re.search("[a-z0-9]", child.text, re.I):
                    texts.append(self.formatter.format(child.text))
        if string:
            return " ".join(texts)
        return texts

    def _get_row_link(self, row):
        """Get the first document link from a table row, or None."""
        # find all anchors where parent is row
        linkanch = row.findall('.//a[@href]')
        if len(linkanch) == 0:
            return None
        for link in linkanch:
            anchor_link = link.get('href')
            if self.agent.is_wanted_mime(anchor_link):  # check if it is a file we want
                return anchor_link
        return None

    def _handle_table(self):
        """Treat self.parentetree as a table: walk its rows and try to
        recognize a deliverable record in each row that has a document link."""
        for row in self.parentetree:
            if not row.tag == 'tr':
                continue
            row_list = []
            _thislink = self._get_row_link(row)
            if _thislink == None:
                continue
            for column in row:
                text = self._get_descendats_texts(column)
                if not text:
                    continue
                row_list.append(text)
            res = self._deliv_in_text(row_list, [_thislink])
            if type(res) == RRSPublication:
                self._entriesFoundInText.append(res)
                self.__debug("Record found cause of text")
            else:
                res = self._deliv_in_link(row_list, [_thislink])
                if type(res) == RRSPublication:
                    self._entriesFoundInLinks.append(res)
                    self.__debug("Record found cause of link")
            del (row_list)
        return

    ######################## TAG SEQUENCE RECOGNIZING METHODS ####################

    def _is_deliv_anch(self, tag):
        """Return True if *tag* is an anchor whose href leads to a
        wanted-MIME document."""
        if tag.tag == 'a':
            href = tag.get('href')
            if self.agent.is_wanted_mime(href):
                return True
        return False

    def _tagfilter(self, tag):
        """Filter useless tags: False for omitted tags, True otherwise."""
        if tag.tag in self._omitted_tags:
            return False
        # if tag.text:
        #     if not re.search("[a-z0-9\[\]]", tag.text, re.I):
        #         return False
        return True

    def _getdiff(self, reg, tol):
        """Get the typical tag distance between consecutive document anchors.

        :param reg: element-tree region to scan
        :param tol: accepted tolerance of tags
        :returns: the most frequent distance, or -1 when the distances vary
            beyond the tolerance

        NOTE(review): `d.keys()[0]` and `len(difflist)` on a keys() view are
        Python-2-only; the bare `except` around the counter update also hides
        unrelated errors.
        """
        d = {}
        index = 0
        # fill the dictionary with differences and their occurrences
        for tag in reg.iter():
            if not self._tagfilter(tag):
                continue
            if self._is_deliv_anch(tag) and not index == 0:
                try:
                    d[index] += 1
                except:
                    d[index] = 1
                index = 0
            index += 1
        # check differences: the variety must not exceed the tolerance
        difflist = d.keys()
        self.__debug("difflist: " + str(difflist))
        if len(difflist) == 0:
            return -1
        _max = max(difflist)
        _min = min(difflist)
        dlen = len(d.keys())
        if dlen == 1:
            return d.keys()[0]
        if dlen > ((2 * tol) + 1):  # tolerance to both sides
            return -1
        if (_max - _min) > 2 * tol:  # some acceptable tolerance
            return -1
        # get the most frequent difference
        most_freq = max(d.values())
        for key in d:
            if d[key] == most_freq:
                return key
        return -1

    def _get_anch_only(self):
        """Fallback: only document anchors, no optional information."""
        anchlist = self.agent.find_anchor_elem(self.baseUrl, self.parentetree)
        # We have to make list of lists because of XMLOutput
        return [[anch] for anch in anchlist]

    def _get_tag_sequences(self, tag_tol=1):
        """Main tag-sequence recognizer: derive the repeating tag pattern
        between document anchors and split the region into per-record tag
        lists. Returns a list of records (each a list of lxml tags).

        NOTE(review): `regionlist = filter(...)` is indexed and len()-ed
        below, which requires Python 2 (filter returns a list there).
        """
        records = []
        self._rec = []
        if len(self.parentetree) == 0:
            return [[self.parentetree]]
        # get interval between anchors, use tolerance tag_tol
        self.difference = self._getdiff(self.parentetree, self.tagtol)
        while self.difference == -1:
            if self.tagtol > 5:
                self.__verbose("Variety of intervals between anchors is too huge. " +
                               "Getting data out of anchors only")
                return self._get_anch_only()
            self.tagtol += 1
            self.difference = self._getdiff(self.parentetree, self.tagtol)
        # get sequence of first n tags, where n is average interval between
        # anchors - this could be the tag-sequence describing all records.
        self.record_seq = []
        i = 0
        for tag in self.parentetree.iter():
            if not self._tagfilter(tag):
                continue
            if i >= self.difference:
                # slide the window until it contains an anchor
                if not 'a' in self.record_seq:
                    del self.record_seq[0]
                else:
                    break
            self.record_seq.append(tag.tag)
            i += 1
        # counter indicates on which position in tag sequence we actually are
        counter = 0
        # make sequence of tags as they go
        regionlist = filter(self._tagfilter, [tag for tag in self.parentetree.iter()])
        recseqlen = len(self.record_seq)
        reglistlen = len(regionlist)
        # flag indicating begin of records - the region may start with garbage
        self.begin = False
        # counts unpredictable separators between deliverable records
        self.separator = 0
        for i, tag in enumerate(regionlist):
            # skip and save the sequence at the end
            if counter > self.difference - 1:
                records.append(self._rec)  # save
                self._rec = []  # erase the list
                counter = 0  # reset counter
            if not self.begin:
                if tag.tag != self.record_seq[0]:
                    continue
                else:
                    try:
                        if regionlist[i + 1].tag != self.record_seq[1]:
                            continue
                    except:
                        pass
                    self.begin = True
            # handle tolerances, try to compare sibling tags
            self.match = False  # match flag
            # tolerance algorithm: walk the html and try to pass irregular
            # tags in the sequence
            for tol in range(self.tagtol + 1):
                if tag.tag == self.record_seq[(counter + tol) % recseqlen] or \
                   regionlist[(i + tol) % reglistlen].tag == self.record_seq[counter % recseqlen]:
                    self.match = True
                    self._rec.append(tag)
                    counter += tol + 1
                    break
                elif tag.tag == self.record_seq[(counter - tol) % recseqlen] or \
                     regionlist[(i - tol) % reglistlen].tag == self.record_seq[counter % recseqlen]:
                    self.match = True
                    self._rec.append(tag)
                    counter -= tol
                    counter += 1
                    break
            # if nothing matched, it is probably out of tolerance
            if not self.match:
                self.separator += 1
                # tolerate up to 10 separators (tags between record boxes)
                if self.separator > 10:
                    self.__verbose("Tag sequence doesnt match, probably out of " +
                                   "tolerance, getting data out of anchors only")
                    # maybe here could be tolerance++
                    # we didnt catch the sequence with tolerance...
                    return self._get_anch_only()
        records.append(self._rec)
        return filter(self._validseq, records)

    def _validseq(self, rec):
        """Return True if the tag sequence *rec* contains a document anchor."""
        for _atr in rec:
            if self._is_deliv_anch(_atr):
                return True
        return False

    def _get_tag_content(self, tag):
        """Get this element's own links and texts (no descendants).
        Returns [links, texts]."""
        links = []
        texts = []
        if tag.tag == 'a':
            href = tag.get('href')
            # if link leading to document found, add string to list
            if href is not None and self.agent.is_wanted_mime(href):
                links.append(self.formatter.format(href))
            title = tag.get('title')
            # if title found in tag, add string to list
            if title:
                texts.append(self.formatter.format(title))
        # if not anchor, search text in tag.text
        if tag.text:
            if re.search("[a-z0-9]", tag.text, re.I):
                texts.append(self.formatter.format(tag.text))
        return [links, texts]

    def _harvest_text(self, record_tag_list):
        """Harvest texts/links out of each tag record and try to recognize a
        deliverable in it (text first, link second)."""
        self._records = []
        self._rec = []
        _links = []
        _texts = []
        # loop over records and search all possible useful texts
        for rec_list in record_tag_list:
            for tag in rec_list:
                harvested = (self._get_tag_content(tag))
                _links.extend(harvested[0])
                _texts.extend(harvested[1])
            res = self._deliv_in_text(_texts, _links)
            if type(res) == RRSPublication:
                self._entriesFoundInText.append(res)
                self.__debug("Record found cause of text")
            else:
                res = self._deliv_in_link(_texts, _links)
                if type(res) == RRSPublication:
                    self._entriesFoundInLinks.append(res)
                    self.__debug("Record found cause of link")
            _links = []
            _texts = []
            self._rec = []
        return self._records

    def _handle_sequence(self):
        """Text harvesting for tag sequences."""
        seq = self._get_tag_sequences()
        return self._harvest_text(seq)

    def _manual_process_page(self, links, baseurl):
        """Get records from the data region of each page in *links*; used
        when the sequence wrapper found nothing.

        :param links: list of page URLs
        :param baseurl: network location of the first page
        """
        _err = None
        self.baseUrl = baseurl
        for link in links:
            # find region with tolerance
            self.parentetree = self.regionHandler.get_region(link, baseurl, 1)
            if type(self.parentetree) == tuple:  # error tuple returned
                _err = self.parentetree
                self.__debug(_err)
                continue
            # make all links absolute in parent tree
            hrefs = self.parentetree.findall('.//a[@href]')
            for href in hrefs:
                href.make_links_absolute('http://' + urlsplit(link)[1] + '/')
            # get the charset. We dont have an etree in htmlHandler,
            # so we have to use the one from regionHandler
            self.formatter.set_charset(self.regionHandler.formatter.get_charset())
            self.__debug("*" * 100 + '\n' + "*" * 40 + " DATA REGION " + "*" * 40)
            self.__debug(lxml.etree.tostring(self.parentetree, pretty_print=True))
            # get root tag if the region is a whole tree
            try:
                self.parentetree = self.parentetree.getroot()
            except:
                pass
            # Parent tag is a table -> call _handle_table
            if self.parentetree.tag in ('table', 'tbody'):
                self.__verbose("Handling table")
                self._handle_table()
            else:
                self.__verbose("Handling sequences")
                self._handle_sequence()

    ############# PUBLIC METHODS TO GET RESULTS

    def get_deliverables_XML(self):
        """Return information about deliverables stored in objects as XML.

        NOTE(review): `StringIO.StringIO` is the Python 2 module; on
        Python 3 this would be `io.StringIO`.
        """
        if len(self.get_deliverables()) == 0:
            return derrno.__err__(derrno.ENOREC)
        output = StringIO.StringIO()
        converter = Model2XMLConverter(stream=output)
        converter.convert(self.get_deliverables())
        result = output.getvalue()
        output.close()
        return result

    def get_deliverables(self):
        """Return the collected record objects, or an error when none."""
        if len(self._records) == 0:
            return derrno.__err__(derrno.ENOREC)
        else:
            return self._records
def __init__(self):
    """Create the helpers this object needs: an HTML parsing agent and a
    text normalizer."""
    # text normalizer (encoding, whitespace cleanup)
    self.formatter = TextFormatUtils()
    # agent used to fetch and parse HTML pages
    self.agent = GetHTMLAndParse()
class GetDeliverableRegion:
    """Locates the region of an HTML page where deliverable documents are
    listed, by finding the common parent of all document anchors and then
    narrowing it around a "deliverables" heading.

    NOTE(review): contains Python-2-only constructs (`basestring`, bare
    `print` statement) — confirm the target interpreter.
    """

    def __init__(self):
        # init agent for parsing html
        self.agent = GetHTMLAndParse()
        # format text
        self.formatter = TextFormatUtils()

    def get_region(self, url, base, tolerance):
        """Get the data region of the page at *url*.

        :param url: page to fetch and analyse
        :param base: base URL used to resolve/select document anchors
        :param tolerance: tag tolerance forwarded to the parent search
        :returns: an lxml ElementTree/element with the region, or an error
            tuple from derrno on failure
        """
        _res = self.agent.ghap(url)
        if len(_res) == 0:
            # NOTE(review): `errmsg` is not defined anywhere in view — this
            # path raises NameError instead of returning an error tuple.
            return derrno.__err__(errmsg)
        else:
            self._page = self.agent.get_etree()
        deliv_elements = self.agent.find_anchor_elem(base=base)
        if len(deliv_elements) == 0:
            return derrno.__err__(derrno.ENODOC, url)
        if len(deliv_elements) == 1:
            # single anchor: that element alone is the region
            return lxml.etree.ElementTree(deliv_elements[0])
        # get parent tag of all deliverable anchors
        parent_element = self._get_common_parent(deliv_elements, tolerance)
        if parent_element == None:
            return derrno.__err__(derrno.ENOREG, "Parent element not found.")
        # get the region out of the parent element
        region = self._get_deliverable_region(parent_element)
        if region == 0:
            # the parent tag itself is the region
            return lxml.etree.ElementTree(parent_element)
        else:
            # NOTE(review): bare `print` — a Python 2 print statement that
            # emits a blank line; on Python 3 it is a no-op expression.
            print
            # lxml.etree.ElementTree(region).write(sys.stdout, pretty_print=True)
            return region

    def _get_common_parent(self, elem_list, tolerance):
        """Find the common parent of all elements in *elem_list* by building
        each element's ancestor vector and comparing them level by level.
        A tolerance of n tags shrinks the region when unrelated PDFs appear
        in other regions of the page.
        """
        # supporting method - kind of bigger lambda: minimal inner length
        def _minlength(seq_list):
            return min([len(seq) for seq in seq_list])

        # check the elements in the list; if (nearly) all are the same
        # element, it is a common parent level - return True.
        def _iscommon(elem_seq, tol):
            tol_list = []
            for elem in elem_seq:
                if not elem in tol_list:
                    tol_list.append(elem)
                if len(tol_list) > tol + 1:
                    return False
            # if only two anchors found then we have only two tags
            # and it is pretty hard to use tolerance, so we omit it.
            if len(elem_seq) < 3 and len(tol_list) > 1:
                return False
            return True

        # get the most frequent tag in the list
        def _most_frequent(seq):
            suplist = []
            suplist_freq = []
            for el in seq:
                if not el in suplist:
                    suplist.append(el)
                    suplist_freq.append(int(1))
                else:
                    suplist_freq[suplist.index(el)] += 1
            ind = suplist_freq.index(max(suplist_freq))
            return suplist[ind]

        #
        # now continue with method _get_common_parent()
        #
        vectors = []  # ancestor vectors - list of lists
        # NOTE(review): `for self.elem in ...` stores the loop variable on
        # the instance; it works but a local variable would be cleaner.
        for self.elem in elem_list:
            _vector = []
            while 1:
                parent = self.elem.getparent()
                if parent == None:
                    break
                _vector.append(parent)
                self.elem = parent
            vectors.append(_vector)
        # We have parent vectors of all elements from elem_list stored in
        # $vectors. Zip the vector list to get sequences of parent tags
        # sorted from the highest to the lowest parent element.
        zipped = [[row[-i] for row in vectors] for i in range(1, _minlength(vectors) + 1)]
        # Check all levels in `zipped`; the last level before the vectors
        # diverge contains the main parent tag.
        self.last_seq = []
        for zipvect in zipped:
            if not _iscommon(zipvect, tolerance):
                # return most frequent element in the last common level
                return _most_frequent(self.last_seq)
            self.last_seq = zipvect
        return _most_frequent(self.last_seq)

    def _get_element_texts(self, elem, string=True):
        """Get texts from *elem* and its descendants.

        :param elem: lxml element
        :param string: when True, return one space-joined string
        """
        texts = []
        for child in elem.iter():
            # NOTE(review): `basestring` is Python-2-only.
            if child.text and isinstance(child.tag, basestring):
                if re.search("[a-z0-9]", child.text, re.I):
                    texts.append(self.formatter.format(child.text))
        if string:
            return " ".join(texts)
        return texts

    def _get_deliverable_region(self, parent_tag):
        """Narrow *parent_tag* down to the deliverable region.

        :returns: 0 when the whole parent_tag is the region (or on failure),
            otherwise an lxml element parsed from the harvested HTML.
        """
        def _convert_tag_to_html(tag):
            tag_html = lxml.etree.ElementTree(tag)
            return lxml.etree.tostring(tag_html)

        # _reg_atr: [0] = tag name or attribute kind, [1] = attribute value,
        # [2] = the matched lxml element (the "Deliverables" heading)
        _reg_atr = ['', None, None]
        self._result_html_region = ''
        reg_flag = False  # True while looping over the region's content
        # get headers first
        headers = []
        for i in range(1, 7):
            headers.extend(parent_tag.findall('.//h' + str(i)))
        children = parent_tag.getchildren()
        if len(headers) > 0:
            # find the <hN> heading that says "deliverables"
            for head in headers:
                text = self._get_element_texts(head)
                if text:
                    if re.search("deliverables", text, re.I):
                        _reg_atr[0] = head.tag
                        _reg_atr[2] = head
                        break
            if _reg_atr[2] == None:
                return 0
            # visit all tags in parent_tag, copying content between the
            # "deliverables" heading and the next heading of the same level
            for tag in parent_tag.iterdescendants():
                if tag.tag == 'img':
                    continue
                text = self._get_element_texts(tag)
                if tag.tag == 'a' and not tag.text:
                    if tag.find('img') is not None:
                        text = tag.find('img').tail
                    else:
                        text = ' '
                if text:
                    if re.search("deliverables", text, re.I) and \
                       tag.tag == _reg_atr[0]:
                        # "deliverables" title, BEGIN of region
                        reg_flag = True
                    elif not re.search("deliverables", text, re.I) and \
                         tag.tag == _reg_atr[0]:
                        # next similar title, END of region
                        if reg_flag:
                            break
                # region content
                if tag in children and reg_flag:
                    self._result_html_region += _convert_tag_to_html(tag)
                elif tag.getparent() in children and reg_flag:
                    self._result_html_region += _convert_tag_to_html(tag.getparent())
                    children.remove(tag.getparent())
        # if we dont have headers, try to find another kind of title element
        # saying "Deliverables" and compare by its class/id/style attribute.
        else:
            for tag in parent_tag.iter():
                if tag.text:
                    if re.search("deliverables", tag.text, re.I):
                        if tag.get("class"):
                            _reg_atr[0] = 'class'
                            _reg_atr[1] = tag.get("class")
                            _reg_atr[2] = tag
                            break
                        elif tag.get("id"):
                            _reg_atr[0] = 'id'
                            _reg_atr[1] = tag.get("id")
                            _reg_atr[2] = tag
                            break
                        elif tag.get("style"):
                            _reg_atr[0] = 'style'
                            _reg_atr[1] = tag.get("style")
                            _reg_atr[2] = tag
                            break
            # test _reg_atr. If there is no deliverable region, then all
            # documents make the region
            if _reg_atr[2] == None:
                return 0
            reg_flag = False
            # visit all tags in parent_tag
            for tag in parent_tag.iterdescendants():
                if tag.tag == 'a' and not tag.text:
                    if tag.find('img') is not None:
                        tag.text = tag.find('img').tail
                    else:
                        tag.text = ' '
                if tag.text:
                    if re.search("deliverables", tag.text, re.I) and \
                       tag.get(_reg_atr[0]) == _reg_atr[1]:
                        # "deliverables" title, BEGIN of region
                        reg_flag = True
                    elif not re.search("deliverables", tag.text, re.I) and \
                         tag.get(_reg_atr[0]) == _reg_atr[1]:
                        # next similar title, END of region
                        if reg_flag:
                            break
                # region content
                if tag in children and reg_flag:
                    self._result_html_region += _convert_tag_to_html(tag)
                    children.remove(tag)
                elif tag.getparent() in children and reg_flag:
                    self._result_html_region += _convert_tag_to_html(tag.getparent())
                    children.remove(tag.getparent())
        if not self._result_html_region:
            return 0
        # create an element from the harvested region HTML; fall back to the
        # forgiving HTML parser when it is not well-formed XML
        try:
            return lxml.etree.fromstring(self._result_html_region)
        except:
            try:
                parser = lxml.etree.HTMLParser()
                return lxml.etree.fromstring(self._result_html_region, parser)
            except lxml.etree.XMLSyntaxError:
                return 0
class GetDelivPage:
    """Crawl a project web site and locate the page that most likely
    lists the project deliverables.

    Links are kept in ``self._link_stack`` as
    ``{url: [index_flag, visited_flag, rank]}`` and ranked by keyword
    matches in anchors, titles and document links; the highest-ranked
    URL wins.
    """

    def __init__(self, url, verbose=False, debug=False, addkeyw=None):
        """Set up the crawl starting at *url*.

        :param url: base URL of the project site.
        :param verbose: print progress messages when True.
        :param debug: print debug messages when True.
        :param addkeyw: optional extra keyword regex for page search.
        :raises ValueError: when *verbose* is not a bool.
        """
        # keyword regexes used for document page search, most
        # significant first ("deliverable" variants)
        self._sigwords = ["d((eliverables?)|[0-9])",
                          "documents?",
                          "reports?",
                          "public(ation)?s?",
                          "results?",
                          "presentations?",
                          "library",
                          #"projects?",
                          "outocomes?",
                          "downloads?",
                          "outputs?"]
        if addkeyw is not None:
            self._sigwords.append(addkeyw)
        # Associative array containing links with their flags:
        #   { url : [Index/NoIndex/Frame, Visit/Visited, Rank] }
        # index = 0, noindex = 1, frame = 2 (3 = broken/special),
        # unvisited = 0, visited = 1
        self._link_stack = {url: [0, 0, 0]}
        self.base_url = url  # save base (input) url
        # parsing agent to get needed data from pages
        self.agent = GetHTMLAndParse()
        self._current_url = url
        # constant used to scale rank by the importance of the matched
        # expression in self._sigwords
        self.rank_const = len(self._sigwords)
        # named indexes into the _link_stack value lists
        self.IND_FR = 0  # index/noindex/frame/special
        self.VISIT = 1   # unvisited/visited
        self.RANK = 2    # value of rank
        self.__verbose__ = verbose
        self.__dbg__ = debug
        # checking data types (original only validates the verbose flag)
        if not type(self.__verbose__) == bool:
            raise ValueError("Verbose flag has to be boolean.")

    def __verbose(self, msg):
        """Print *msg* when verbose mode is on; tolerate bad encodings."""
        _err = "cannot decode verbose message."
        if self.__verbose__ == True:
            try:
                print(str(msg))
            except UnicodeError:
                print(_err)

    def __debug(self, msg):
        """Print *msg* when debug mode is on; tolerate bad encodings."""
        _err = "cannot decode debug info."
        if self.__dbg__ == True:
            try:
                print("Debug message: " + str(msg))
            except UnicodeError:
                print(_err)

    ############################################################################

    def _link_item_init__(self, link, index=1, visit=0, rank=0):
        """Initialize *link* in the stack (default: noindex/unvisited/0).

        Existing entries are left untouched.
        """
        # FIX: dict.has_key() is Python-2-only; use the `in` operator.
        if link not in self._link_stack:
            self._link_stack[link] = [index, visit, rank]
        return

    def _link_item_edit(self, link, index=None, visit=None, rank=None):
        """Edit flags of an existing item in self._link_stack.

        ``rank=0`` resets the rank; any other rank is added to the
        current value.
        """
        if index is not None:
            self._link_stack[link][self.IND_FR] = index
        if visit is not None:
            self._link_stack[link][self.VISIT] = visit
        if rank is not None:
            if rank == 0:
                # null rank when zero is the argument
                self._link_stack[link][self.RANK] = 0
            else:
                # add rank
                self._link_stack[link][self.RANK] += rank
        return

    def _level_job(self, index=None):
        """One level of the cascade: search anchors for one keyword.

        :param index: index into self._sigwords, or None to take all
            links on the page.
        :returns: number of new links added to the stack.
        """
        result = 0
        if index is not None:
            # search with one keyword regex
            link_list = self.agent.get_all_links(
                regul=re.compile(self._sigwords[index], re.I),
                base=self._current_url)
        else:
            link_list = self.agent.get_all_links(base=self._current_url)
            index = self.rank_const
        if link_list:
            #
            # RANK giving & filter
            #
            # NOTE(review): index can no longer be None here (set above),
            # so rank = 0 is dead; kept for fidelity with the original.
            if index is None:
                rank = 0
            elif index == 0:
                rank = self.rank_const * 2
            else:
                rank = self.rank_const - index
            for link in link_list:
                # skip javascript: and mailto: pseudo-links
                if not link or "javascript:" in link or "mailto:" in link:
                    continue
                if "#" in link:
                    # strip fragment identifier
                    link = re.sub('#.*$', '', link)
                if len(link) > 200:
                    continue
                if self._link_stack.get(link):
                    # known link: rank the referrer only when the link is
                    # still unvisited
                    if self._link_stack[link][self.VISIT] == 0:
                        self._link_item_edit(self._current_url, rank=rank)
                    continue
                if not self.agent.compare_domains(self.base_url, link):
                    continue
                split_link = re.sub("https?://.+?/", "", link)
                # check whether it is a file or not
                if self.agent.is_wanted_mime(link):
                    #
                    # some PDF or DOC found
                    #
                    self._link_item_edit(self._current_url, rank=10)
                    self.__debug("Added rank 10 to " + self._current_url)
                    # deliverable-looking file name => big rank boost
                    if re.search(r"de?l?(iverable)?[0-9]+([\._-][0-9])?",
                                 split_link, re.I):
                        self.__debug("Type D on " + self._current_url)
                        self._link_item_edit(self._current_url, rank=100)
                    continue
                elif not self.agent.is_page(link):
                    continue
                self.__debug("UNWATED")
                #
                # add the new page link
                #
                self._link_item_init__(link)
                self._link_item_edit(self._current_url, rank=rank)
                result += 1
                self.__debug("ADD " + link[7:60])
                self.__debug("Rank " + str(rank) + " " + self._current_url)
        return result

    def _cascade_search(self):
        """Cascade search over keywords, stopping at the first hit level.

        May improve the speed of the script.
        """
        result = 0
        # first cascade - look for links containing "deliverables"
        result += self._level_job(0)
        if not result == 0:
            return
        # second cascade - look for links containing documents/reports
        result += self._level_job(1)
        result += self._level_job(2)
        if not result == 0:
            return
        # last cascade - all the rest
        for i in range(3, self.rank_const):
            result += self._level_job(i)
        # check intro page (all links) only on the index page
        if result == 0 and self._link_stack[self._current_url][0] == 0:
            result += self._level_job()
        return

    def _repair_links(self, base=None):
        """Try to repair links; for now only append '/' to the base."""
        if base is None:
            base = self.base_url
        if re.match(".*[^/]$", base):
            base += "/"
        if self.agent.get_etree() == -1:
            return -1
        links = self.agent.get_all_links(base=base)
        # compare each link with the base url
        for link in links:
            if not self.agent.compare_domains(self.base_url, link):
                continue
            link = re.sub("https?://.+?/", base, link)
            # if match, save it as a special case (index 3)
            self._link_item_init__(link, index=3)

    def _check_intro(self):
        """Check an intro page (no content, only an Enter label)."""
        links = self.agent.get_all_links(base=self._current_url)
        self.__debug("We've found intro links: " + str(links))
        for link in links:
            if not self.agent.compare_domains(self.base_url, link):
                continue
            # save new link as a normal page
            self._link_item_init__(link, index=1)

    def _check_frames(self):
        """Look for frames on the page; returns frame count or None."""
        frames = self.agent.look_for_frame(base=self._current_url)
        if not frames:
            return None
        fcount = len(frames)
        self.__debug("We've found frames (" + str(fcount) + ") on "
                     + self._current_url)
        # save each same-domain frame target as a frame page (index 2)
        for link in frames:
            if self.agent.compare_domains(self._current_url, link):
                self._link_item_init__(link, index=2)
        return fcount

    def _check_titles(self):
        """Check headers for keywords and rank the current page."""
        for i in range(self.rank_const):
            hcount = self.agent.count_all_headers(
                re.compile(self._sigwords[i], re.I))
            if not hcount == 0:
                if i == 0:
                    # "deliverable" match => the highest rank (x4)
                    self.__debug("deliverable match"
                                 + str(self.rank_const * 4) + " "
                                 + self._current_url)
                    self._link_item_edit(self._current_url,
                                         rank=self.rank_const * 4)
                else:
                    # other word match => plain rank
                    self.__debug("Rank " + str(self.rank_const - i) + " "
                                 + self._current_url)
                    self._link_item_edit(self._current_url,
                                         rank=self.rank_const - i)

    def _check_anchor(self):
        """Check the anchor text/title of the current link.

        :returns: 1 when it matches "deliverables", 0 otherwise.
        """
        # tt is Text and Title
        tt = self.agent.get_anchor_from_link(self._current_url)
        if tt == 0:
            return tt
        if re.search(self._sigwords[0], tt, re.I):
            self.__debug("Anchor matched " + self._current_url)
            return 1
        # FIX: original fell off the end returning None; 0 is the
        # documented "no match" value (both are falsy to callers).
        return 0

    def _check_unvisited_links(self):
        """Return the list of still-unvisited links."""
        unvisitedLinks = []
        for link in self._link_stack:
            if self._link_stack[link][self.VISIT] == 0:
                unvisitedLinks.append(link)
        return unvisitedLinks

    def _handle_unvis_links(self):
        """Apply all checks to unvisited links - next search level.

        This is the main private method; only it decides the end of the
        search. Returns None when nothing is left to visit, else 1.
        """
        unvisLinks = self._check_unvisited_links()
        if not unvisLinks:
            return None  # end of searching
        for link in unvisLinks:
            # visit and parse the page
            self._link_item_edit(link, visit=1)
            (res, err) = self.agent.ghap(link)
            if res == -1:
                self.__debug(str(err) + " " + str(link))
                # if link is broken (IND_FR == 3) do not try to repair
                if self._link_stack[link][self.IND_FR] != 3:
                    self._repair_links()
                continue
            # little hack with the error slot: no error but a new URL!
            if res == 2:
                self.base_url = err  # URL of the new base
            self.__debug("Getting url in ghap(): " + str(link))
            self.__verbose("Searching... URL: " + str(link))
            self._current_url = link
            if self._link_stack[link][self.IND_FR] == 2:
                dname = self.agent.get_domain_name(link)
                if dname is not None:
                    self.base_url = dname
            # frame check
            self._check_frames()
            # titles check - rank giving here
            self._check_titles()
            # anchor check - rank giving here too
            if self._check_anchor():
                self._link_item_edit(link, rank=10)
            # search for next links on this page
            self._cascade_search()
        return 1

    def _get_highest_ranks_link(self):
        """Return the link with the highest rank (called at the end)."""
        hRank = 0
        hLink = ""
        for link in self._link_stack:
            if self._link_stack[link][self.RANK] > hRank:
                hLink = link
                hRank = self._link_stack[link][self.RANK]
        return hLink  # WINNER

    def _get_deliv_link_list(self, first_link):
        """Follow pager links from *first_link* and collect all pages
        with deliverables (pages usually look like: next pages 1 2 3 4).
        """
        final_list = []
        nonvisited = [first_link]
        current = nonvisited.pop()
        while current:
            if not current or "javascript:" in current \
                    or "mailto:" in current:
                # FIX: narrowed bare except to IndexError (empty pop)
                try:
                    current = nonvisited.pop()
                except IndexError:
                    break
                continue
            if self.agent.ghap(current)[0] == -1:
                try:
                    current = nonvisited.pop()
                except IndexError:
                    break
                continue
            nonvisited.extend(self.agent.get_pager_links(base=current))
            final_list.append(current)  # append only one link
            try:
                current = nonvisited.pop()
            except IndexError:
                break
        return final_list  # all pages with deliverables

    def get_deliverable_page(self):
        """Return a list with the deliverable-page link, or an error.

        The only public method in the module.
        """
        # main searching loop: while there are unvisited links, search
        while self._handle_unvis_links():
            # security case - do not crawl forever
            if len(self._link_stack) > 10:
                break
            self.__debug("Stack content: " + str(self._link_stack))
        if len(self._link_stack) == 1:
            return derrno.__err__(derrno.ELNOTFOUND)
        final_link = self._get_highest_ranks_link()
        if not final_link or self._link_stack[final_link][2] == 0:
            return derrno.__err__(derrno.ELNOTFOUND)
        self.__debug('#' * 79)
        self.__debug("DELIVERABLE PAGE: " + final_link)
        # NOTE(review): the original continued with an unreachable call
        # to _get_deliv_link_list() marked "not in use"; removed.
        return [final_link]
class GetDelivRecords:
    """Extract deliverable records (title, description, document link)
    from deliverable pages, via HTMLSequenceWrapper output or, failing
    that, by manual region/table/tag-sequence processing.
    """

    def __init__(self, verbose=False, debug=False):
        self.__dbg__ = debug
        # FIX: the flag was stored as `self.__verbos` but __verbose()
        # tested `self.__verbose` (the bound method, always truthy-typed
        # but compared == True, i.e. always False) - verbose output was
        # never printed. One consistent name now.
        self.__verbose__ = verbose
        self._crawler = Crawler()
        self._crawler.set_headers((
            ('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.19) Gecko/2010040116 Ubuntu/9.04 (jaunty) Firefox/3.0.19'),
            ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')))
        self._wraper = HTMLSequenceWrapper(childcoef=5.0, headercoef=3.0,
                                           mintextlen=20)
        # titles that carry no information about the deliverable itself
        self._unwanted_titles = ['Download here', 'PDF format']
        self._records = []
        ################################
        # manual processing
        self.agent = GetHTMLAndParse()
        # to get the region where to search for records
        self.regionHandler = GetDeliverableRegion()
        # text formatter (encoding, erasing white chars etc.)
        self.formatter = TextFormatUtils()
        self._omitted_tags = ('br', 'img', 'html', 'body')
        # tag tolerance for sequence matching
        self.tagtol = 1

    def __debug(self, msg):
        """Print *msg* when debug mode is on; tolerate bad encodings."""
        _err = "cannot decode debug info."
        if self.__dbg__ == True:
            try:
                print("Debug message: " + str(msg))
            except UnicodeError:
                print(_err)

    def __verbose(self, msg):
        """Print *msg* when verbose mode is on; tolerate bad encodings."""
        # FIX: message said "debug info" and the flag test compared the
        # method itself (see __init__); both corrected.
        _err = "cannot decode verbose message."
        if self.__verbose__ == True:
            try:
                print("Verbose: " + str(msg))
            except UnicodeError:
                print(_err)

    ###################### Processing sequencewrapper output ###################

    def _make_deliv_record(self, entry):
        """Try to build a deliverable record from one wrapper entry.

        :returns: True when a record was created, False otherwise.
        """
        text = []
        links = []
        # harvest links and text from the entry
        for e in entry.iter():
            if e.text is not None:
                text.append(e.text)
            if e.attrib.get("link") is not None:
                if self.agent.is_wanted_mime(e.attrib.get("link")) \
                        and e.attrib.get("link") not in links:
                    links.append(e.attrib.get("link"))
        res = self._deliv_in_text(text, links)
        if isinstance(res, RRSPublication):
            self._entriesFoundInText.append(res)
            self.__debug("Record found cause of text")
            return True
        elif isinstance(res, list):
            # error list => possibly several records in one entry
            res = self._more_entry_in_record(entry)
            if res == True:
                self.__debug("")
                return True
            else:
                return False
        res = self._deliv_in_link(text, links, entry)
        if isinstance(res, RRSPublication):
            self._entriesFoundInLinks.append(res)
            self.__debug("Record found cause of link")
            return True
        return False

    def _deliv_in_text(self, text, links):
        """Look for a deliverable keyword in the entry's texts.

        :returns: an RRSPublication on success, False when the entry is
            not a deliverable, or an error list when the entry probably
            holds several records.
        """
        _title = False
        _description = ""
        pattern = re.compile(
            r"(DELIVERABLES?)|(D[0-9][0-9]*(.[0-9][0-9]*)?)", re.I)
        # first keyword hit is the title; longest string the description
        for t in text:
            if _title == False:
                if pattern.search(t):
                    _title = t
            if len(_description) < len(t):
                _description = t
        if _title == _description:
            _description = ""
        _link = False
        if type(links) == str:
            if self.agent.is_wanted_mime(links):
                _link = links
        elif type(links) == list:
            for l in links:
                if self.agent.is_wanted_mime(l):
                    if _link == False:
                        _link = l
                    else:
                        # a second document link: same basename means the
                        # same document in another format, else bail out.
                        # FIX: string-module s.rfind() (py2-only) replaced
                        # with the str method.
                        if _link[:_link.rfind('.')] == l[:l.rfind('.')]:
                            break
                        else:
                            return ['-3',
                                    'Probably more records in one entry']
        # create the object
        if _title:
            pub = RRSPublication(title=_title, abstract=_description)
            _typ = RRSPublication_type(type='techreport')
            pub['type'] = _typ
            self.__debug("*" * 40)
            self.__debug("Title: " + _title)
            self.__debug("Description: " + _description)
            if _link:
                self.__debug("Link: " + _link)
                l = RRSUrl(link=_link)
                pl_rel = RRSRelationshipPublicationUrl()
                pl_rel.set_entity(l)
                pub['url'] = pl_rel
            return pub
        else:
            # this entry is probably not a deliverable
            return False

    def _deliv_in_link(self, text, links, entry=False):
        """Look for a deliverable keyword in the entry's links.

        :returns: an RRSPublication on success, False when the entry is
            not a deliverable, or an error list when the entry probably
            holds several records.
        """
        _title = False
        _description = ""
        pattern = re.compile(
            r"(DELIVERABLES?)|(D[0-9][0-9]*(.[0-9][0-9]*)?)", re.I)
        _link = False
        for l in links:
            if pattern.search(l):
                if _link == False:
                    _link = l
                else:
                    return ['-3', 'Probably more records in one entry']
        # first reasonably long text is the title; longest the description
        for t in text:
            if _title == False:
                if len(t) > 10:
                    _title = t
            if len(_description) < len(t):
                _description = t
        if _title == _description:
            _description = ""
        # if the chosen title is not valid, look in the parent entry
        if _title and not self._check_title(_title) and entry != False:
            _title = self._repair_title(entry)
        if _link:
            pub = RRSPublication(title=_title, abstract=_description)
            typ = RRSPublication_type(type='techreport')
            pub['type'] = typ
            self.__debug("*" * 40)
            self.__debug("Title: " + _title)
            self.__debug("Description: " + _description)
            self.__debug("Link: " + _link)
            l = RRSUrl(link=_link)
            pl_rel = RRSRelationshipPublicationUrl()
            pl_rel.set_entity(l)
            pub['url'] = pl_rel
            return pub
        else:
            # this entry is probably not a deliverable
            return False

    def _check_title(self, title, tolerance=10):
        """Return True unless *title* is only an unwanted string
        (within *tolerance* extra characters).
        """
        # FIX: string-module s.find()/s.lower() (py2-only) replaced with
        # str methods.
        for t in self._unwanted_titles:
            if title.lower().find(t.lower()) != -1:
                if (len(t) + tolerance) > len(title):
                    return False
        return True

    def _repair_title(self, entry):
        """Return the text of the most visible element in the entry's
        parent, or False when none is found."""
        parent = entry.getparent()
        visibility = 0
        title = ""
        for i in parent.iter():
            try:
                # FIX: 'visibility' is an attribute string; comparing it
                # to an int raises TypeError on py3 (uncaught by the
                # original except AttributeError). Coerce to float.
                vis = float(i.attrib.get('visibility'))
                if vis > visibility:
                    visibility = vis
                    title = i.text
            except (AttributeError, TypeError, ValueError):
                pass
        if title != "":
            return title
        else:
            return False

    def _more_entry_in_record(self, entry):
        """Create deliverables from each document chunk of one entry.

        NOTE(review): appends to self._entriesFoundInLinks and returns
        None, so the caller's `res == True` check never fires - kept
        as-is to preserve behavior.
        """
        for ch in entry.iter('chunk'):
            if ch.text is not None and ch.attrib.get("link") is not None:
                if self.agent.is_wanted_mime(ch.attrib.get("link")):
                    _pub = RRSPublication(title=ch.text)
                    typ = RRSPublication_type(type='techreport')
                    _pub['type'] = typ
                    _l = RRSUrl(link=ch.attrib.get("link"))
                    _rel = RRSRelationshipPublicationUrl()
                    _rel.set_entity(_l)
                    _pub['url'] = _rel
                    self._entriesFoundInLinks.append(_pub)

    def process_pages(self, pages):
        """Process the pages given by *pages* (list of URLs) and fill
        self._records with RRSPublication objects."""
        self._entriesFoundInText = []
        self._entriesFoundInLinks = []
        self._urls = pages
        self._pages = self._crawler.start(pages)
        # create RRSPublication objects describing the deliverables
        for u in self._urls:
            self._wraper.wrap(self._pages[u], u)
            self._tree = self._wraper.get_etree()
            for entry in self._tree.iter("entry"):
                self._make_deliv_record(entry)
        if len(self._entriesFoundInText) > 3:
            self.__debug("Deliverbles descriptions content keywords")
            self.__debug("Found " + "{0}".format(
                len(self._entriesFoundInText)) + " deliv records")
            self._records = self._entriesFoundInText
        elif len(self._entriesFoundInLinks) > 3:
            self.__debug("Deliverbles links content keywords")
            self.__debug("Found " + "{0}".format(
                len(self._entriesFoundInLinks)) + " deliv records")
            self._records = self._entriesFoundInLinks
        else:
            self._manual_processing()

    def _manual_processing(self):
        """Fallback used when the sequencewrapper output had no records."""
        self._entriesFoundInLinks = []
        self._entriesFoundInText = []
        self._manual_process_page(self._urls, urlsplit(self._urls[0])[1])
        if len(self._entriesFoundInText) > 0:
            self.__debug("Deliverbles descriptions content keywords")
            self.__debug("Found " + "{0}".format(
                len(self._entriesFoundInText)) + " deliv records")
            self._records = self._entriesFoundInText
        elif len(self._entriesFoundInLinks) > 0:
            self.__debug("Deliverbles links content keywords")
            self.__debug("Found " + "{0}".format(
                len(self._entriesFoundInLinks)) + " deliv records")
            self._records = self._entriesFoundInLinks

    ######################### TABLE HANDLING METHODS ###########################

    def _get_descendats_texts(self, elem, string=True):
        """Get texts from *elem* and its descendants.

        When *string* is True the texts are joined with spaces.
        """
        texts = []
        for child in elem.iter():
            # FIX: basestring is py2-only; comment/PI nodes have non-str
            # tags and are skipped by the isinstance test.
            if child.text and isinstance(child.tag, str):
                if re.search("[a-z0-9]", child.text, re.I):
                    texts.append(self.formatter.format(child.text))
        if string:
            return " ".join(texts)
        return texts

    def _get_row_link(self, row):
        """Return the first document href found in a table row, if any."""
        linkanch = row.findall('.//a[@href]')
        if len(linkanch) == 0:
            return None
        for link in linkanch:
            anchor_link = link.get('href')
            # only files we want
            if self.agent.is_wanted_mime(anchor_link):
                return anchor_link
        return None

    def _handle_table(self):
        """Treat the region as a table and pull a record out of each
        row that carries a document link."""
        for row in self.parentetree:
            if not row.tag == 'tr':
                continue
            row_list = []
            _thislink = self._get_row_link(row)
            if _thislink is None:
                continue
            for column in row:
                text = self._get_descendats_texts(column)
                if not text:
                    continue
                row_list.append(text)
            res = self._deliv_in_text(row_list, [_thislink])
            if isinstance(res, RRSPublication):
                self._entriesFoundInText.append(res)
                self.__debug("Record found cause of text")
            else:
                res = self._deliv_in_link(row_list, [_thislink])
                if isinstance(res, RRSPublication):
                    self._entriesFoundInLinks.append(res)
                    self.__debug("Record found cause of link")
            del row_list
        return

    #################### TAG SEQUENCE RECOGNIZING METHODS ######################

    def _is_deliv_anch(self, tag):
        """True when *tag* is an anchor whose href leads to a document."""
        if tag.tag == 'a':
            href = tag.get('href')
            if self.agent.is_wanted_mime(href):
                return True
        return False

    def _tagfilter(self, tag):
        """Filter out useless/messy tags; True when the tag is usable."""
        if tag.tag in self._omitted_tags:
            return False
        return True

    def _getdiff(self, reg, tol):
        """Return the typical tag-distance between document anchors in
        *reg*, or -1 when the variety exceeds tolerance *tol*."""
        d = {}
        index = 0
        # fill the dictionary with differences and their occurrences
        for tag in reg.iter():
            if not self._tagfilter(tag):
                continue
            if self._is_deliv_anch(tag) and not index == 0:
                # FIX: bare except narrowed to KeyError (missing key)
                try:
                    d[index] += 1
                except KeyError:
                    d[index] = 1
                index = 0
            index += 1
        # FIX: py3 dict views do not support indexing; materialize once
        difflist = list(d.keys())
        self.__debug("difflist: " + str(difflist))
        if len(difflist) == 0:
            return -1
        _max = max(difflist)
        _min = min(difflist)
        dlen = len(difflist)
        if dlen == 1:
            return difflist[0]
        if dlen > ((2 * tol) + 1):
            # tolerance to both sides
            return -1
        if (_max - _min) > 2 * tol:
            # some acceptable tolerance
            return -1
        # return the most frequent difference
        most_freq = max(d.values())
        for key in d:
            if d[key] == most_freq:
                return key
        return -1

    def _get_anch_only(self):
        """Only anchors found - no optional information."""
        anchlist = self.agent.find_anchor_elem(self.baseUrl,
                                               self.parentetree)
        # list of lists because of XMLOutput
        return [[anch] for anch in anchlist]

    def _get_tag_sequences(self, tag_tol=1):
        """Recognize repeated tag sequences (records) in the region.

        Main sequence method; returns a list of records, each a list of
        tags belonging to one deliverable.
        """
        records = []
        self._rec = []
        if len(self.parentetree) == 0:
            return [[self.parentetree]]
        # interval between anchors, widened until a stable one is found
        self.difference = self._getdiff(self.parentetree, self.tagtol)
        while self.difference == -1:
            if self.tagtol > 5:
                self.__verbose(
                    "Variety of intervals between anchors is too huge. "
                    "Getting data out of anchors only")
                return self._get_anch_only()
            self.tagtol += 1
            self.difference = self._getdiff(self.parentetree, self.tagtol)
        # take the first n tags (n = interval) as the candidate
        # tag-sequence describing all records in the region
        self.record_seq = []
        i = 0
        for tag in self.parentetree.iter():
            if not self._tagfilter(tag):
                continue
            if i >= self.difference:
                if not 'a' in self.record_seq:
                    del self.record_seq[0]
                else:
                    break
            self.record_seq.append(tag.tag)
            i += 1
        # counter = position inside the candidate sequence
        counter = 0
        # FIX: py3 filter() is lazy; len() below needs a list
        regionlist = [tag for tag in self.parentetree.iter()
                      if self._tagfilter(tag)]
        recseqlen = len(self.record_seq)
        reglistlen = len(regionlist)
        # records may start after some leading garbage
        self.begin = False
        # unpredictable separators between deliverable records
        self.separator = 0
        for i, tag in enumerate(regionlist):
            # sequence complete - store it and start the next record
            if counter > self.difference - 1:
                records.append(self._rec)
                self._rec = []
                counter = 0
            if not self.begin:
                if tag.tag != self.record_seq[0]:
                    continue
                else:
                    try:
                        if regionlist[i + 1].tag != self.record_seq[1]:
                            continue
                    except IndexError:
                        pass
                    self.begin = True
            # tolerance algorithm: try to pass irregular tags by
            # comparing neighbouring positions in both sequences
            self.match = False
            for tol in range(self.tagtol + 1):
                if tag.tag == self.record_seq[(counter + tol) % recseqlen] or \
                   regionlist[(i + tol) % reglistlen].tag == \
                        self.record_seq[counter % recseqlen]:
                    self.match = True
                    self._rec.append(tag)
                    counter += tol + 1
                    break
                elif tag.tag == self.record_seq[(counter - tol) % recseqlen] or \
                     regionlist[(i - tol) % reglistlen].tag == \
                        self.record_seq[counter % recseqlen]:
                    self.match = True
                    self._rec.append(tag)
                    counter -= tol
                    counter += 1
                    break
            if not self.match:
                # probably a separator between record boxes/tables;
                # tolerate up to 10 of them
                self.separator += 1
                if self.separator > 10:
                    self.__verbose(
                        "Tag sequence doesnt match, probably out of "
                        "tolerance, getting data out of anchors only")
                    return self._get_anch_only()
        records.append(self._rec)
        # FIX: return a list (py3 filter() is lazy)
        return [rec for rec in records if self._validseq(rec)]

    def _validseq(self, rec):
        """True when the tag sequence *rec* contains a document anchor."""
        for _atr in rec:
            if self._is_deliv_anch(_atr):
                return True
        return False

    def _get_tag_content(self, tag):
        """Get the tag's own text/links (no descendants).

        :returns: [links, texts] harvested from the single tag.
        """
        links = []
        texts = []
        if tag.tag == 'a':
            href = tag.get('href')
            # link leading to a document
            if href is not None and self.agent.is_wanted_mime(href):
                links.append(self.formatter.format(href))
            title = tag.get('title')
            if title:
                texts.append(self.formatter.format(title))
        if tag.text:
            if re.search("[a-z0-9]", tag.text, re.I):
                texts.append(self.formatter.format(tag.text))
        return [links, texts]

    def _harvest_text(self, record_tag_list):
        """Harvest texts out of the recognized tag records and build
        publication entries from them."""
        self._records = []
        self._rec = []
        _links = []
        _texts = []
        # loop over records and collect all possibly useful texts
        for rec_list in record_tag_list:
            for tag in rec_list:
                harvested = self._get_tag_content(tag)
                _links.extend(harvested[0])
                _texts.extend(harvested[1])
            res = self._deliv_in_text(_texts, _links)
            if isinstance(res, RRSPublication):
                self._entriesFoundInText.append(res)
                self.__debug("Record found cause of text")
            else:
                res = self._deliv_in_link(_texts, _links)
                if isinstance(res, RRSPublication):
                    self._entriesFoundInLinks.append(res)
                    self.__debug("Record found cause of link")
            _links = []
            _texts = []
            self._rec = []
        return self._records

    def _handle_sequence(self):
        """Text harvesting for tag sequences."""
        seq = self._get_tag_sequences()
        return self._harvest_text(seq)

    def _manual_process_page(self, links, baseurl):
        """Get records from the document-link region of each page; used
        when the sequencewrapper output held no records."""
        _err = None
        self.baseUrl = baseurl
        for link in links:
            # find the region, with tolerance 1
            self.parentetree = self.regionHandler.get_region(link, baseurl, 1)
            if type(self.parentetree) == tuple:
                # error tuple returned
                _err = self.parentetree
                self.__debug(_err)
                continue
            # make all links in the region absolute
            hrefs = self.parentetree.findall('.//a[@href]')
            for href in hrefs:
                href.make_links_absolute('http://' + urlsplit(link)[1] + '/')
            # we don't have an etree in htmlHandler, so reuse the charset
            # detected by the regionHandler
            self.formatter.set_charset(
                self.regionHandler.formatter.get_charset())
            self.__debug("*" * 100 + '\n' + "*" * 40 + " DATA REGION "
                         + "*" * 40)
            self.__debug(lxml.etree.tostring(self.parentetree,
                                             pretty_print=True))
            # get the root tag if we were handed an ElementTree
            try:
                self.parentetree = self.parentetree.getroot()
            except AttributeError:
                pass
            if self.parentetree.tag in ('table', 'tbody'):
                self.__verbose("Handling table")
                self._handle_table()
            else:
                self.__verbose("Handling sequences")
                self._handle_sequence()

    ################## PUBLIC METHODS TO GET RESULTS ###########################

    def get_deliverables_XML(self):
        """Return information about the found deliverables as XML."""
        if len(self.get_deliverables()) == 0:
            return derrno.__err__(derrno.ENOREC)
        # FIX: StringIO.StringIO is py2-only; io.StringIO is the py3
        # equivalent (local import keeps the file-level imports intact)
        from io import StringIO
        output = StringIO()
        converter = Model2XMLConverter(stream=output)
        converter.convert(self.get_deliverables())
        result = output.getvalue()
        output.close()
        return result

    def get_deliverables(self):
        """Return the objects containing the found information."""
        if len(self._records) == 0:
            return derrno.__err__(derrno.ENOREC)
        else:
            return self._records
class GetDeliverableRegion:
    """Locate the DOM region of a project page that lists deliverables.

    Downloads a page, finds anchors pointing to deliverable documents and
    narrows the parsed HTML down to the smallest subtree ("region") that
    contains them, returned as an lxml ElementTree.
    """

    def __init__(self):
        # init agent for parsing html
        self.agent = GetHTMLAndParse()
        # format text
        self.formatter = TextFormatUtils()

    def get_region(self, url, base, tolerance):
        """Get data region.

        Returns an element tree with the region where deliverables are
        stored, or a derrno error value on failure.

        url: page to download and search
        base: base URL used when filtering deliverable anchors
        tolerance: tag tolerance forwarded to _get_common_parent()
        """
        _res = self.agent.ghap(url)
        if len(_res) == 0:
            # NOTE(review): `errmsg` is undefined in this scope, so a failed
            # download raises NameError instead of returning a derrno error.
            # Probably meant a derrno constant -- confirm and fix.
            return derrno.__err__(errmsg)
        else:
            self._page = self.agent.get_etree()
        deliv_elements = self.agent.find_anchor_elem(base=base)
        if len(deliv_elements) == 0:
            return derrno.__err__(derrno.ENODOC, url)
        # a single anchor is its own region
        if len(deliv_elements) == 1:
            return lxml.etree.ElementTree(deliv_elements[0])
        # get parent tag of all deliverable anchors
        parent_element = self._get_common_parent(deliv_elements, tolerance)
        if parent_element == None:
            return derrno.__err__(derrno.ENOREG, "Parent element not found.")
        # get the region out of the parent element
        region = self._get_deliverable_region(parent_element)
        # if parent tag is region
        if region == 0:
            # return element tree made from parent tag
            return lxml.etree.ElementTree(parent_element)
        else:
            print
            #lxml.etree.ElementTree(region).write(sys.stdout,pretty_print=True)
            return region  # else return region

    def _get_common_parent(self, elem_list, tolerance):
        """Stabile searching parent of all elements in elem_list using method
        of making element parent vectors and comparing them. Tolerance of n
        tags makes the region smaller if there are >>not deliverable<< pdfs
        in more regions on the page."""

        # supporting method - kind of bigger lambda. Get minimal length of
        # inside lists.
        def _minlength(seq_list):
            return min([len(seq) for seq in seq_list])

        # next supporting method: check the elements in list.
        # if elements are the same, its common parent tag - return True.
        def _iscommon(elem_seq, tol):
            tol_list = []
            for elem in elem_seq:
                if not elem in tol_list:
                    tol_list.append(elem)
                if len(tol_list) > tol + 1:
                    return False
            # if only two anchors found then we have only two tags
            # and its pretty hard to use tolerance, so we omit it.
            if len(elem_seq) < 3 and len(tol_list) > 1:
                return False
            return True

        # get the most frequenced tag in list
        def _most_frequent(seq):
            suplist = []
            suplist_freq = []
            for el in seq:
                if not el in suplist:
                    suplist.append(el)
                    suplist_freq.append(int(1))
                else:
                    suplist_freq[suplist.index(el)] += 1
            ind = suplist_freq.index(max(suplist_freq))
            return suplist[ind]

        #
        # now continue with method _get_common_parent()
        #
        vectors = []  # here will be vectors stored - list of lists
        # NOTE(review): the loop variable and the last sequence are stored on
        # self (self.elem, self.last_seq), leaking transient state onto the
        # instance; plain locals would suffice.
        for self.elem in elem_list:
            _vector = []
            while 1:
                parent = self.elem.getparent()  # exception possible here
                if parent == None:
                    break
                _vector.append(parent)
                self.elem = parent
            vectors.append(_vector)
        # We have parent vectors of all elements from elem_list stored in list
        # $vectors. Then zip the vector list and get sequences of parent tags (and the
        # other tags) sorted from the highest to the lowest parent element.
        zipped = [[row[-i] for row in vectors]
                  for i in range(1, _minlength(vectors) + 1)]
        # now check all lists in list zipped. If these are filled with the same
        # elements, its a common parent. The last list before difference contains
        # the main parent tag.
        self.last_seq = []
        for zipvect in zipped:
            if not _iscommon(zipvect, tolerance):
                # return most frequented element in last vector
                return _most_frequent(self.last_seq)
            self.last_seq = zipvect
        return _most_frequent(self.last_seq)

    def _get_element_texts(self, elem, string=True):
        """Get texts from element and his descendants. If string is True,
        returns texts as one string with spaces.
        elem: lxml element"""
        texts = []
        for child in elem.iter():
            # isinstance() skips comments/PIs whose .tag is not a string
            if child.text and isinstance(child.tag, basestring):
                if re.search("[a-z0-9]", child.text, re.I):
                    texts.append(self.formatter.format(child.text))
        if string:
            return " ".join(texts)
        return texts

    def _get_deliverable_region(self, parent_tag):
        """Get deliverable region - returns etree with region. If 0 returned
        parent_tag is region, if -1 returned some error occured searching,
        if html string returned its a region."""

        def _convert_tag_to_html(tag):
            tag_html = lxml.etree.ElementTree(tag)
            return lxml.etree.tostring(tag_html)

        # list[0] = type, list[1] = atribute, list[2] = lxml tag element
        # in case of headers list[0] = element.tag, then [2] is element
        _reg_atr = ['', None, None]
        self._result_html_region = ''
        reg_flag = False  # flag indicating that we are looping over region
        # get headers first
        headers = []
        #lxml.etree.ElementTree(parent_tag).write(sys.stdout,pretty_print=True)
        for i in range(1, 7):
            headers.extend(parent_tag.findall('.//h' + str(i)))
        children = parent_tag.getchildren()
        if len(headers) > 0:
            # find the header element whose text mentions "deliverables";
            # it marks the beginning of the region
            for head in headers:
                text = self._get_element_texts(head)
                if text:
                    if re.search("deliverables", text, re.I):
                        _reg_atr[0] = head.tag
                        _reg_atr[2] = head
                        break
            if _reg_atr[2] == None:
                return 0
            # visit all tag in parent_tag
            for tag in parent_tag.iterdescendants():
                if tag.tag == 'img':
                    continue;
                text = self._get_element_texts(tag)
                if tag.tag == 'a' and not tag.text:
                    # image links: use the image tail text as anchor text
                    if tag.find('img') is not None:
                        text = tag.find('img').tail
                    else:
                        text = ' '
                if text:
                    if re.search("deliverables", text, re.I) and \
                       tag.tag == _reg_atr[0]:
                        # "deliverable" title, BEGIN of region
                        reg_flag = True
                    elif not re.search("deliverables", text, re.I) and \
                         tag.tag == _reg_atr[0]:
                        # next similar title, END of region
                        if reg_flag:
                            break
                # region content: serialize direct children (or the direct
                # child containing this tag) while inside the region
                if tag in children and reg_flag:
                    self._result_html_region += _convert_tag_to_html(tag)
                elif tag.getparent() in children and reg_flag:
                    self._result_html_region += _convert_tag_to_html(tag.getparent())
                    children.remove(tag.getparent())
        # if we dont have headers, try to find other kind of header (title)
        # "Deliverables" and compare with other elements with the same class or id.
        else:
            for tag in parent_tag.iter():
                if tag.text:
                    if re.search("deliverables", tag.text, re.I):
                        if tag.get("class"):
                            _reg_atr[0] = 'class'
                            _reg_atr[1] = tag.get("class")
                            _reg_atr[2] = tag
                            break
                        elif tag.get("id"):
                            _reg_atr[0] = 'id'
                            _reg_atr[1] = tag.get("id")
                            _reg_atr[2] = tag
                            break
                        elif tag.get("style"):
                            _reg_atr[0] = 'style'
                            _reg_atr[1] = tag.get("style")
                            _reg_atr[2] = tag
                            break
            # test _reg_atr. If there is no deliverable region, then all
            # documents make the region
            if _reg_atr[2] == None:
                return 0
            reg_flag = False
            # visit all tag in parent_tag
            for tag in parent_tag.iterdescendants():
                if tag.tag == 'a' and not tag.text:
                    # NOTE(review): unlike the header branch above, this
                    # branch mutates tag.text in place -- confirm intended.
                    if tag.find('img') is not None:
                        tag.text = tag.find('img').tail
                    else:
                        tag.text = ' '
                if tag.text:
                    if re.search("deliverables", tag.text, re.I) and \
                       tag.get(_reg_atr[0]) == _reg_atr[1]:
                        # "deliverable" title, BEGIN of region
                        reg_flag = True
                    elif not re.search("deliverables", tag.text, re.I) and \
                         tag.get(_reg_atr[0]) == _reg_atr[1]:
                        # next similar title, END of region
                        if reg_flag:
                            break
                # region content
                if tag in children and reg_flag:
                    self._result_html_region += _convert_tag_to_html(tag)
                    children.remove(tag)
                elif tag.getparent() in children and reg_flag:
                    self._result_html_region += _convert_tag_to_html(tag.getparent())
                    children.remove(tag.getparent())
        if not self._result_html_region:
            return 0
        # create ElementTree from region; fall back to the forgiving HTML
        # parser when the assembled fragment is not well-formed XML
        try:
            return lxml.etree.fromstring(self._result_html_region)
        except:
            try:
                parser = lxml.etree.HTMLParser()
                return lxml.etree.fromstring(self._result_html_region, parser)
            except lxml.etree.XMLSyntaxError:
                return 0
class GetDeliverableRecords:
    """Extract deliverable records (document link plus describing texts)
    from a page region found by GetDeliverableRegion.

    NOTE(review): this class is redefined by a near-identical copy later in
    this module; that later definition shadows this one, so this first copy
    is dead code. Consider deleting one of the two.
    """

    def __init__(self, verbose=False, debug=False):
        # init agent for parsing html
        self.htmlHandler = GetHTMLAndParse()
        # to get region where to search for records
        self.regionHandler = GetDeliverableRegion()
        # init text formatter (encoding, erasing white chars etc.)
        self.formatter = TextFormatUtils()
        # list of acceptable words in title (header) of table
        self.table_sem_words = [
            'deliverable', 'description', 'name', 'date', 'dissemination',
            'no.', 'wp', 'delivery', 'particip', 'title', 'nature'
        ]
        # tags ignored by _tagfilter()
        self._omitted_tags = ('br', 'img', 'html', 'body')
        # tag tolerance
        self.tagtol = 1
        # verbose and debug flags
        self.debugger = DeliverableDebugger(verbose=verbose, debug=debug)
        self.__verbose = self.debugger.verbose
        self.__debug = self.debugger.debug

    ########################### TABLE HANDLING METHODS ############################

    def _get_descendats_texts(self, elem, string=True):
        """Get texts from element and his descendants. If string isset,
        returns texts as one string with spaces.
        elem: lxml element
        """
        texts = []
        for child in elem.iter():
            # isinstance() skips comments/PIs whose .tag is not a string
            if child.text and isinstance(child.tag, basestring):
                if re.search("[a-z0-9]", child.text, re.I):
                    texts.append(self.formatter.format(child.text))
        if string:
            return " ".join(texts)
        return texts

    def _get_table_order(self):
        """Get table order (table semantic).

        Reads the first <tr> of self.parentetree and returns its column
        titles when they look like a header (short texts matching
        table_sem_words); returns None when the first row already contains
        a deliverable link or does not look like a header.
        """
        sem_list = []
        for desc in self.parentetree.iterdescendants():
            if desc.tag == 'tr':
                # first <tr> match
                for col in desc:
                    # its <th> or <td>
                    for child in col.iterdescendants():
                        if child.tag == 'a':
                            if self.htmlHandler.check_file(child.get('href')):
                                return None
                    value = self._get_descendats_texts(col)
                    if value != None:
                        # if it is not title, but some text.
                        if len(value) > 30:
                            return None
                        sem_list.append(value)
                break
        str_sem_list = " ".join(sem_list)
        for expr in self.table_sem_words:
            # two matches ???
            if re.search(expr, str_sem_list, re.I):
                return sem_list
        return None

    def _get_row_link(self, row):
        """Get link from row of table - go through columns and the only
        href leading to deliverable is returned."""
        # find all anchors where parent is row
        linkanch = row.findall('.//a[@href]')
        if len(linkanch) == 0:
            return None
        for link in linkanch:
            anchor_link = link.get('href')
            if self.htmlHandler.check_file(anchor_link):
                # check if it is file we want
                return anchor_link
        return None

    def _handle_table(self):
        """Handle region as a table.

        Work with region as it's a table. Try to get table semantic (table
        order) and get all records out of it. Returns a list of row lists
        when the header was not recognized, otherwise a dict keyed by the
        row's deliverable link.
        """
        # get table semantic
        tbl_order = self._get_table_order()
        # if we didnt recognize table order, get records and return list
        if not tbl_order:
            self.__verbose("Table order not recognized, getting data...")
            records = []
            # tr tag is a record
            for row in self.parentetree:
                if not row.tag == 'tr':
                    continue
                row_list = []
                _thislink = self._get_row_link(row)
                if _thislink == None:
                    continue
                row_list.append(_thislink)
                for column in row:
                    text = self._get_descendats_texts(column)
                    if not text:
                        continue
                    row_list.append(text)
                records.append(row_list)
                del (row_list)
            return records
        # else we have recognized table order, make dict of dicts out of it
        else:
            self.__verbose(
                "Table order recognized, filling dictionary in this order.")
            # every column of the row (every atribute of the record) has it's own
            # semantic in order of table semantic
            semantic_data = dict()
            for row in self.parentetree:
                self._thislink = self._get_row_link(row)
                # if its header or non-deliverable row, omit it.
                if self._thislink == None:
                    continue
                semantic_data[self._thislink] = {}
                for index, column in enumerate(row):
                    # get column text
                    text = self._get_descendats_texts(column)
                    if not text:
                        continue
                    try:
                        # store it (IndexError swallowed when the row has
                        # more columns than the recognized header)
                        semantic_data[self._thislink][tbl_order[index]] = text
                    except:
                        continue
            return semantic_data

    ######################## TAG SEQUENCE RECOGNIZING METHODS ####################

    def _is_deliv_anch(self, tag):
        """Tag check. If it is anchor with href leading to deliverable,
        returns True"""
        if tag.tag == 'a':
            href = tag.get('href')
            if self.htmlHandler.check_file(href):
                return True
        return False

    def _tagfilter(self, tag):
        """Filters useless and messy tags. Return false if useless, true if
        normal tag"""
        if tag.tag in self._omitted_tags:
            return False
        #if tag.text:
        #    if not re.search("[a-z0-9\[\]]", tag.text, re.I):
        #        return False
        return True

    def _getdiff(self, reg, tol):
        """Gets difference between first two anchors.

        Counts, for each deliverable anchor, how many (filtered) tags lie
        between it and the previous anchor; returns the dominant interval,
        or -1 when the intervals vary more than `tol` allows.
        """
        # etree reg = element tree region
        # int tol: accepted tolerance of tags
        d = {}
        index = 0
        # fill the dictionary with differences and their occurences
        for tag in reg.iter():
            if not self._tagfilter(tag):
                continue
            if self._is_deliv_anch(tag) and not index == 0:
                try:
                    d[index] += 1
                except:
                    d[index] = 1
                index = 0
            index += 1
        # check differencies if the variety isn't higher then $tol tolerance
        difflist = d.keys()
        self.__debug("difflist: " + str(difflist))
        if len(difflist) == 0:
            return -1
        _max = max(difflist)
        _min = min(difflist)
        dlen = len(d.keys())
        if dlen == 1:
            # only one distinct interval seen (Python 2: keys() is a list)
            return d.keys()[0]
        if dlen > ((2 * tol) + 1):
            # tolerance to both sides
            return -1
        if (_max - _min) > 2 * tol:
            # some acceptable tolerance
            return -1
        # get the most frequent difference
        most_freq = max(d.values())
        for key in d:
            if d[key] == most_freq:
                return key
        return -1

    def _get_anch_only(self):
        """Only anchors found. No optional information."""
        anchlist = self.htmlHandler.find_anchor_elem(self.baseUrl,
                                                     self.parentetree)
        # We have to make list of list because XMLOutput
        return [[anch] for anch in anchlist]

    def _get_tag_sequences(self, tag_tol=1):
        """Main method handling tag sequences and recognizing records.
        Returns list of records."""
        records = []
        self._rec = []
        if len(self.parentetree) == 0:
            return [[self.parentetree]]
        # get interval between anchors, use tolerance tag_tol
        # NOTE(review): the tag_tol parameter is unused; the instance-wide
        # self.tagtol is used (and mutated) instead -- confirm intended.
        self.difference = self._getdiff(self.parentetree, self.tagtol)
        while self.difference == -1:
            if self.tagtol > 5:
                self.__verbose("Variety of intervals between anchors is too huge. "+\
                    "Getting data out of anchors only")
                return self._get_anch_only()
            self.tagtol += 1
            self.difference = self._getdiff(self.parentetree, self.tagtol)
        # get sequence of first n tags, where n is average interval between anchors
        # this could be tag-sequence describing all records in region.
        self.record_seq = []
        i = 0
        for tag in self.parentetree.iter():
            if not self._tagfilter(tag):
                continue
            if i >= self.difference:
                if not 'a' in self.record_seq:
                    # slide the window until it contains an anchor
                    del self.record_seq[0]
                else:
                    break
            self.record_seq.append(tag.tag)
            i += 1
        # counter indicates on which position in tag sequence we actually are
        counter = 0
        # make sequence of tags as they go
        regionlist = filter(self._tagfilter,
                            [tag for tag in self.parentetree.iter()])
        recseqlen = len(self.record_seq)
        reglistlen = len(regionlist)
        # flag indicating begin of records - in region on the beginning can be some garbage
        self.begin = False
        # indicating unpredictable separator between deliverable records
        self.separator = 0
        for i, tag in enumerate(regionlist):
            # skip and save the sequence at the end
            if counter > self.difference - 1:
                records.append(self._rec)  # save
                self._rec = []  # erase the list
                counter = 0  # reset counter
            if not self.begin:
                if tag.tag != self.record_seq[0]:
                    continue
                else:
                    try:
                        if regionlist[i + 1].tag != self.record_seq[1]:
                            continue
                    except:
                        pass
                    self.begin = True
            # handle tolerances, try to compare sibling tags
            self.match = False  # match flag
            # tolerance algorithm. Goes through html and tries to pass
            # irregular tags in sequence.
            for tol in range(self.tagtol + 1):
                if tag.tag == self.record_seq[(counter + tol) % recseqlen] or \
                regionlist[(i + tol) % reglistlen].tag == self.record_seq[counter % recseqlen]:
                    self.match = True
                    self._rec.append(tag)
                    counter += tol + 1
                    break
                elif tag.tag == self.record_seq[(counter - tol) % recseqlen] or \
                regionlist[(i - tol) % reglistlen].tag == self.record_seq[counter % recseqlen]:
                    self.match = True
                    self._rec.append(tag)
                    counter -= tol
                    counter += 1
                    break
            # if nothing matched, its probably out of tolerance
            if not self.match:
                self.separator += 1
                # tolerance 10 separators (tags between boxes or tables of deliverables)
                if self.separator > 10:
                    self.__verbose("Tag sequence doesnt match, probably out of "+\
                        "tolerance, getting data out of anchors only")
                    # maybe here could be tolerance++
                    # we didnt catch the sequence with tolerance...
                    return self._get_anch_only()
        records.append(self._rec)
        return filter(self._validseq, records)

    def _validseq(self, rec):
        """Helper method - check if sequence of tags rec contains deliv
        anchor"""
        for _atr in rec:
            # if we have anchor containing link to document, return true
            if self._is_deliv_anch(_atr):
                return True
        return False

    def _get_tag_content(self, tag):
        """Get element texts only, dont look for descendants texts"""
        l = []
        if tag.tag == 'a':
            href = tag.get('href')
            # if link leading to document found, add string to list
            if href is not None and self.htmlHandler.check_file(href):
                l.append(self.formatter.format(href))
            title = tag.get('title')
            # if title found in tag, add string to list
            if title:
                l.append(self.formatter.format(title))
        # if not anchor, search text in tag.text
        if tag.text:
            if re.search("[a-z0-9]", tag.text, re.I):
                l.append(self.formatter.format(tag.text))
        return l

    def _harvest_text(self, record_tag_list):
        """Harvest texts out of tags and return list of lists (record)"""
        self._records = []
        self._rec = []
        # loop over records and search all possible useful texts
        for rec_list in record_tag_list:
            for tag in rec_list:
                self._rec.extend(self._get_tag_content(tag))
            self._records.append(self._rec)
            self._rec = []
        return self._records

    def _handle_sequence(self):
        """Text harvesting for sequences."""
        seq = self._get_tag_sequences()
        return self._harvest_text(seq)

    ############################ OVERALL METHODS ################################

    def _manual_process_page(self, links, baseurl):
        """Get records from region according document links"""
        _err = None
        recordlist = []
        self.baseUrl = baseurl
        for link in links:
            # find region with tolerance
            self.parentetree = self.regionHandler.get_region(link, baseurl, 1)
            if type(self.parentetree) == tuple:
                # error
                _err = self.parentetree
                continue
            # get the charset. We dont have etree in htmlHandler,
            # so we have to use the one from regionHandler
            self.formatter.set_charset(
                self.regionHandler.formatter.get_charset())
            self.__debug("*" * 100 + '\n' + "*" * 40 + " DATA REGION " + "*" * 40)
            self.__debug(
                lxml.etree.tostring(self.parentetree, pretty_print=True))
            # get root tag
            try:
                self.parentetree = self.parentetree.getroot()
            except:
                pass
            # Parent tag is table
            # call _handle_table
            if self.parentetree.tag in ('table', 'tbody'):
                self.__verbose("Handling table")
                _result = self._handle_table()
                # if we had a dictionary, continue filling it
                # NOTE(review): when an earlier page produced a *list* here,
                # `recordlist[key] = ...` misbehaves on a list -- confirm
                # table and sequence pages never mix.
                if len(recordlist) > 0:
                    for key in _result:
                        recordlist[key] = _result[key]
                else:
                    recordlist = _result
            # Parent tag is not table
            # call _handle_sequence
            else:
                self.__verbose("Handling sequences")
                _result = self._handle_sequence()
                recordlist.extend(_result)
        # no records found
        if len(recordlist) == 0:
            if not _err == None:
                return _err
            return derrno.__err__(derrno.ENOREC)
        self.__debug("DATA RECORDS: ")
        self.__debug(recordlist)
        return recordlist  # returns list of records
class GetDeliverableRecords:
    """Get records and return dict or list of records with atributes.

    NOTE(review): this is a duplicate definition -- a near-identical class
    of the same name appears earlier in this module; this later definition
    is the one that takes effect. Consider deleting one of the two.
    """

    def __init__(self, verbose=False, debug=False):
        # init agent for parsing html
        self.htmlHandler = GetHTMLAndParse()
        # to get region where to search for records
        self.regionHandler = GetDeliverableRegion()
        # init text formatter (encoding, erasing white chars etc.)
        self.formatter = TextFormatUtils()
        # list of acceptable words in title (header) of table
        self.table_sem_words = ['deliverable', 'description', 'name', 'date',
                                'dissemination', 'no.', 'wp', 'delivery',
                                'particip', 'title', 'nature']
        # tags ignored by _tagfilter()
        self._omitted_tags = ('br', 'img', 'html', 'body')
        # tag tolerance
        self.tagtol = 1
        # verbose and debug flags
        self.debugger = DeliverableDebugger(verbose=verbose, debug=debug)
        self.__verbose = self.debugger.verbose
        self.__debug = self.debugger.debug

    ########################### TABLE HANDLING METHODS ############################

    def _get_descendats_texts(self, elem, string=True):
        """Get texts from element and his descendants. If string isset,
        returns texts as one string with spaces.
        # elem: lxml element
        """
        texts = []
        for child in elem.iter():
            # isinstance() skips comments/PIs whose .tag is not a string
            if child.text and isinstance(child.tag, basestring):
                if re.search("[a-z0-9]", child.text, re.I):
                    texts.append(self.formatter.format(child.text))
        if string:
            return " ".join(texts)
        return texts

    def _get_table_order(self):
        """Get table order (table semantic).

        Reads the first <tr> of self.parentetree and returns its column
        titles when they look like a header (short texts matching
        table_sem_words); returns None when the first row already contains
        a deliverable link or does not look like a header.
        """
        sem_list = []
        for desc in self.parentetree.iterdescendants():
            if desc.tag == 'tr':
                # first <tr> match
                for col in desc:
                    # its <th> or <td>
                    for child in col.iterdescendants():
                        if child.tag == 'a':
                            if self.htmlHandler.check_file(child.get('href')):
                                return None
                    value = self._get_descendats_texts(col)
                    if value != None:
                        # if it is not title, but some text.
                        if len(value) > 30:
                            return None
                        sem_list.append(value)
                break
        str_sem_list = " ".join(sem_list)
        for expr in self.table_sem_words:
            # two matches ???
            if re.search(expr, str_sem_list, re.I):
                return sem_list
        return None

    def _get_row_link(self, row):
        """Get link from row of table - go through columns and the only
        href leading to deliverable is returned."""
        # find all anchors where parent is row
        linkanch = row.findall('.//a[@href]')
        if len(linkanch) == 0:
            return None
        for link in linkanch:
            anchor_link = link.get('href')
            if self.htmlHandler.check_file(anchor_link):
                # check if it is file we want
                return anchor_link
        return None

    def _handle_table(self):
        """Handle region as a table.

        Work with region as it's a table. Try to get table semantic (table
        order) and get all records out of it. Returns a list of row lists
        when the header was not recognized, otherwise a dict keyed by the
        row's deliverable link.
        """
        # get table semantic
        tbl_order = self._get_table_order()
        # if we didnt recognize table order, get records and return list
        if not tbl_order:
            self.__verbose("Table order not recognized, getting data...")
            records = []
            # tr tag is a record
            for row in self.parentetree:
                if not row.tag == 'tr':
                    continue
                row_list = []
                _thislink = self._get_row_link(row)
                if _thislink == None:
                    continue
                row_list.append(_thislink)
                for column in row:
                    text = self._get_descendats_texts(column)
                    if not text:
                        continue
                    row_list.append(text)
                records.append(row_list)
                del(row_list)
            return records
        # else we have recognized table order, make dict of dicts out of it
        else:
            self.__verbose("Table order recognized, filling dictionary in this order.")
            # every column of the row (every atribute of the record) has it's own
            # semantic in order of table semantic
            semantic_data = dict()
            for row in self.parentetree:
                self._thislink = self._get_row_link(row)
                # if its header or non-deliverable row, omit it.
                if self._thislink == None:
                    continue
                semantic_data[self._thislink] = {}
                for index, column in enumerate(row):
                    # get column text
                    text = self._get_descendats_texts(column)
                    if not text:
                        continue
                    try:
                        # store it (IndexError swallowed when the row has
                        # more columns than the recognized header)
                        semantic_data[self._thislink][tbl_order[index]] = text
                    except:
                        continue
            return semantic_data

    ######################## TAG SEQUENCE RECOGNIZING METHODS ####################

    def _is_deliv_anch(self, tag):
        """Tag check. If it is anchor with href leading to deliverable,
        returns True"""
        if tag.tag == 'a':
            href = tag.get('href')
            if self.htmlHandler.check_file(href):
                return True
        return False

    def _tagfilter(self, tag):
        """Filters useless and messy tags. Return false if useless, true if
        normal tag"""
        if tag.tag in self._omitted_tags:
            return False
        #if tag.text:
        #    if not re.search("[a-z0-9\[\]]", tag.text, re.I):
        #        return False
        return True

    def _getdiff(self, reg, tol):
        """Gets difference between first two anchors.

        Counts, for each deliverable anchor, how many (filtered) tags lie
        between it and the previous anchor; returns the dominant interval,
        or -1 when the intervals vary more than `tol` allows.
        """
        # etree reg = element tree region
        # int tol: accepted tolerance of tags
        d = {}
        index = 0
        # fill the dictionary with differences and their occurences
        for tag in reg.iter():
            if not self._tagfilter(tag):
                continue
            if self._is_deliv_anch(tag) and not index == 0:
                try:
                    d[index] += 1
                except:
                    d[index] = 1
                index = 0
            index += 1
        # check differencies if the variety isn't higher then $tol tolerance
        difflist = d.keys()
        self.__debug("difflist: "+str(difflist))
        if len(difflist) == 0:
            return -1
        _max = max(difflist)
        _min = min(difflist)
        dlen = len(d.keys())
        if dlen == 1:
            # only one distinct interval seen (Python 2: keys() is a list)
            return d.keys()[0]
        if dlen > ((2*tol)+1):
            # tolerance to both sides
            return -1
        if (_max - _min) > 2*tol:
            # some acceptable tolerance
            return -1
        # get the most frequent difference
        most_freq = max(d.values())
        for key in d:
            if d[key] == most_freq:
                return key
        return -1

    def _get_anch_only(self):
        """Only anchors found. No optional information."""
        anchlist = self.htmlHandler.find_anchor_elem(self.baseUrl,
                                                     self.parentetree)
        # We have to make list of list because XMLOutput
        return [[anch] for anch in anchlist]

    def _get_tag_sequences(self, tag_tol=1):
        """Main method handling tag sequences and recognizing records.
        returns list of records."""
        records = []
        self._rec = []
        if len(self.parentetree) == 0:
            return [[self.parentetree]]
        # get interval between anchors, use tolerance tag_tol
        # NOTE(review): the tag_tol parameter is unused; the instance-wide
        # self.tagtol is used (and mutated) instead -- confirm intended.
        self.difference = self._getdiff(self.parentetree, self.tagtol)
        while self.difference == -1:
            if self.tagtol > 5:
                self.__verbose("Variety of intervals between anchors is too huge. "+\
                    "Getting data out of anchors only")
                return self._get_anch_only()
            self.tagtol += 1
            self.difference = self._getdiff(self.parentetree, self.tagtol)
        # get sequence of first n tags, where n is average interval between anchors
        # this could be tag-sequence describing all records in region.
        self.record_seq = []
        i = 0
        for tag in self.parentetree.iter():
            if not self._tagfilter(tag):
                continue
            if i >= self.difference:
                if not 'a' in self.record_seq:
                    # slide the window until it contains an anchor
                    del self.record_seq[0]
                else:
                    break
            self.record_seq.append(tag.tag)
            i += 1
        # counter indicates on which position in tag sequence we actually are
        counter = 0
        # make sequence of tags as they go
        regionlist = filter(self._tagfilter,
                            [tag for tag in self.parentetree.iter()])
        recseqlen = len(self.record_seq)
        reglistlen = len(regionlist)
        # flag indicating begin of records - in region on the beginning can be some garbage
        self.begin = False
        # indicating unpredictable separator between deliverable records
        self.separator = 0
        for i, tag in enumerate(regionlist):
            # skip and save the sequence at the end
            if counter > self.difference-1:
                records.append(self._rec)  # save
                self._rec = []  # erase the list
                counter = 0  # reset counter
            if not self.begin:
                if tag.tag != self.record_seq[0]:
                    continue
                else:
                    try:
                        if regionlist[i+1].tag != self.record_seq[1]:
                            continue
                    except:
                        pass
                    self.begin = True
            # handle tolerances, try to compare sibling tags
            self.match = False  # match flag
            # tolerance algorithm. Goes through html and tries to pass
            # irregular tags in sequence.
            for tol in range(self.tagtol+1):
                if tag.tag == self.record_seq[(counter + tol) % recseqlen] or \
                regionlist[(i + tol) % reglistlen].tag == self.record_seq[counter % recseqlen]:
                    self.match = True
                    self._rec.append(tag)
                    counter += tol+1
                    break
                elif tag.tag == self.record_seq[(counter - tol) % recseqlen] or \
                regionlist[(i - tol) % reglistlen].tag == self.record_seq[counter % recseqlen]:
                    self.match = True
                    self._rec.append(tag)
                    counter -= tol
                    counter += 1
                    break
            # if nothing matched, its probably out of tolerance
            if not self.match:
                self.separator += 1
                # tolerance 10 separators (tags between boxes or tables of deliverables)
                if self.separator > 10:
                    self.__verbose("Tag sequence doesnt match, probably out of "+\
                        "tolerance, getting data out of anchors only")
                    # maybe here could be tolerance++
                    # we didnt catch the sequence with tolerance...
                    return self._get_anch_only()
        records.append(self._rec)
        return filter(self._validseq, records)

    def _validseq(self, rec):
        """Helper method - check if sequence of tags rec contains deliv
        anchor"""
        for _atr in rec:
            # if we have anchor containing link to document, return true
            if self._is_deliv_anch(_atr):
                return True
        return False

    def _get_tag_content(self, tag):
        """Get element texts only, dont look for descendants texts"""
        l = []
        if tag.tag == 'a':
            href = tag.get('href')
            # if link leading to document found, add string to list
            if href is not None and self.htmlHandler.check_file(href):
                l.append(self.formatter.format(href))
            title = tag.get('title')
            # if title found in tag, add string to list
            if title:
                l.append(self.formatter.format(title))
        # if not anchor, search text in tag.text
        if tag.text:
            if re.search("[a-z0-9]", tag.text, re.I):
                l.append(self.formatter.format(tag.text))
        return l

    def _harvest_text(self, record_tag_list):
        """Harvest texts out of tags and return list of lists (record)"""
        self._records = []
        self._rec = []
        # loop over records and search all possible useful texts
        for rec_list in record_tag_list:
            for tag in rec_list:
                self._rec.extend(self._get_tag_content(tag))
            self._records.append(self._rec)
            self._rec = []
        return self._records

    def _handle_sequence(self):
        """Text harvesting for sequences."""
        seq = self._get_tag_sequences()
        return self._harvest_text(seq)

    ############################ OVERALL METHODS ################################

    def _manual_process_page(self, links, baseurl):
        """Get records from region according document links"""
        _err = None
        recordlist = []
        self.baseUrl = baseurl
        for link in links:
            # find region with tolerance
            self.parentetree = self.regionHandler.get_region(link, baseurl, 1)
            if type(self.parentetree) == tuple:
                # error
                _err = self.parentetree
                continue
            # get the charset. We dont have etree in htmlHandler,
            # so we have to use the one from regionHandler
            self.formatter.set_charset(self.regionHandler.formatter.get_charset())
            self.__debug("*"*100+'\n'+"*"*40+" DATA REGION "+"*"*40)
            self.__debug(lxml.etree.tostring(self.parentetree, pretty_print=True))
            # get root tag
            try:
                self.parentetree = self.parentetree.getroot()
            except:
                pass
            # Parent tag is table
            # call _handle_table
            if self.parentetree.tag in ('table','tbody'):
                self.__verbose("Handling table")
                _result = self._handle_table()
                # if we had a dictionary, continue filling it
                # NOTE(review): when an earlier page produced a *list* here,
                # `recordlist[key] = ...` misbehaves on a list -- confirm
                # table and sequence pages never mix.
                if len(recordlist) > 0:
                    for key in _result:
                        recordlist[key] = _result[key]
                else:
                    recordlist = _result
            # Parent tag is not table
            # call _handle_sequence
            else:
                self.__verbose("Handling sequences")
                _result = self._handle_sequence()
                recordlist.extend(_result)
        # no records found
        if len(recordlist) == 0:
            if not _err == None:
                return _err
            return derrno.__err__(derrno.ENOREC)
        self.__debug("DATA RECORDS: ")
        self.__debug(recordlist)
        return recordlist  # returns list of records