import re
from io import StringIO
from urllib.parse import urlsplit

import lxml.etree

# Project-internal dependencies. Their module paths are not shown in this
# file, so the import below is only a placeholder -- adjust it to the actual
# project layout:
# from rrslib import (Crawler, HTMLSequenceWrapper, GetHTMLAndParse,
#                     GetDeliverableRegion, TextFormatUtils, RRSPublication,
#                     RRSPublication_type, RRSUrl,
#                     RRSRelationshipPublicationUrl, Model2XMLConverter,
#                     derrno)


class GetDelivRecords:

    def __init__(self, verbose=False, debug=False):
        self.__dbg__ = debug
        self.__verb = verbose
        self._crawler = Crawler()
        self._crawler.set_headers((
            ('User-Agent',
             'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.19) '
             'Gecko/2010040116 Ubuntu/9.04 (jaunty) Firefox/3.0.19'),
            ('Accept',
             'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
        ))
        self._wrapper = HTMLSequenceWrapper(childcoef=5.0, headercoef=3.0,
                                            mintextlen=20)
        self._unwanted_titles = ['Download here', 'PDF format']
        self._records = []

        ################################
        # Manual processing.
        self.agent = GetHTMLAndParse()
        # To get the region where to search for records.
        self.regionHandler = GetDeliverableRegion()
        # Init text formatter (encoding, erasing white chars etc.).
        self.formatter = TextFormatUtils()
        self._omitted_tags = ('br', 'img', 'html', 'body')
        # Tag tolerance.
        self.tagtol = 1

    def __debug(self, msg):
        _err = "cannot decode debug info."
        if self.__dbg__:
            try:
                print("Debug message: " + str(msg))
            except UnicodeError:
                print(_err)

    def __verbose(self, msg):
        _err = "cannot decode verbose message."
        if self.__verb:
            try:
                print("Verbose: " + str(msg))
            except UnicodeError:
                print(_err)

    ################ Processing sequencewrapper output ################

    def _make_deliv_record(self, entry):
        """Take one entry from the sequence wrapper output, try to create a
        deliverable record from it and return True on success."""
        text = []
        links = []
        # Harvest links and text from the entry.
        for e in entry.iter():
            if e.text is not None:
                text.append(e.text)
            link = e.attrib.get("link")
            if link is not None:
                if self.agent.is_wanted_mime(link) and link not in links:
                    links.append(link)

        res = self._deliv_in_text(text, links)
        if isinstance(res, RRSPublication):
            self._entriesFoundInText.append(res)
            self.__debug("Record found because of text")
            return True
        elif isinstance(res, list):
            # Probably more than one record in this entry.
            if self._more_entry_in_record(entry):
                self.__debug("More records found in one entry")
                return True
            return False

        res = self._deliv_in_link(text, links, entry)
        if isinstance(res, RRSPublication):
            self._entriesFoundInLinks.append(res)
            self.__debug("Record found because of link")
            return True
        return False

    def _deliv_in_text(self, text, links):
        """Look for a deliverable keyword in the entry text."""
        _title = False
        _description = ""
        pattern = re.compile(r"(DELIVERABLES?)|(D[0-9]+(\.[0-9]+)?)", re.I)
        # Loop through the text in the entry looking for a title and a
        # description.
        for t in text:
            if _title is False:
                if pattern.search(t):
                    _title = t
            # Set the longest string as the description of the deliverable.
            if len(_description) < len(t):
                _description = t
        if _title == _description:
            _description = ""

        _link = False
        if isinstance(links, str):
            if self.agent.is_wanted_mime(links):
                _link = links
        elif isinstance(links, list):
            for l in links:
                if self.agent.is_wanted_mime(l):
                    if _link is False:
                        _link = l
                    else:
                        # A link was already found; if both differ only in
                        # the file extension they point to the same document.
                        if _link[:_link.rfind('.')] == l[:l.rfind('.')]:
                            break
                        else:
                            return ['-3',
                                    'Probably more records in one entry']
        # Create the object.
        if _title:
            pub = RRSPublication(title=_title, abstract=_description)
            _typ = RRSPublication_type(type='techreport')
            pub['type'] = _typ
            self.__debug("*" * 40)
            self.__debug("Title: " + _title)
            self.__debug("Description: " + _description)
            if _link:
                self.__debug("Link: " + _link)
                l = RRSUrl(link=_link)
                pl_rel = RRSRelationshipPublicationUrl()
                pl_rel.set_entity(l)
                pub['url'] = pl_rel
            return pub
        else:
            # This entry is probably not a deliverable.
            return False

    def _deliv_in_link(self, text, links, entry=False):
        """Look for a deliverable keyword in a link."""
        _title = False
        _description = ""
        pattern = re.compile(r"(DELIVERABLES?)|(D[0-9]+(\.[0-9]+)?)", re.I)
        _link = False
        for l in links:
            if pattern.search(l):
                if _link is False:
                    _link = l
                else:
                    return ['-3', 'Probably more records in one entry']
        # Loop through the text in the entry looking for a title and a
        # description.
        for t in text:
            if _title is False and len(t) > 10:
                _title = t
            # Set the longest string as the description of the deliverable.
            if len(_description) < len(t):
                _description = t
        if _title == _description:
            _description = ""
        # If the chosen title is not valid, try to find a better one in the
        # parent entry.
        if _title and not self._check_title(_title) and entry is not False:
            _title = self._repair_title(entry)
        # Create the object.
        if _link:
            pub = RRSPublication(title=_title, abstract=_description)
            typ = RRSPublication_type(type='techreport')
            pub['type'] = typ
            self.__debug("*" * 40)
            self.__debug("Title: " + str(_title))
            self.__debug("Description: " + _description)
            self.__debug("Link: " + _link)
            l = RRSUrl(link=_link)
            pl_rel = RRSRelationshipPublicationUrl()
            pl_rel.set_entity(l)
            pub['url'] = pl_rel
            return pub
        else:
            # This entry is probably not a deliverable.
            return False

    def _check_title(self, title, tolerance=10):
        """Check whether the title consists only of an unwanted string
        (with some tolerance). Return True if the title is OK."""
        for t in self._unwanted_titles:
            if title.lower().find(t.lower()) != -1:
                if (len(t) + tolerance) > len(title):
                    return False
        return True

    def _repair_title(self, entry):
        """Look for the element with the highest visibility rank in the
        parent element."""
        parent = entry.getparent()
        visibility = 0
        title = ""
        for i in parent.iter():
            try:
                vis = int(i.attrib.get('visibility', 0))
                if vis > visibility:
                    visibility = vis
                    title = i.text
            except (AttributeError, TypeError, ValueError):
                pass
        if title != "":
            return title
        return False

    def _more_entry_in_record(self, entry):
        """Try to create an array of deliverables from one entry in the XML
        tree. Return True if at least one record was created."""
        found = False
        for ch in entry.iter('chunk'):
            if ch.text is not None and ch.attrib.get("link") is not None:
                if self.agent.is_wanted_mime(ch.attrib.get("link")):
                    _pub = RRSPublication(title=ch.text)
                    typ = RRSPublication_type(type='techreport')
                    _pub['type'] = typ
                    _l = RRSUrl(link=ch.attrib.get("link"))
                    _rel = RRSRelationshipPublicationUrl()
                    _rel.set_entity(_l)
                    _pub['url'] = _rel
                    self._entriesFoundInLinks.append(_pub)
                    found = True
        return found

    def process_pages(self, pages):
        """Process the pages defined by the given URLs."""
        self._entriesFoundInText = []
        self._entriesFoundInLinks = []
        self._urls = pages
        self._pages = self._crawler.start(pages)
        # Create RRSPublication objects with information about deliverables.
        for u in self._urls:
            self._wrapper.wrap(self._pages[u], u)
            self._tree = self._wrapper.get_etree()
            for entry in self._tree.iter("entry"):
                self._make_deliv_record(entry)
        if len(self._entriesFoundInText) > 3:
            self.__debug("Deliverable descriptions contain keywords")
            self.__debug("Found {0} deliv records".format(
                len(self._entriesFoundInText)))
            self._records = self._entriesFoundInText
        elif len(self._entriesFoundInLinks) > 3:
            self.__debug("Deliverable links contain keywords")
            self.__debug("Found {0} deliv records".format(
                len(self._entriesFoundInLinks)))
            self._records = self._entriesFoundInLinks
        else:
            self._manual_processing()

    def _manual_processing(self):
        """Called when no records were found in the sequence wrapper
        output."""
        self._entriesFoundInLinks = []
        self._entriesFoundInText = []
        self._manual_process_page(self._urls, urlsplit(self._urls[0])[1])
        if len(self._entriesFoundInText) > 0:
            self.__debug("Deliverable descriptions contain keywords")
            self.__debug("Found {0} deliv records".format(
                len(self._entriesFoundInText)))
            self._records = self._entriesFoundInText
        elif len(self._entriesFoundInLinks) > 0:
            self.__debug("Deliverable links contain keywords")
            self.__debug("Found {0} deliv records".format(
                len(self._entriesFoundInLinks)))
            self._records = self._entriesFoundInLinks

    ########################### TABLE HANDLING METHODS ############################

    def _get_descendants_texts(self, elem, string=True):
        """Get texts from an element and its descendants. If string is set,
        return the texts as one space-separated string.

        elem: lxml element
        """
        texts = []
        for child in elem.iter():
            if child.text and isinstance(child.tag, str):
                if re.search("[a-z0-9]", child.text, re.I):
                    texts.append(self.formatter.format(child.text))
        if string:
            return " ".join(texts)
        return texts

    def _get_row_link(self, row):
        """Get a link from a table row - go through the columns and return
        the only href leading to a deliverable."""
        # Find all anchors with an href inside the row.
        linkanch = row.findall('.//a[@href]')
        if len(linkanch) == 0:
            return None
        for link in linkanch:
            anchor_link = link.get('href')
            if self.agent.is_wanted_mime(anchor_link):
                # It is a file we want.
                return anchor_link
        return None

    def _handle_table(self):
        """Handle the region as a table: try to get the table semantics
        (column order) and extract all records out of it."""
        for row in self.parentetree:
            if not row.tag == 'tr':
                continue
            row_list = []
            _thislink = self._get_row_link(row)
            if _thislink is None:
                continue
            for column in row:
                text = self._get_descendants_texts(column)
                if not text:
                    continue
                row_list.append(text)
            res = self._deliv_in_text(row_list, [_thislink])
            if isinstance(res, RRSPublication):
                self._entriesFoundInText.append(res)
                self.__debug("Record found because of text")
            else:
                res = self._deliv_in_link(row_list, [_thislink])
                if isinstance(res, RRSPublication):
                    self._entriesFoundInLinks.append(res)
                    self.__debug("Record found because of link")
        return

    ######################## TAG SEQUENCE RECOGNIZING METHODS ####################

    def _is_deliv_anch(self, tag):
        """Tag check: return True if it is an anchor whose href leads to a
        deliverable."""
        if tag.tag == 'a':
            href = tag.get('href')
            if self.agent.is_wanted_mime(href):
                return True
        return False

    def _tagfilter(self, tag):
        """Filter useless and messy tags. Return False if useless, True if
        it is a normal tag."""
        if tag.tag in self._omitted_tags:
            return False
        #if tag.text:
        #    if not re.search("[a-z0-9\[\]]", tag.text, re.I):
        #        return False
        return True

    def _getdiff(self, reg, tol):
        """Get the typical distance (in tags) between deliverable anchors
        in the region; return -1 if the intervals vary too much.

        reg: element tree region
        tol: accepted tolerance of tags
        """
        d = {}
        index = 0
        # Fill the dictionary with differences and their occurrences.
        for tag in reg.iter():
            if not self._tagfilter(tag):
                continue
            if self._is_deliv_anch(tag) and not index == 0:
                try:
                    d[index] += 1
                except KeyError:
                    d[index] = 1
                index = 0
            index += 1
        # Check the differences; the variety must not be higher than the
        # tolerance.
        difflist = list(d.keys())
        self.__debug("difflist: " + str(difflist))
        if len(difflist) == 0:
            return -1
        _max = max(difflist)
        _min = min(difflist)
        dlen = len(difflist)
        if dlen == 1:
            return difflist[0]
        if dlen > ((2 * tol) + 1):  # tolerance to both sides
            return -1
        if (_max - _min) > 2 * tol:  # some acceptable tolerance
            return -1
        # Get the most frequent difference.
        most_freq = max(d.values())
        for key in d:
            if d[key] == most_freq:
                return key
        return -1

    def _get_anch_only(self):
        """Only anchors found, no optional information."""
        anchlist = self.agent.find_anchor_elem(self.baseUrl, self.parentetree)
        # We have to make a list of lists because of XMLOutput.
        return [[anch] for anch in anchlist]

    def _get_tag_sequences(self, tag_tol=1):
        """Main method handling tag sequences and recognizing records.
        Return a list of records."""
        records = []
        self._rec = []
        if len(self.parentetree) == 0:
            return [[self.parentetree]]
        # Get the interval between anchors, using tolerance self.tagtol.
        self.difference = self._getdiff(self.parentetree, self.tagtol)
        while self.difference == -1:
            if self.tagtol > 5:
                self.__verbose("Variety of intervals between anchors is too "
                               "huge. Getting data out of anchors only")
                return self._get_anch_only()
            self.tagtol += 1
            self.difference = self._getdiff(self.parentetree, self.tagtol)
        # Get the sequence of the first n tags, where n is the average
        # interval between anchors. This could be the tag sequence
        # describing all records in the region.
        self.record_seq = []
        i = 0
        for tag in self.parentetree.iter():
            if not self._tagfilter(tag):
                continue
            if i >= self.difference:
                if 'a' not in self.record_seq:
                    del self.record_seq[0]
                else:
                    break
            self.record_seq.append(tag.tag)
            i += 1
        # The counter indicates our current position in the tag sequence.
        counter = 0
        # Make the sequence of tags as they go.
        regionlist = list(filter(self._tagfilter, self.parentetree.iter()))
        recseqlen = len(self.record_seq)
        reglistlen = len(regionlist)
        # Flag indicating the beginning of records - at the beginning of the
        # region there can be some garbage.
        self.begin = False
        # Counts unpredictable separators between deliverable records.
        self.separator = 0
        for i, tag in enumerate(regionlist):
            # Save the finished sequence and start a new one.
            if counter > self.difference - 1:
                records.append(self._rec)  # save
                self._rec = []             # erase the list
                counter = 0                # reset the counter
            if not self.begin:
                if tag.tag != self.record_seq[0]:
                    continue
                else:
                    try:
                        if regionlist[i + 1].tag != self.record_seq[1]:
                            continue
                    except IndexError:
                        pass
                    self.begin = True
            # Handle tolerances, try to compare sibling tags.
            self.match = False
            # Tolerance algorithm: go through the HTML and try to pass over
            # irregular tags in the sequence.
            for tol in range(self.tagtol + 1):
                if tag.tag == self.record_seq[(counter + tol) % recseqlen] or \
                   regionlist[(i + tol) % reglistlen].tag == \
                   self.record_seq[counter % recseqlen]:
                    self.match = True
                    self._rec.append(tag)
                    counter += tol + 1
                    break
                elif tag.tag == self.record_seq[(counter - tol) % recseqlen] or \
                     regionlist[(i - tol) % reglistlen].tag == \
                     self.record_seq[counter % recseqlen]:
                    self.match = True
                    self._rec.append(tag)
                    counter -= tol
                    counter += 1
                    break
            # If nothing matched, it is probably out of tolerance.
            if not self.match:
                self.separator += 1
                # Tolerate 10 separators (tags between boxes or tables of
                # deliverables).
                if self.separator > 10:
                    self.__verbose("Tag sequence doesn't match, probably out "
                                   "of tolerance, getting data out of "
                                   "anchors only")
                    # Maybe the tolerance could be increased here; we didn't
                    # catch the sequence with this tolerance.
                    return self._get_anch_only()
        records.append(self._rec)
        return list(filter(self._validseq, records))

    def _validseq(self, rec):
        """Helper method - check whether the tag sequence rec contains a
        deliverable anchor."""
        for _atr in rec:
            # If we have an anchor containing a link to a document,
            # return True.
            if self._is_deliv_anch(_atr):
                return True
        return False

    def _get_tag_content(self, tag):
        """Get the element's own texts only, do not look for descendants'
        texts."""
        links = []
        texts = []
        if tag.tag == 'a':
            href = tag.get('href')
            # If a link leading to a document is found, add it to the list.
            if href is not None and self.agent.is_wanted_mime(href):
                links.append(self.formatter.format(href))
            title = tag.get('title')
            # If a title is found in the tag, add it to the list.
            if title:
                texts.append(self.formatter.format(title))
        # If not an anchor, search the text in tag.text.
        if tag.text:
            if re.search("[a-z0-9]", tag.text, re.I):
                texts.append(self.formatter.format(tag.text))
        return [links, texts]

    def _harvest_text(self, record_tag_list):
        """Harvest texts out of the tags and return a list of lists
        (records)."""
        self._records = []
        self._rec = []
        _links = []
        _texts = []
        # Loop over the records and search all possibly useful texts.
        for rec_list in record_tag_list:
            for tag in rec_list:
                harvested = self._get_tag_content(tag)
                _links.extend(harvested[0])
                _texts.extend(harvested[1])
            res = self._deliv_in_text(_texts, _links)
            if isinstance(res, RRSPublication):
                self._entriesFoundInText.append(res)
                self.__debug("Record found because of text")
            else:
                res = self._deliv_in_link(_texts, _links)
                if isinstance(res, RRSPublication):
                    self._entriesFoundInLinks.append(res)
                    self.__debug("Record found because of link")
            _links = []
            _texts = []
            self._rec = []
        return self._records

    def _handle_sequence(self):
        """Text harvesting for sequences."""
        seq = self._get_tag_sequences()
        return self._harvest_text(seq)

    def _manual_process_page(self, links, baseurl):
        """Get records from the region according to document links. This
        method is used when no records were found in the sequence wrapper
        output."""
        _err = None
        self.baseUrl = baseurl
        for link in links:
            # Find the region with tolerance.
            self.parentetree = self.regionHandler.get_region(link, baseurl, 1)
            if isinstance(self.parentetree, tuple):  # error
                _err = self.parentetree
                self.__debug(_err)
                continue
            # Make all links in the parent tree absolute.
            hrefs = self.parentetree.findall('.//a[@href]')
            for href in hrefs:
                href.make_links_absolute('http://' + urlsplit(link)[1] + '/')
            # Get the charset. We don't have the etree in htmlHandler, so we
            # have to use the one from regionHandler.
            self.formatter.set_charset(
                self.regionHandler.formatter.get_charset())
            self.__debug("*" * 100 + '\n' + "*" * 40 + " DATA REGION " +
                         "*" * 40)
            self.__debug(lxml.etree.tostring(self.parentetree,
                                             pretty_print=True))
            # Get the root tag.
            try:
                self.parentetree = self.parentetree.getroot()
            except AttributeError:
                pass
            # If the parent tag is a table, call _handle_table.
            if self.parentetree.tag in ('table', 'tbody'):
                self.__verbose("Handling table")
                self._handle_table()
            else:
                self.__verbose("Handling sequences")
                self._handle_sequence()

    ############# PUBLIC METHODS TO GET RESULTS #############

    def get_deliverables_XML(self):
        """Return the information about deliverables stored in the objects
        as XML."""
        if len(self._records) == 0:
            return derrno.__err__(derrno.ENOREC)
        output = StringIO()
        converter = Model2XMLConverter(stream=output)
        converter.convert(self.get_deliverables())
        result = output.getvalue()
        output.close()
        return result

    def get_deliverables(self):
        """Return the objects containing the information."""
        if len(self._records) == 0:
            return derrno.__err__(derrno.ENOREC)
        return self._records
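
# Illustrative sketch (not part of the extraction pipeline): this shows what
# the deliverable-title heuristic shared by _deliv_in_text() and
# _deliv_in_link() accepts -- the word "deliverable(s)" or a deliverable code
# such as "D2" or "D3.1". The sample strings are made up.
def _demo_deliv_pattern():
    pattern = re.compile(r"(DELIVERABLES?)|(D[0-9]+(\.[0-9]+)?)", re.I)
    assert pattern.search("D3.1 Architecture Design")
    assert pattern.search("List of public deliverables")
    assert pattern.search("Deliverable D12: Final report")
    assert not pattern.search("Partners and contacts")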
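
# Minimal usage sketch for GetDelivRecords, assuming the project-internal
# dependencies above are importable; the URL is a made-up placeholder for a
# project page listing deliverables:
#
#   records = GetDelivRecords(verbose=True)
#   records.process_pages(["http://www.example-project.eu/deliverables"])
#   pubs = records.get_deliverables()   # RRSPublication objects (or a
#                                       # derrno error value if none found)
#   xml = records.get_deliverables_XML()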
class GetDelivPage:

    def __init__(self, url, verbose=False, debug=False, addkeyw=None):
        # Keywords used for the document page search.
        self._sigwords = [r"d((eliverables?)|[0-9])",
                          "documents?",
                          "reports?",
                          "public(ation)?s?",
                          "results?",
                          "presentations?",
                          "library",
                          #"projects?",
                          "outcomes?",
                          "downloads?",
                          "outputs?"]
        if addkeyw is not None:
            self._sigwords.append(addkeyw)
        # Associative array containing links with their flags:
        #   { url : [Index/NoIndex/Frame, Visit/Visited, Rank] }
        # index = 0, noindex = 1, frame = 2; unvisited = 0, visited = 1
        self._link_stack = {url: [0, 0, 0]}
        self.base_url = url  # save the base (input) url
        # Open a parsing agent to get the needed data from the page.
        self.agent = GetHTMLAndParse()
        self._current_url = url
        # A constant used to set the rank according to the importance of the
        # expression being tested (self._sigwords).
        self.rank_const = len(self._sigwords)
        # A few constants for the dictionary - just for good-looking source
        # code.
        self.IND_FR = 0  # index/noindex/frame/special
        self.VISIT = 1   # unvisited/visited
        self.RANK = 2    # value of rank
        # Set the verbose flag.
        self.__verbose__ = verbose
        # Set the debug flag.
        self.__dbg__ = debug
        # Check data types.
        if not isinstance(self.__verbose__, bool):
            raise ValueError("Verbose flag has to be boolean.")

    def __verbose(self, msg):
        _err = "cannot decode verbose message."
        if self.__verbose__:
            try:
                print(str(msg))
            except UnicodeError:
                print(_err)

    def __debug(self, msg):
        _err = "cannot decode debug info."
        if self.__dbg__:
            try:
                print("Debug message: " + str(msg))
            except UnicodeError:
                print(_err)

    ################################################################################

    def _link_item_init__(self, link, index=1, visit=0, rank=0):
        """Initialize an item in the dictionary to
        noindex/unvisited/rank=0."""
        # Default setting: noindex, unvisited, no rank.
        if link not in self._link_stack:
            self._link_stack[link] = [index, visit, rank]
        return

    def _link_item_edit(self, link, index=None, visit=None, rank=None):
        """Edit an item in the dictionary self._link_stack."""
        if index is not None:
            self._link_stack[link][self.IND_FR] = index
        if visit is not None:
            self._link_stack[link][self.VISIT] = visit
        if rank is not None:
            if rank == 0:
                # Null the rank if zero is the argument.
                self._link_stack[link][self.RANK] = 0
            else:
                # Add the rank.
                self._link_stack[link][self.RANK] += rank
        return

    def _level_job(self, index=None):
        """Method representing one level of the cascade. Does almost all the
        work to search for one word in the dictionary."""
        # Get the list of links from anchors containing one of the
        # expressions from self._sigwords.
        result = 0
        if index is not None:
            # Searching with one expression.
            link_list = self.agent.get_all_links(
                regul=re.compile(self._sigwords[index], re.I),
                base=self._current_url)
        else:
            link_list = self.agent.get_all_links(base=self._current_url)
            index = self.rank_const
        if link_list:
            #
            # RANK giving & filter
            #
            if index is None:
                rank = 0
            elif index == 0:
                rank = self.rank_const * 2
            else:
                rank = self.rank_const - index
            for link in link_list:
                # Skip javascript and mailto links.
                if not link or "javascript:" in link or "mailto:" in link:
                    continue
                if "#" in link:
                    # Delete the fragment pointer.
                    link = re.sub('#.*$', '', link)
                if len(link) > 200:
                    continue
                if self._link_stack.get(link):
                    # RANK if we see those links for the first time.
                    if self._link_stack[link][self.VISIT] == 0:
                        self._link_item_edit(self._current_url, rank=rank)
                    continue
                if not self.agent.compare_domains(self.base_url, link):
                    continue
                split_link = re.sub("https?://.+?/", "", link)
                # Check whether it is a file or not.
                if self.agent.is_wanted_mime(link):
                    #
                    # Some PDF or DOC found.
                    #
                    # RANK
                    self._link_item_edit(self._current_url, rank=10)
                    self.__debug("Added rank 10 to " + self._current_url)
                    if re.search(r"de?l?(iverable)?[0-9]+([\._-][0-9])?",
                                 split_link, re.I):
                        self.__debug("Type D on " + self._current_url)
                        # RANK
                        self._link_item_edit(self._current_url, rank=100)
                    continue
                elif not self.agent.is_page(link):
                    self.__debug("UNWANTED " + link)
                    continue
                #
                # Add the link.
                #
                # RANK
                # Initialization of the link item in the dict.
                self._link_item_init__(link)
                self._link_item_edit(self._current_url, rank=rank)
                result += 1
                self.__debug("ADD " + link[7:60])
                self.__debug("Rank " + str(rank) + " " + self._current_url)
        return result

    def _cascade_search(self):
        """Cascade search. May improve the speed of the script."""
        result = 0
        # First cascade - look for links containing "deliverables".
        result += self._level_job(0)
        if not result == 0:
            return
        # Second cascade - look for links containing "documents" and
        # "publications".
        result += self._level_job(1)
        result += self._level_job(2)
        if not result == 0:
            return
        # Last cascade - all the rest.
        for i in range(3, self.rank_const):
            result += self._level_job(i)
        # Check the intro page (all links), but only on the index.
        if result == 0 and self._link_stack[self._current_url][0] == 0:
            result += self._level_job()
        #if result == 0:
        #    # RANK DOWN
        #    self._link_item_edit(self._current_url, rank=0)
        #    print("No anchors on the page")
        return

    def _repair_links(self, base=None):
        """Try to repair a link. For now, only append '/' to the base."""
        if base is None:
            base = self.base_url
        if re.match(".*[^/]$", base):
            base += "/"
        if self.agent.get_etree() == -1:
            return -1
        links = self.agent.get_all_links(base=base)
        # Compare the links with the base url.
        for link in links:
            if not self.agent.compare_domains(self.base_url, link):
                continue
            link = re.sub("https?://.+?/", base, link)
            # If it matches, save it as a special case.
            self._link_item_init__(link, index=3)

    def _check_intro(self):
        """Check an intro page: a page without content, only with an Enter
        label."""
        links = self.agent.get_all_links(base=self._current_url)
        self.__debug("We've found intro links: " + str(links))
        for link in links:
            if not self.agent.compare_domains(self.base_url, link):
                continue
            # Save the new link as a normal page.
            self._link_item_init__(link, index=1)

    def _check_frames(self):
        """Look for frames on the page."""
        frames = self.agent.look_for_frame(base=self._current_url)
        if not frames:
            return None
        fcount = len(frames)
        self.__debug("We've found frames (" + str(fcount) + ") on " +
                     self._current_url)
        # Save the new links as frame pages.
        for link in frames:
            if self.agent.compare_domains(self._current_url, link):
                self._link_item_init__(link, index=2)
        return fcount

    def _check_titles(self):
        """Check the headers and give rank according to the result."""
        for i in range(self.rank_const):
            hcount = self.agent.count_all_headers(
                re.compile(self._sigwords[i], re.I))
            if not hcount == 0:
                if i == 0:
                    # "deliverable" match, the highest rank: the constant is
                    # multiplied by 4.
                    self.__debug("deliverable match " +
                                 str(self.rank_const * 4) + " " +
                                 self._current_url)
                    self._link_item_edit(self._current_url,
                                         rank=self.rank_const * 4)
                else:
                    # Other word match: the rank constant is not multiplied.
                    self.__debug("Rank " + str(self.rank_const - i) + " " +
                                 self._current_url)
                    self._link_item_edit(self._current_url,
                                         rank=self.rank_const - i)

    def _check_anchor(self):
        """Get information about the current link."""
        # tt is Text and Title.
        tt = self.agent.get_anchor_from_link(self._current_url)
        # Return 0 if no anchor matches.
        if tt == 0:
            return tt
        # Match for deliverables.
        if re.search(self._sigwords[0], tt, re.I):
            self.__debug("Anchor matched " + self._current_url)
            return 1

    def _check_unvisited_links(self):
        """Return the list of unvisited links. Useful in a cycle."""
        unvisitedLinks = []
        for link in self._link_stack:
            if self._link_stack[link][self.VISIT] == 0:  # if unvisited
                unvisitedLinks.append(link)
        return unvisitedLinks  # list of unvisited page links

    def _handle_unvis_links(self):
        """Apply all methods to unvisited links - the next level of
        searching. This is the main private method; only this method can
        decide the end of the search."""
        unvisLinks = self._check_unvisited_links()
        if not unvisLinks:
            return None  # end of searching
        for link in unvisLinks:  # cycle over unvisited links
            # Visit and parse the page.
            self._link_item_edit(link, visit=1)
            (res, err) = self.agent.ghap(link)
            if res == -1:
                self.__debug(str(err) + " " + str(link))
                # Unless the link is already marked broken (IND_FR == 3),
                # try to repair the links.
                if self._link_stack[link][self.IND_FR] != 3:
                    self._repair_links()
                continue
            # A little hack with the error message: there is no error but a
            # URL!
            if res == 2:
                self.base_url = err  # URL of the new base
            self.__debug("Getting url in ghap(): " + str(link))
            self.__verbose("Searching... URL: " + str(link))
            self._current_url = link
            if self._link_stack[link][self.IND_FR] == 2:
                dname = self.agent.get_domain_name(link)
                if dname is not None:
                    self.base_url = dname
            # Frame check.
            self._check_frames()
            # Titles check - rank is given here.
            self._check_titles()
            # Anchor check - rank is given here too.
            if self._check_anchor():
                self._link_item_edit(link, rank=10)
            # Search for the next links on this page.
            self._cascade_search()
        # When there are no unvisited links left in the list, return.
        return 1

    def _get_highest_ranks_link(self):
        """Return the link with the highest rank in self._link_stack.
        Called at the end of the process."""
        hRank = 0
        hLink = ""
        # Check all links and choose the one with the highest rank.
        for link in self._link_stack:
            if self._link_stack[link][self.RANK] > hRank:
                hLink = link
                hRank = self._link_stack[link][self.RANK]
        return hLink  # the winner

    def _get_deliv_link_list(self, first_link):
        """Return the list of all links leading to deliverables. Try to find
        more sites with deliverables, e.g. like www.awissenet.com has. Maybe
        test the anchor text of the link: next, prev, [0-9]+ and so on...
        A pager usually looks like: next pages: 1 2 3 4 ..."""
        final_list = []
        nonvisited = [first_link]
        current = nonvisited.pop()
        while current:
            if not current or "javascript:" in current or \
               "mailto:" in current:
                try:
                    current = nonvisited.pop()
                except IndexError:
                    break
                continue
            if self.agent.ghap(current)[0] == -1:  # CACHE? maybe
                try:
                    current = nonvisited.pop()
                except IndexError:
                    break
                continue
            nonvisited.extend(self.agent.get_pager_links(base=current))
            final_list.append(current)  # append only one link
            try:
                current = nonvisited.pop()
            except IndexError:
                break
        return final_list  # all pages with deliverables

    def get_deliverable_page(self):
        """Return a list of links to pages with deliverable documents, or an
        error value from derrno if none was found. The only public method in
        the module."""
        # The main searching loop: while we have some unvisited links,
        # search.
        while self._handle_unvis_links():
            # Safety limit.
            if len(self._link_stack) > 10:
                break
        self.__debug("Stack content: " + str(self._link_stack))
        if len(self._link_stack) == 1:
            return derrno.__err__(derrno.ELNOTFOUND)
        final_link = self._get_highest_ranks_link()
        if not final_link or self._link_stack[final_link][2] == 0:
            return derrno.__err__(derrno.ELNOTFOUND)
        self.__debug('#' * 79)
        self.__debug("DELIVERABLE PAGE: " + final_link)
        return [final_link]
        ####### not in use #############
        result = self._get_deliv_link_list(final_link)
        if len(result) == 0:
            return derrno.__err__(derrno.ELNOTFOUND)
        else:
            return result
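

# End-to-end usage sketch: locate the deliverables page of a project site
# with GetDelivPage, then extract the deliverable records from it with
# GetDelivRecords. The URL is a made-up placeholder, and the isinstance()
# check is a guess at the derrno convention (success returns a list of
# links, failure a derrno error value).
if __name__ == "__main__":
    finder = GetDelivPage("http://www.example-project.eu/", verbose=True)
    pages = finder.get_deliverable_page()
    if isinstance(pages, list):
        extractor = GetDelivRecords(verbose=True)
        extractor.process_pages(pages)
        print(extractor.get_deliverables_XML())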