def compareDocs(driver1, driver2): #global div_compare_mem; src1 = removeCommentsAndJS(driver1.page_source) tree1 = lxml.html.fromstring(src1) etree1 = etree.ElementTree(tree1) src2 = removeCommentsAndJS(driver2.page_source) tree2 = lxml.html.fromstring(src2) etree2 = etree.ElementTree(tree2) print "Built both trees\n" # drop all memorized results div_compare_mem = {} print "Looking for subtrees of first doc in another\n" calcRelationships(div_compare_mem, tree1, etree1, tree2, etree2) print "Done.\n" xpaths_to_hide1 = [] for (x, y) in div_compare_mem: if div_compare_mem[(x, y)] > 0: xpaths_to_hide1.append(x) dimOpacityFromXpath(driver1, xpaths_to_hide1, 0.2) div_compare_mem_rev = {} for (x, y) in div_compare_mem: if div_compare_mem[(x, y)] == 2: div_compare_mem_rev[(y, x)] = 0 elif div_compare_mem[(x, y)] == 1: div_compare_mem_rev[(y, x)] = 1 #elif div_compare_mem[(x,y)] == 0: # div_compare_mem_rev[(y,x)] = 0 #div_compare_mem = div_compare_mem_rev print "Looking for subtrees of second doc in first\n" calcRelationships(div_compare_mem_rev, tree2, etree2, tree1, etree1) print "Done.\n" xpaths_to_hide2 = [] for (x, y) in div_compare_mem_rev: if div_compare_mem_rev[(x, y)] > 0: xpaths_to_hide2.append(x) print "Setting Opacities\n" dimOpacityFromXpath(driver2, xpaths_to_hide2, 0.2) return (xpaths_to_hide1, xpaths_to_hide2)
def buildTreeFromSource(self, source):
    """Parse *source* into lxml trees and cache them on the instance.

    The markup is first cleaned through the project's removeHeads and
    removeCommentsAndJS helpers; the cleaned text is stored as
    ``self.source`` and the parsed trees as ``self.tree1`` /
    ``self.etree1``.  Does nothing when *source* is empty or None.
    """
    if not source:
        return
    cleaned = removeCommentsAndJS(removeHeads(source))
    self.source = cleaned
    self.tree1 = lxml.html.fromstring(cleaned)
    self.etree1 = etree.ElementTree(self.tree1)
def compareOuterHtml(self, outer_html1, outer_html2):
    """Score the similarity of two outer-HTML fragments.

    Both fragments are cleaned and parsed; fragments that are empty,
    have different root tags, or have an unrecognized root tag score 0.
    Otherwise dispatches to the tag-specific comparison method and
    returns its result.
    """
    cleaned1 = removeCommentsAndJS(outer_html1).strip()
    cleaned2 = removeCommentsAndJS(outer_html2).strip()
    if not cleaned1 or not cleaned2:
        return 0
    node1 = lxml.html.fromstring(cleaned1)
    doc1 = etree.ElementTree(node1)
    node2 = lxml.html.fromstring(cleaned2)
    doc2 = etree.ElementTree(node2)
    if node1.tag != node2.tag:
        return 0
    # Memo table shared by the structural (list/element) comparisons.
    memo = {}
    tag = node1.tag
    if tag == 'option':
        return self.compareOptionTag(node1, doc1, node2, doc2)
    if tag == 'img':
        return self.compareImgTag(node1, doc1, node2, doc2)
    if tag == 'a':
        return self.compareATag(node1, doc1, node2, doc2)
    if tag == 'li':
        return self.compareLITag(memo, node1, doc1, node2, doc2)
    if tag == 'ul':
        return self.compareULTag(memo, node1, doc1, node2, doc2)
    if tag == 'div':
        return self.compareElements(memo, node1, doc1, node2, doc2)
    return 0
def export_schedule(self, out_file=None):
    """Render the schedule as an HTML document.

    Builds an <html> tree containing a title (from the 'html_title'
    option, falling back to the schedule name), the module-level `css`
    stylesheet, an optional user-supplied table header fragment
    ('html_table_header' option), and one table row per task via
    self._export_task.

    :param out_file: optional path/file object; when given, the document
        is also written there (utf-8, pretty-printed, no XML declaration).
    :returns: the serialized HTML markup.
    """
    e_html = etree.Element('html')
    e_head = etree.SubElement(e_html, 'head')
    etree.SubElement(e_head, 'meta', charset="utf-8")
    if self.options.get('html_title', False):
        title = self.options['html_title']
    else:
        title = self.schedule.name
    e_title = etree.SubElement(e_head, 'title')
    e_title.text = title
    e_style = etree.SubElement(e_head, 'style', type='text/css')
    e_style.text = css
    e_body = etree.SubElement(e_html, 'body')
    e_h1 = etree.SubElement(e_body, 'h1')
    e_h1.text = title
    # Optional raw-HTML fragment injected above the table.
    if self.options.get('html_table_header', False):
        e_body.append(etree.fromstring(self.options['html_table_header']))
    e_table = etree.SubElement(e_body, 'table', attrib={
        'align': 'center',
        'class': 'schedule'
    })
    e_tr_head = etree.SubElement(e_table, 'tr')
    for column in ('HierarchIndex', 'Name', 'Start', 'End', 'Duration'):
        e_th_head = etree.SubElement(e_tr_head, 'th')
        e_th_head.text = column
    for index, task in enumerate(self.schedule.tasks):
        self._export_task(e_table, task, index + 1)
    etree_return = etree.ElementTree(e_html)
    if out_file:
        etree_return.write(out_file, pretty_print=True,
                           encoding="utf-8", xml_declaration=False)
    # BUG FIX: str(ElementTree) returns the object's repr
    # ("<lxml.etree._ElementTree object at 0x...>"), not the markup.
    # Serialize properly instead.
    return etree.tostring(etree_return, pretty_print=True)
def analyze_home_page(brand): driver = webdriver.Firefox() url = brand.start_url print "Fetching the start url %s for brand %s " % (url, brand.name) driver.get(url) src = driver.page_source tree = lxml.html.fromstring(src) etree1 = etree.ElementTree(tree) root = etree1.getroot() analyze_page(driver) # after this, we should find all category pages # then crawl these category pages and fine the content of the sub-navigation # visit each one of these and call analyze_page() on all of them # TODO 1: handle iframes # TODO 2: handle text that contains keywords driver.quit()
def findReviewUsingLiguisticHints(self, product_div):
    """Locate likely customer-review elements below the product image.

    Walks the page's DOM looking at leaf text for first-person pronouns
    (" i ", " me ", " my ", " myself ", " we ") as a linguistic hint that
    the text is a review.  Counts which id/class attribute occurs most
    often among the hinted elements, re-finds those elements via the
    live driver, keeps only the ones positioned below *product_div*,
    then walks up the parent chain to find the nearest single ancestor
    rectangle enclosing them all.

    Returns a list of selenium WebElements believed to contain reviews
    (possibly empty).
    """
    driver = self.reader.getDriver()
    # Geometry of the product image; only product_div_y2 (its bottom
    # edge) is actually used below.  product_div_area and the x bounds
    # are computed but unused.
    product_div_area = product_div.size['width'] * product_div.size[
        'height']
    product_div_x1 = product_div.location['x']
    product_div_x2 = product_div_x1 + product_div.size['width']
    product_div_y1 = product_div.location['y']
    product_div_y2 = product_div_y1 + product_div.size['height']
    # NOTE(review): `html` is cleaned but never used — the tree is built
    # from the raw driver.page_source instead.  Possibly unintentional.
    html = driver.page_source
    html = removeCommentsAndJS(html)
    tree1 = lxml.html.fromstring(driver.page_source)
    etree1 = etree.ElementTree(tree1)
    review_divs = []
    reviewXPaths = []
    reviewtextslen = []  # unused
    # Iterative BFS over the tree: childList doubles as the work queue.
    childList = []
    childList.append(tree1)
    i = 0
    # Track the two longest review texts seen (their xpaths); computed
    # but only consumed by the commented-out block at the end.
    first_largest_len = 0
    second_largest_len = 0
    f_xpath = ''
    s_xpath = ''
    while i < len(childList):
        nextChild = childList[i]
        nextChildsChildren = list(nextChild)
        if nextChildsChildren:
            # Non-leaf: enqueue children, only leaves are text-scanned.
            for ncc in nextChildsChildren:
                childList.append(ncc)
        else:
            text = nextChild.text_content()
            if text:
                text = text.encode("ascii", "ignore")
                text = text.lower()
                spaceCount = text.count(' ')
                # Require at least 4 words before treating it as prose.
                if spaceCount >= 3:
                    # Normalize punctuation to spaces so pronoun tokens
                    # like " i " match at clause boundaries.
                    text = text.replace('.', ' ')
                    text = text.replace('$', ' ')
                    text = text.replace('!', ' ')
                    text = text.replace(',', ' ')
                    text = text.replace('?', ' ')
                    text = text.replace('\n', ' ')
                    hasreview = False
                    if not hasreview:
                        hasreview = " i " in text
                    if not hasreview:
                        hasreview = " me " in text
                    if not hasreview:
                        hasreview = " my " in text
                    if not hasreview:
                        hasreview = " myself " in text
                    if not hasreview:
                        hasreview = " we " in text
                    if hasreview:
                        xpath = etree1.getpath(nextChild)
                        reviewXPaths.append([xpath, len(text)])
                        # keep track of xpaths of first 2 largest review elements
                        if len(text) >= first_largest_len:
                            second_largest_len = first_largest_len
                            first_largest_len = len(text)
                            s_xpath = f_xpath
                            f_xpath = xpath
                        elif second_largest_len < len(text) and len(
                                text) < first_largest_len:
                            second_largest_len = len(text)
                            s_xpath = xpath
        i = i + 1
    #print reviewXPaths
    # Vote on the most common id and class among the hinted elements.
    maxid = 0
    maxclass = 0
    maxid_name = ''
    maxclass_name = ''
    idMap = {}
    for (r, l) in reviewXPaths:
        rid = r + "/@id"
        ids = etree1.xpath(rid)
        for id in ids:  # NOTE: shadows the builtin `id`
            if id in idMap:
                idMap[id] = idMap[id] + 1
            else:
                idMap[id] = 1
    for e in idMap:
        if idMap[e] > maxid:
            maxid = idMap[e]
            maxid_name = e
    classMap = {}
    for (r, l) in reviewXPaths:
        rclass = r + "/@class"
        classes = etree1.xpath(rclass)
        for aclass in classes:
            if aclass in classMap:
                classMap[aclass] = classMap[aclass] + 1
            else:
                classMap[aclass] = 1
    for e in classMap:
        if classMap[e] > maxclass:
            maxclass = classMap[e]
            maxclass_name = e
    #if maxid == 0 and maxclass == 0
    #    return []
    # Re-find the winning id/class via the live driver; a tie (or no
    # winner name) gives up.
    haha_i_found_yous = []
    if maxid > maxclass and maxid_name:
        haha_i_found_yous = driver.find_elements_by_id(maxid_name)
    elif maxid < maxclass and maxclass_name:
        haha_i_found_yous = driver.find_elements_by_class_name(
            maxclass_name)
    else:
        return []  # couldnt find anything
    if not haha_i_found_yous:
        return []  # couldnt find anything below product img
    # filter elements above or at product img level
    elements_below_product_div = []
    for e in haha_i_found_yous:
        y1 = e.location['y']
        y2 = e.location['y'] + e.size['height']
        if not (y2 > product_div_y2):
            continue
        else:
            elements_below_product_div.append(e)
    haha_i_found_yous = elements_below_product_div
    if not haha_i_found_yous:
        return []  # couldnt find anything below product img
    # now recurse up to find the nearest encompassing rectangle
    #
    # Bounding box (x1, y1)-(x2, y2) over all surviving candidates.
    x = []
    y = []
    for e in haha_i_found_yous:
        x.append(e.location['x'])
        x.append(e.location['x'] + e.size['width'])
        y.append(e.location['y'])
        y.append(e.location['y'] + e.size['height'])
    x.sort()
    y.sort()
    x1 = x[0]
    x2 = x[len(x) - 1]
    y1 = y[0]
    y2 = y[len(y) - 1]
    review_divs = []
    crappy_element = None
    # Walk up from the first candidate until an ancestor encloses the
    # whole bounding box (or we leave the below-product region / hit body).
    pEle = haha_i_found_yous[0]
    while True:
        px1 = pEle.location['x']
        px2 = pEle.location['x'] + pEle.size['width']
        py1 = pEle.location['y']
        py2 = pEle.location['y'] + pEle.size['height']
        print px1, px2, py1, py2
        # if the element-being-considered is not location below product image,then break
        if not (py2 > product_div_y2):
            break
        # does rectangle given by x1, x2 ,y1, y2 fall within px1, px2, py1, py2
        if px1 <= x1 and x2 <= px2 and py1 <= y1 and y2 <= py2:
            crappy_element = pEle
            break
        tag = pEle.get_attribute('tag')
        if tag:
            if tag.lower() == 'body':
                break
        # NOTE: `re` here shadows the `re` module name.
        # getParentOfWebElement presumably returns (status, parent) —
        # status 1 meaning success; verify against reader implementation.
        re = self.reader.getParentOfWebElement(pEle)
        if re[0] == 1 and re[1] != None:
            pEle = re[1]
        else:
            break
    if crappy_element is not None:
        review_divs.append(crappy_element)
    else:
        # No single enclosing ancestor found: return the candidates as-is.
        for an_element in haha_i_found_yous:
            review_divs.append(an_element)
    # if f_xpath and s_xpath:
    #
    #     i = 0
    #     xpath1_len = len(f_xpath)
    #     xpath2_len = len(s_xpath)
    #
    #     while i < xpath1_len and i < xpath2_len:
    #         if f_xpath[i] != s_xpath[i]:
    #             xpath = f_xpath[:i]
    #             break
    #         i = i+1
    #
    #     if xpath:
    #         reviewXPaths = [xpath]
    # i = 0
    # review_divs = []
    # while i < len(reviewXPaths):
    #     rd = reader.getElementsByXPath(reviewXPaths[i])
    #     if rd:
    #         review_divs = review_divs + rd
    #     i = i +1
    # print reviewXPaths
    return review_divs