Example #1
0
def compareDocs(driver1, driver2):

    #global div_compare_mem;

    src1 = removeCommentsAndJS(driver1.page_source)
    tree1 = lxml.html.fromstring(src1)
    etree1 = etree.ElementTree(tree1)

    src2 = removeCommentsAndJS(driver2.page_source)
    tree2 = lxml.html.fromstring(src2)
    etree2 = etree.ElementTree(tree2)

    print "Built both trees\n"

    # drop all memorized results
    div_compare_mem = {}
    print "Looking for subtrees of first doc in another\n"
    calcRelationships(div_compare_mem, tree1, etree1, tree2, etree2)
    print "Done.\n"
    xpaths_to_hide1 = []
    for (x, y) in div_compare_mem:
        if div_compare_mem[(x, y)] > 0:
            xpaths_to_hide1.append(x)

    dimOpacityFromXpath(driver1, xpaths_to_hide1, 0.2)

    div_compare_mem_rev = {}
    for (x, y) in div_compare_mem:
        if div_compare_mem[(x, y)] == 2:
            div_compare_mem_rev[(y, x)] = 0

        elif div_compare_mem[(x, y)] == 1:
            div_compare_mem_rev[(y, x)] = 1

        #elif div_compare_mem[(x,y)] == 0:
        #	div_compare_mem_rev[(y,x)] = 0

    #div_compare_mem = div_compare_mem_rev
    print "Looking for subtrees of second doc in first\n"
    calcRelationships(div_compare_mem_rev, tree2, etree2, tree1, etree1)
    print "Done.\n"
    xpaths_to_hide2 = []
    for (x, y) in div_compare_mem_rev:
        if div_compare_mem_rev[(x, y)] > 0:
            xpaths_to_hide2.append(x)

    print "Setting Opacities\n"
    dimOpacityFromXpath(driver2, xpaths_to_hide2, 0.2)

    return (xpaths_to_hide1, xpaths_to_hide2)
Example #2
0
    def buildTreeFromSource(self, source):
        """Clean *source* HTML and cache it plus its parsed trees on self.

        Strips <head> sections, comments and scripts, then stores the result
        in self.source, self.tree1 (lxml element) and self.etree1
        (ElementTree wrapper). Does nothing when source is empty/None.
        """
        if not source:
            return
        cleaned = removeCommentsAndJS(removeHeads(source))
        self.source = cleaned
        self.tree1 = lxml.html.fromstring(cleaned)
        self.etree1 = etree.ElementTree(self.tree1)
Example #3
0
    def compareOuterHtml(self, outer_html1, outer_html2):
        """Compare two outer-HTML fragments with a tag-specific comparator.

        Both fragments are cleaned (comments/JS removed) and parsed; a
        comparator is then chosen by the root tag (option, img, a, li, ul,
        div). Returns that comparator's score, or 0 when either fragment is
        empty, the root tags differ, or no comparator handles the tag.
        """
        outer_html1 = removeCommentsAndJS(outer_html1).strip()
        outer_html2 = removeCommentsAndJS(outer_html2).strip()

        # Guard clauses: nothing to compare once cleaning empties a fragment.
        if not outer_html1 or not outer_html2:
            return 0

        tree1 = lxml.html.fromstring(outer_html1)
        etree1 = etree.ElementTree(tree1)
        tree2 = lxml.html.fromstring(outer_html2)
        etree2 = etree.ElementTree(tree2)

        # Different root tags can never match.
        if tree1.tag != tree2.tag:
            return 0

        # Scratch memo table for the comparators that need one.
        div_compare_mem = {}

        tag = tree1.tag
        if tag == 'option':
            return self.compareOptionTag(tree1, etree1, tree2, etree2)
        if tag == 'img':
            return self.compareImgTag(tree1, etree1, tree2, etree2)
        if tag == 'a':
            return self.compareATag(tree1, etree1, tree2, etree2)
        if tag == 'li':
            return self.compareLITag(div_compare_mem, tree1, etree1, tree2,
                                     etree2)
        if tag == 'ul':
            return self.compareULTag(div_compare_mem, tree1, etree1, tree2,
                                     etree2)
        if tag == 'div':
            return self.compareElements(div_compare_mem, tree1, etree1,
                                        tree2, etree2)
        return 0
Example #4
0
    def export_schedule(self, out_file=None):
        """Render the schedule as an HTML document.

        Builds an <html> tree containing a utf-8 meta tag, a title (the
        'html_title' option when set, else the schedule's name), the module's
        `css` as an inline stylesheet, an optional 'html_table_header'
        fragment, and a table with one row per task (via self._export_task).

        out_file: optional target passed to ElementTree.write(); when given
            the document is also written there pretty-printed as UTF-8.
        Returns the serialized HTML document as a string.
        """
        e_html = etree.Element('html')
        e_head = etree.SubElement(e_html, 'head')

        # Declare the encoding early in <head>.
        etree.SubElement(e_head, 'meta', charset="utf-8")

        # An explicit html_title option wins over the schedule's own name.
        if self.options.get('html_title', False):
            title = self.options['html_title']
        else:
            title = self.schedule.name

        e_title = etree.SubElement(e_head, 'title')
        e_title.text = title

        # `css` is a module-level stylesheet string.
        e_style = etree.SubElement(e_head, 'style', type='text/css')
        e_style.text = css

        e_body = etree.SubElement(e_html, 'body')

        e_h1 = etree.SubElement(e_body, 'h1')
        e_h1.text = title

        # Optional caller-provided HTML fragment inserted above the table.
        if self.options.get('html_table_header', False):
            e_body.append(etree.fromstring(self.options['html_table_header']))

        e_table = etree.SubElement(e_body,
                                   'table',
                                   attrib={
                                       'align': 'center',
                                       'class': 'schedule'
                                   })
        e_tr_head = etree.SubElement(e_table, 'tr')
        head_columns = ['HierarchIndex', 'Name', 'Start', 'End', 'Duration']
        for column in head_columns:
            e_th_head = etree.SubElement(e_tr_head, 'th')
            e_th_head.text = column

        for index, task in enumerate(self.schedule.tasks):
            self._export_task(e_table, task, index + 1)

        etree_return = etree.ElementTree(e_html)
        if out_file:
            etree_return.write(out_file,
                               pretty_print=True,
                               encoding="utf-8",
                               xml_declaration=False)

        # BUG FIX: str(etree_return) returned the ElementTree object's repr
        # (e.g. "<lxml.etree._ElementTree object at 0x...>"), not the
        # document. Serialize the tree properly instead; encoding='unicode'
        # makes lxml return a native string.
        return etree.tostring(e_html, pretty_print=True, encoding='unicode')
Example #5
0
def analyze_home_page(brand):

	driver = webdriver.Firefox()
	url = brand.start_url

	print "Fetching the start url %s for brand %s " % (url, brand.name)

	driver.get(url)
	src = driver.page_source
	tree = lxml.html.fromstring(src)
	etree1 = etree.ElementTree(tree)
	root = etree1.getroot()

	analyze_page(driver)

	# after this, we should find all category pages
	# then crawl these category pages and fine the content of the sub-navigation
	# visit each one of these and call analyze_page() on all of them

	# TODO 1: handle iframes
	# TODO 2: handle text that contains keywords

	driver.quit()
Example #6
0
    def findReviewUsingLiguisticHints(self, product_div):
        """Heuristically locate customer-review elements below *product_div*.

        Scans every leaf text node of the current page for first-person
        pronouns ("i", "me", "my", ...), taken as a linguistic hint that the
        text is a customer review. The id and class values shared by the
        most such leaves are tallied; the winner is used to fetch candidate
        elements via selenium, which are filtered to those below the product
        image and finally collapsed to the nearest single ancestor enclosing
        all of them when one exists.

        product_div: selenium WebElement for the product image/area; its
            bottom edge (y2) is the cut-off line for candidates.
        Returns a list of selenium WebElements (possibly empty).
        """

        driver = self.reader.getDriver()

        # Geometry of the product element. Only product_div_y2 (the bottom
        # edge) is actually used below; the rest is computed but unused.
        product_div_area = product_div.size['width'] * product_div.size[
            'height']
        product_div_x1 = product_div.location['x']
        product_div_x2 = product_div_x1 + product_div.size['width']
        product_div_y1 = product_div.location['y']
        product_div_y2 = product_div_y1 + product_div.size['height']

        html = driver.page_source
        html = removeCommentsAndJS(html)
        # NOTE(review): the cleaned `html` above is never used -- the tree is
        # parsed from the raw page_source again here; confirm which source
        # was intended.
        tree1 = lxml.html.fromstring(driver.page_source)
        etree1 = etree.ElementTree(tree1)

        review_divs = []
        # reviewXPaths collects [xpath, text length] per pronoun-bearing leaf.
        reviewXPaths = []
        reviewtextslen = []  # NOTE(review): never used below
        childList = []
        childList.append(tree1)
        i = 0
        # Track the two longest review texts and their xpaths; currently only
        # consumed by the commented-out common-prefix logic near the end.
        first_largest_len = 0
        second_largest_len = 0
        f_xpath = ''
        s_xpath = ''

        # Breadth-first traversal of the whole tree, using childList as a
        # growing queue and i as the read cursor.
        while i < len(childList):

            nextChild = childList[i]
            nextChildsChildren = list(nextChild)
            if nextChildsChildren:
                for ncc in nextChildsChildren:
                    childList.append(ncc)
            else:
                # Leaf element: inspect its text content.
                text = nextChild.text_content()
                if text:
                    text = text.encode("ascii", "ignore")
                    text = text.lower()
                    spaceCount = text.count(' ')
                    # Require at least ~4 words before treating it as prose.
                    if spaceCount >= 3:
                        # Replace punctuation with spaces so the " word "
                        # membership tests below also match at boundaries.
                        text = text.replace('.', ' ')
                        text = text.replace('$', ' ')
                        text = text.replace('!', ' ')
                        text = text.replace(',', ' ')
                        text = text.replace('?', ' ')
                        text = text.replace('\n', ' ')

                        # First-person pronouns as a proxy for review prose.
                        hasreview = False
                        if not hasreview:
                            hasreview = " i " in text
                        if not hasreview:
                            hasreview = " me " in text
                        if not hasreview:
                            hasreview = " my " in text
                        if not hasreview:
                            hasreview = " myself " in text
                        if not hasreview:
                            hasreview = " we " in text

                        if hasreview:
                            xpath = etree1.getpath(nextChild)
                            reviewXPaths.append([xpath, len(text)])

                            # keep track of xpaths of first 2 largest review elements
                            if len(text) >= first_largest_len:
                                second_largest_len = first_largest_len
                                first_largest_len = len(text)
                                s_xpath = f_xpath
                                f_xpath = xpath

                            elif second_largest_len < len(text) and len(
                                    text) < first_largest_len:
                                second_largest_len = len(text)
                                s_xpath = xpath

            i = i + 1

        #print reviewXPaths

        # Vote: count how often each id value and each class value appears on
        # the review leaves found above, and remember the top scorer of each.
        maxid = 0
        maxclass = 0
        maxid_name = ''
        maxclass_name = ''

        idMap = {}
        for (r, l) in reviewXPaths:
            rid = r + "/@id"
            ids = etree1.xpath(rid)
            for id in ids:
                if id in idMap:
                    idMap[id] = idMap[id] + 1
                else:
                    idMap[id] = 1

        for e in idMap:
            if idMap[e] > maxid:
                maxid = idMap[e]
                maxid_name = e

        classMap = {}
        for (r, l) in reviewXPaths:
            rclass = r + "/@class"
            classes = etree1.xpath(rclass)
            for aclass in classes:
                if aclass in classMap:
                    classMap[aclass] = classMap[aclass] + 1
                else:
                    classMap[aclass] = 1

        for e in classMap:
            if classMap[e] > maxclass:
                maxclass = classMap[e]
                maxclass_name = e

        #if maxid == 0 and maxclass == 0
        #	return []

        # NOTE(review): an exact tie (maxid == maxclass, including both
        # nonzero) falls through to the else branch and gives up -- confirm
        # that is intended.
        haha_i_found_yous = []
        if maxid > maxclass and maxid_name:
            haha_i_found_yous = driver.find_elements_by_id(maxid_name)
        elif maxid < maxclass and maxclass_name:
            haha_i_found_yous = driver.find_elements_by_class_name(
                maxclass_name)
        else:
            return []  # couldnt find anything

        if not haha_i_found_yous:
            return []  # couldnt find anything below product img

        # filter elements above or at product img level
        elements_below_product_div = []
        for e in haha_i_found_yous:
            y1 = e.location['y']
            y2 = e.location['y'] + e.size['height']
            # Keep only elements whose bottom edge is below the product's.
            if not (y2 > product_div_y2):
                continue
            else:
                elements_below_product_div.append(e)

        haha_i_found_yous = elements_below_product_div

        if not haha_i_found_yous:
            return []  # couldnt find anything below product img

        # now recurse up to find the nearest encompassing rectangle
        #
        # Bounding box (x1, y1)-(x2, y2) over all surviving candidates.
        x = []
        y = []
        for e in haha_i_found_yous:
            x.append(e.location['x'])
            x.append(e.location['x'] + e.size['width'])
            y.append(e.location['y'])
            y.append(e.location['y'] + e.size['height'])

        x.sort()
        y.sort()
        x1 = x[0]
        x2 = x[len(x) - 1]
        y1 = y[0]
        y2 = y[len(y) - 1]

        review_divs = []
        crappy_element = None
        # Walk up the ancestor chain from the first candidate until some
        # ancestor encloses the whole bounding box, or we leave the
        # below-product region, or we reach <body> / run out of parents.
        pEle = haha_i_found_yous[0]
        while True:
            px1 = pEle.location['x']
            px2 = pEle.location['x'] + pEle.size['width']
            py1 = pEle.location['y']
            py2 = pEle.location['y'] + pEle.size['height']

            print px1, px2, py1, py2

            # if the element-being-considered is not location below product image,then break
            if not (py2 > product_div_y2):
                break

            # does rectangle given by x1, x2 ,y1, y2 fall within px1, px2, py1, py2
            if px1 <= x1 and x2 <= px2 and py1 <= y1 and y2 <= py2:
                crappy_element = pEle
                break
            # NOTE(review): 'tag' is not a normal HTML attribute, so
            # get_attribute('tag') likely returns None on every element
            # (tag_name was probably intended); the body check below may
            # never fire -- verify.
            tag = pEle.get_attribute('tag')
            if tag:
                if tag.lower() == 'body':
                    break
            # NOTE(review): local `re` shadows the stdlib re module name.
            re = self.reader.getParentOfWebElement(pEle)
            if re[0] == 1 and re[1] != None:
                pEle = re[1]
            else:
                break

        # Prefer the single enclosing ancestor when found; otherwise return
        # each surviving candidate individually.
        if crappy_element is not None:
            review_divs.append(crappy_element)
        else:
            for an_element in haha_i_found_yous:
                review_divs.append(an_element)

#		if f_xpath and s_xpath:
#
#			i = 0
#			xpath1_len = len(f_xpath)
#			xpath2_len = len(s_xpath)
#
#			while i < xpath1_len and i < xpath2_len:
#				if f_xpath[i] != s_xpath[i]:
#					xpath = f_xpath[:i]
#					break
#				i = i+1
#
#			if xpath:
#				reviewXPaths = [xpath]

#		i = 0
#		review_divs = []
#		while i < len(reviewXPaths):
#			rd = reader.getElementsByXPath(reviewXPaths[i])
#			if rd:
#				review_divs = review_divs + rd
#			i = i +1
#
        print reviewXPaths

        return review_divs