示例#1
0
def calcTwoHTMLDistance(contents1, contents2):
    """Compute the distance between two HTML documents.

    Each document is parsed with BeautifulSoup's "html5lib" parser and its
    ``<html>`` subtree is walked into a fresh ``Node("doc")`` by
    ``traverseDOMTree``.  ``getLDPairRepr`` then yields, per document, a
    triple that is unpacked as ``(ld, script_hosts, script_contents)``.
    ``mmdiff`` is run on the two ``ld`` values and its result is handed,
    together with both documents' script data, to ``mmdiffR``.

    Args:
        contents1: Markup of the first document.
        contents2: Markup of the second document.
            NOTE(review): the retry path calls ``.decode('utf-8')`` on the
            input, so UTF-8 byte strings are presumably expected -- confirm
            against callers.

    Returns:
        Whatever ``mmdiffR`` returns for the two documents.
    """

    def _parse(contents):
        # Single retry: when the first parse raises, report the error and
        # parse the UTF-8 decoded input instead.  A failure of the retry
        # propagates to the caller.
        try:
            return BeautifulSoup(contents, "html5lib")
        except Exception as e:
            # Two spaces before the message reproduce the output of the
            # former ``print "... ", str(e)`` statement; the single-argument
            # call is valid on both Python 2 and Python 3.
            print("Error parsing DOM using html5  %s" % str(e))
            return BeautifulSoup(contents.decode('utf-8'), "html5lib")

    soup1 = _parse(contents1)
    soup2 = _parse(contents2)

    node1 = Node("doc")
    node2 = Node("doc")
    traverseDOMTree(soup1.html, node1, 0)
    traverseDOMTree(soup2.html, node2, 0)
    ld1, ld1_script_hosts, ld1_script_contents = getLDPairRepr(node1)
    ld2, ld2_script_hosts, ld2_script_contents = getLDPairRepr(node2)
    print("script length for ld1: %d %d " % (len(ld1_script_hosts),
                                             len(ld1_script_contents)))
    print("script length for ld2: %d %d " % (len(ld2_script_hosts),
                                             len(ld2_script_contents)))
    D = mmdiff(ld1, ld2)
    return mmdiffR(ld1, ld2, D,
                   ld1_script_hosts, ld1_script_contents,
                   ld2_script_hosts, ld2_script_contents)
示例#2
0
def extractScriptFromContents(contents):
    """Extract script hosts and script contents from an HTML document.

    The markup is parsed with BeautifulSoup's "html5lib" parser, its
    ``<html>`` subtree is walked into a fresh ``Node("doc")`` by
    ``traverseDOMTree``, and ``extractScriptFromDOMTree`` is run on that
    node.

    Args:
        contents: Markup of the document.
            NOTE(review): the retry path calls ``.decode('utf-8')`` on the
            input, so a UTF-8 byte string is presumably expected -- confirm
            against callers.

    Returns:
        ``(None, None)`` when ``contents`` is ``None`` or empty; otherwise
        the ``(script_hosts, script_contents)`` pair obtained from
        ``extractScriptFromDOMTree``.
    """
    if contents is None or len(contents) == 0:
        return None, None
    try:
        soup = BeautifulSoup(contents, "html5lib")
    except Exception as e:
        # Two spaces before the message reproduce the output of the former
        # ``print "... ", str(e)`` statement; the single-argument call is
        # valid on both Python 2 and Python 3.
        print("Error parsing DOM using html5  %s" % str(e))
        # Retry once on the decoded text; a second failure propagates.
        soup = BeautifulSoup(contents.decode('utf-8'), "html5lib")
    node = Node("doc")
    traverseDOMTree(soup.html, node, 0)
    script_hosts, script_contents = extractScriptFromDOMTree(node)
    return script_hosts, script_contents
示例#3
0
def extractScriptFromContents(contents):
  """Extract script hosts and script contents from an HTML document.

  The markup is parsed with BeautifulSoup's "html5lib" parser, its
  ``<html>`` subtree is walked into a fresh ``Node("doc")`` by
  ``traverseDOMTree``, and ``extractScriptFromDOMTree`` is run on that
  node.

  Args:
    contents: Markup of the document.
      NOTE(review): the retry path calls ``.decode('utf-8')`` on the
      input, so a UTF-8 byte string is presumably expected -- confirm
      against callers.

  Returns:
    ``(None, None)`` when ``contents`` is ``None`` or empty; otherwise
    the ``(script_hosts, script_contents)`` pair obtained from
    ``extractScriptFromDOMTree``.
  """
  if contents is None or len(contents) == 0:
    return None, None
  try:
    soup = BeautifulSoup(contents, "html5lib")
  except Exception as e:
    # Two spaces before the message reproduce the output of the former
    # ``print "... ",str(e)`` statement; the single-argument call is
    # valid on both Python 2 and Python 3.
    print("Error parsing DOM using html5  %s" % str(e))
    # Retry once on the decoded text; a second failure propagates.
    soup = BeautifulSoup(contents.decode('utf-8'), "html5lib")
  node = Node("doc")
  traverseDOMTree(soup.html, node, 0)
  script_hosts, script_contents = extractScriptFromDOMTree(node)
  return script_hosts, script_contents
示例#4
0
def calcTwoHTMLDistance(contents1, contents2):
  """Compute the distance between two HTML documents.

  Each document is parsed with BeautifulSoup's "html5lib" parser and its
  ``<html>`` subtree is walked into a fresh ``Node("doc")`` by
  ``traverseDOMTree``.  ``getLDPairRepr`` then yields, per document, a
  triple that is unpacked as ``(ld, script_hosts, script_contents)``.
  ``mmdiff`` is run on the two ``ld`` values and its result is handed,
  together with both documents' script data, to ``mmdiffR``.

  Args:
    contents1: Markup of the first document.
    contents2: Markup of the second document.
      NOTE(review): the retry path calls ``.decode('utf-8')`` on the
      input, so UTF-8 byte strings are presumably expected -- confirm
      against callers.

  Returns:
    Whatever ``mmdiffR`` returns for the two documents.
  """

  def _parse(contents):
    # Single retry: when the first parse raises, report the error and
    # parse the UTF-8 decoded input instead.  A failure of the retry
    # propagates to the caller.
    try:
      return BeautifulSoup(contents, "html5lib")
    except Exception as e:
      # Two spaces before the message reproduce the output of the former
      # ``print "... ", str(e)`` statement; the single-argument call is
      # valid on both Python 2 and Python 3.
      print("Error parsing DOM using html5  %s" % str(e))
      return BeautifulSoup(contents.decode('utf-8'), "html5lib")

  soup1 = _parse(contents1)
  soup2 = _parse(contents2)

  node1 = Node("doc")
  node2 = Node("doc")
  traverseDOMTree(soup1.html, node1, 0)
  traverseDOMTree(soup2.html, node2, 0)
  ld1, ld1_script_hosts, ld1_script_contents = getLDPairRepr(node1)
  ld2, ld2_script_hosts, ld2_script_contents = getLDPairRepr(node2)
  print("script length for ld1: %d %d " % (len(ld1_script_hosts), len(ld1_script_contents)))
  print("script length for ld2: %d %d " % (len(ld2_script_hosts), len(ld2_script_contents)))
  D = mmdiff(ld1, ld2)
  return mmdiffR(ld1, ld2, D,
                 ld1_script_hosts, ld1_script_contents,
                 ld2_script_hosts, ld2_script_contents)