# Example #1
0
def test_dom_simi():
	"""Print the similarity of the two sample documents' root elements.

	Parses ``_html1`` and ``_html2`` with ``minidom.parse_xml_to_document``
	and prints whatever ``domsimi.compute_simi`` returns for the two
	``documentElement`` nodes. Nothing is returned or asserted.
	"""
	doc1 = minidom.parse_xml_to_document(_html1)
	doc2 = minidom.parse_xml_to_document(_html2)

	# NOTE(review): earlier revisions kept commented-out calls to
	# domsimi.stm and domsimi.nstm on the same two roots here.

	# Parenthesised single-argument form: prints the same value under
	# Python 2 and is a valid function call under Python 3.
	print(domsimi.compute_simi(doc1.documentElement, doc2.documentElement))
def get_list_candidate_nodes(doc, simi_threshold=0.8, min_items=5):
	"""Find the list candidate nodes in a web page DOM tree.

	Every node yielded by ``minidom.postorder_dfs_walk_iterator`` (started
	at ``doc.documentElement``) is examined. Its element children, as
	yielded by ``minidom.element_child_iterator``, are collected into a run
	of list-item candidates: the first child is always accepted, and each
	later child is accepted only if ``domsimi.compute_simi`` between the
	most recently accepted child and this child is strictly greater than
	``simi_threshold``.

	Args:
		doc: Parsed document exposing a ``documentElement`` attribute.
		simi_threshold: Score a child must exceed (strictly) to join the
			run of list-item candidates. Defaults to 0.8.
		min_items: Minimum number of accepted children for the parent to
			be reported as a list candidate. Defaults to 5.

	Returns:
		A list of dicts, one per qualifying node, in visit order. Each
		dict has the keys ``"list"`` (the parent node) and ``"items"``
		(the accepted child nodes).
	"""
	list_candidate_nodes = []

	# Walk the DOM in the order produced by the project's DFS iterator.
	for node in minidom.postorder_dfs_walk_iterator(doc.documentElement):
		items = []

		for child in minidom.element_child_iterator(node):
			# The first child seeds the run. Later children are compared
			# with the last *accepted* item, which is not necessarily the
			# immediately preceding sibling if that sibling was rejected.
			if not items or domsimi.compute_simi(items[-1], child) > simi_threshold:
				items.append(child)

		# Enough mutually similar children: report the parent as a list.
		if len(items) >= min_items:
			list_candidate_nodes.append({"list": node, "items": items})

	return list_candidate_nodes