import re
import xml.etree.ElementTree as ET

# mail_struct, sentence_struct, thread_struct and stop_word_remove are
# assumed to be defined elsewhere in this module.

def parse_file(file):
    tree = ET.parse(file)
    root = tree.getroot()
    mails = []
    space = re.compile(r' ', re.S)
    sp = re.compile(r'\-+', re.S)
    gt = re.compile(r'&gt', re.S)
    for i in root:
        tmp = mail_struct(len(mails))
        for j in i:
            if j.tag == 'name':
                # Store the name with underscores and keep a stop-word-free copy of the title.
                tmp.name = space.sub('_', j.text)
                tmp.remove_stop_ver_title = stop_word_remove(j.text)
            elif j.tag == 'DOC':
                thread_num = -1
                for k in j:
                    if k.tag == 'Subject':
                        tmp.subject.append(k.text)
                    if k.tag == 'Text':
                        for l in k:
                            res = gt.search(l.text)
                            '''
                            # Previously skipped quoted lines ("&gt", ">", "__") and URLs:
                            if res is not None or l.text[0] == '>' or (l.text[0] == '_' and l.text[1] == '_'):
                                continue
                            if l.text[:4] == 'http':
                                continue
                            '''
                            # Flatten newlines, strip runs of dashes, skip whitespace-only sentences.
                            tmp_text = l.text.replace('\n', ' ')
                            tmp_text = sp.sub('', tmp_text)
                            if tmp_text.isspace():
                                continue
                            t_s = sentence_struct(tmp_text, l.attrib['id'])
                            tmp.text.append(t_s)

                            # Sentence ids look like "<thread>.<sentence>"; open a new
                            # thread_struct whenever the thread part changes.
                            if t_s.index.split('.')[0] != thread_num:
                                thread_num = t_s.index.split('.')[0]
                                tmp.thread.append(thread_struct(int(thread_num)))
                            tmp.thread[-1].sentences.append(t_s)
        mails.append(tmp)
    # Drop very short tokens and HTML-entity leftovers from each sentence's
    # stop-word-filtered version.
    for i in mails:
        for j in i.text:
            for k in range(len(j.remove_stop_ver)):
                if len(j.remove_stop_ver[k]) < 2 or j.remove_stop_ver[k] in ('gt', 'gtgt'):
                    j.remove_stop_ver[k] = ''
            j.remove_stop_ver = list(filter(None, j.remove_stop_ver))
    return mails
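
A minimal usage sketch (the XML path and the printed fields are illustrative assumptions; parse_file only expects the file to follow the <name>/<DOC>/<Text> layout handled above):

mails = parse_file('corpus.xml')  # hypothetical input file
for mail in mails:
    # mail_struct is expected to expose name, text and thread as filled in above
    print(mail.name, len(mail.text), len(mail.thread))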
Example #2
# Uses re, math and numpy, plus crawl_article, construct_2sen_vec,
# construct_sentence_vec, mail_struct, stop_word_remove, cal_similarity and
# lexrank defined elsewhere in this module.

def process_document(title, sen2=1, exist=1):
    article, art_title = crawl_article(title)
    if sen2 == 1:
        input_name = title + '_sen2_test.txt'
        feature_name = title + '_sen2_features.txt'
        output_name = title + '_sen2_output.txt'
        vec = construct_2sen_vec(article)
    else:
        input_name = title + '_test.txt'
        feature_name = title + '_features.txt'
        output_name = title + '_output.txt'
        vec = construct_sentence_vec(article)
    '''
    # Alternative SVM-based similarity pipeline (disabled):
    if exist == 0:
        construct_input_file(vec, input_name, art_title)
        os.system('python takelab_sts/takelab_simple_features.py ' + input_name + ' > ' + feature_name)
        os.system('svm-predict ' + feature_name + ' model.txt ' + output_name)
        os.system('python postprocess_scores.py ' + input_name + ' ' + output_name)
    matrix, title_sim = construct_similarity_matrix(output_name, len(vec))
    '''
    # Wrap the article in a mail_struct so cal_similarity can score it.
    tmp = mail_struct()
    tmp.name = title
    tmp.text = vec
    bot = re.compile(r'_', re.S)
    tmp.remove_stop_ver_title = stop_word_remove(bot.sub(' ', title))
    matrix, title_sim = cal_similarity(tmp)
    important = lexrank(matrix, len(vec), 0.001)
    # Replace NaN scores, then min-max normalise both score vectors.
    for k in range(len(important)):
        if math.isnan(important[k]):
            important[k] = 0
    important = (important - min(important)) / (max(important) - min(important))
    title_sim = (title_sim - min(title_sim)) / (max(title_sim) - min(title_sim))
    sort_index = numpy.argsort(important)[::-1]  # descending importance order (not returned)
    return vec, important, title_sim
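
A minimal usage sketch, assuming the helper functions above (crawl_article, cal_similarity, lexrank, ...) are available in the same module; the article title and the way vec elements are printed are illustrative assumptions:

import numpy

vec, important, title_sim = process_document('Information_retrieval', sen2=1)
# Rank sentences by normalised LexRank score, highest first
# (assumes vec[i] corresponds to score important[i]).
ranking = numpy.argsort(important)[::-1]
for idx in ranking[:5]:
    print(important[idx], vec[idx])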