Example No. 1
import re

def get_page_total_detail(url):
	# Collect the text of every <p> on the page, HTML-escape it, and join the pieces with an escaped <br>.
	my_log('now to get page detail....'+url)
	selector = get_clear_html_selector(url)
	detail = selector.xpath('//body//p')
	ret = ''
	for item in detail:
		tmp = item.xpath('string(.)').replace("&", "&amp;").replace('"', "&quot;").replace("<", "&lt;").replace(">", "&gt;")
		#tmp = item.xpath('string(.)').replace('\r','').replace('\n','').replace('\t','').replace(' ','').replace("&", "&amp;").replace('"', "&quot;").replace("<", "&lt;").replace(">", "&gt;")
		# Skip navigation lines that contain '返回' ("back").
		if tmp and not tmp.startswith(u'返回') and not re.match('.*返回.*', tmp):
			ret += '&lt;br&gt;' + tmp
	return ret
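
The helper get_clear_html_selector is used throughout these examples but never shown; below is a minimal sketch of what it might look like, assuming requests and lxml. The scheme prefix and the encoding handling are assumptions, not taken from the original project.

import requests
from lxml import etree

def get_clear_html_selector(url):
	# Hypothetical sketch; the project's real helper may differ.
	# Fetch the page (urls are stored without a scheme, per the note in Example No. 5)
	# and return an lxml element that supports .xpath() like the selectors above.
	resp = requests.get('http://' + url)
	resp.encoding = resp.apparent_encoding   # the pages are Chinese; let requests pick the codec
	return etree.HTML(resp.text)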
Example No. 2
import MySQLdb

def execute_sql(sql, db=''):
	# Run one statement; commit on success, roll back and log on failure.
	need_close = 0
	# Open a private connection when the caller did not pass one in.
	if not db:
		db = MySQLdb.connect(host='appdb',user='******',passwd='secret',db='homestead',charset="utf8")
		need_close = 1
	cursor = db.cursor()
	try:
		cursor.execute(sql)
		db.commit()
	except Exception as e:
		my_log('failed to execute:'+sql+' and error:'+str(e))
		db.rollback()
	finally:
		cursor.close()
	if need_close:
		db.close()
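
A minimal usage sketch for execute_sql: with no connection argument it opens and closes its own; for a batch, a shared MySQLdb connection avoids reconnecting per statement. The test table mirrors the commented-out insert in Example No. 5 and is otherwise an assumption.

# One-off statement: the function opens and closes its own connection.
execute_sql("insert into test(test_string) values('hello')")

# Batch: reuse one connection (credentials redacted as in the examples).
db = MySQLdb.connect(host='appdb', user='******', passwd='secret', db='homestead', charset='utf8')
execute_sql("insert into test(test_string) values('a')", db)
execute_sql("insert into test(test_string) values('b')", db)
db.close()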
Example No. 3
import re

def get_bible_translation_detail(url):
	my_log('now to get bible detail....'+url)
	# Get the verse detail for one page.
	# Return struct: [{'chapter':1,'section':23,'version':'...','content':'...'},...]
	selector = get_clear_html_selector(url)
	detail = selector.xpath('//body//p')
	last_node = selector.xpath('//body/span')
	# The final verse sits in a <span> rather than a <p>; flatten it to one line of text.
	last_line = "".join(node.xpath('string(.)').replace('\r','').replace('\n','').replace('\t','') for node in last_node)
	cur_chapter = 0
	cur_section = 0
	ret = []
	for item in detail:
		info = item.xpath('string(.)').replace('\xa0','').replace('\r','').replace('\n','').replace('\t','').replace('\u3000','').replace(' ','')
		if not info:
			continue
		if info.startswith(u'返回') or info.endswith(u'简介'):
			continue
		if not info.startswith(u'('):
			# Lines that do not start with '(' are chapter/section headers; try each known format.
			is_new_section_1 = re.match(r'^(\d+)\S(\d+)\S$', info)
			if is_new_section_1:  # header format 1: chapter, separator, section
				cur_chapter = is_new_section_1.group(1)
				cur_section = is_new_section_1.group(2)
			else:
				is_new_section_2 = re.match(r'^\S+(\d+):(\d+)\S$', info)  # format 2: leading text, then chapter:section
				if is_new_section_2:
					cur_chapter = is_new_section_2.group(1)
					cur_section = is_new_section_2.group(2)
				else:
					is_new_section_3 = re.match(r'^(\d+)\S(\d+)\S\S+$', info)
					if is_new_section_3:
						# format 3: header and verse text on one line; split it with the helper
						chapters = get_chapters_from_type_3(info)
						for eachchapter in chapters:
							ret.append(eachchapter)
			continue
		# Verse lines look like "(version)content"; split at the closing parenthesis.
		cur_version = info[1:info.find(')')]
		cur_content = info[info.find(')')+1:]
		cur_info = {'chapter':cur_chapter,'section':cur_section,'version':cur_version,'content':cur_content}
		ret.append(cur_info)
	# The trailing <span> verse follows the same "(version)content" shape.
	last_version = last_line[1:last_line.find(')')]
	last_content = last_line[last_line.find(')')+1:]
	ret.append({'chapter':cur_chapter,'section':cur_section,'version':last_version,'content':last_content})
	return ret
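
A hedged sketch of how the returned verse list could be persisted with execute_sql from Example No. 2; the bible table, its columns, and the url are assumptions, not part of the original code.

# Hypothetical consumer of get_bible_translation_detail; table/column names are assumed.
verses = get_bible_translation_detail('example.com/bible/gen01.htm')   # placeholder url, no scheme
for v in verses:
	execute_sql("insert into bible(chapter,section,version,content) values('%s','%s','%s','%s')"
		% (v['chapter'], v['section'], v['version'], v['content']))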
Example No. 4
import MySQLdb

def get_info(sql, info, db=''):
	# Run a query and return the first column of its first row, or -1 on no match / error.
	# NOTE: the 'info' argument is never used inside the function body.
	need_close = 0
	if not db:
		db = MySQLdb.connect(host='appdb',user='******',passwd='secret',db='homestead',charset="utf8")
		need_close = 1
	ret = -1
	cursor = db.cursor()
	try:
		cursor.execute(sql)
		ret = cursor.fetchone()
		if ret:
			ret = ret[0]
		else:
			ret = -1
		db.commit()
	except Exception as e:
		my_log('failed to get info:'+sql+' and error:'+str(e))
		db.rollback()
	finally:
		cursor.close()
	if need_close:
		db.close()
	return ret
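
A hedged one-line usage of get_info with a placeholder query; Examples No. 6 and No. 7 below wrap this same call for specific tables.

# Hypothetical query; returns the first column of the first row, or -1.
chapter_count = get_info('select count(*) from chapter', None)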
Example No. 5
from urllib import parse

def get_list_to_chapter(url):
	my_log('get list_to chapter:'+url)
	# Builds a description of every list on the page (urls carry no scheme).
	# The returned value is a list of dicts of the form:
	# [
	#	{
	#	'list_name': 'A、《圣经译本》',
	#	'list_content': [
	#		{'chapter_name': '第一章', 'chapter_url': 'abcdefff.htm'},
	#		{'chapter_name': '第二章', 'chapter_url': 'abcdefff.htm'},
	#		...
	#		],
	#	},
	#	{
	#	'list_name': 'B、《丁达尔解经》',
	#	'list_content': [
	#		{'chapter_name': '第一章', 'chapter_url': 'abcdefff.htm'},
	#		{'chapter_name': '第二章', 'chapter_url': 'abcdefff.htm'},
	#		...
	#		],
	#	},
	#	...
	# ]
	tmp_lists_content = []
	selector = get_clear_html_selector(url)
	chapter_name_list = selector.xpath('//div[@align="center"]//table')
	
	for each in chapter_name_list:
		# 'each' is one table of chapter links; collect each chapter's name and url to build
		# [{'chapter_name':'第一章','chapter_url':'abcdef.htm'},{'chapter_name':'第二章','chapter_url':'ahcsdf.htm'},...]
		cur_list = []
		lists = each.xpath('.//p//a')
		if lists:
			for each2 in lists:
				# hrefs are relative: prefix them with the directory part of the current url
				# (path only, leading '/' dropped).
				parent_path = parse.urlparse(url)[2]
				add_path = parent_path[1:parent_path.rfind('/')+1]
				cur_chapter_url = add_path + each2.xpath('@href')[0]
				cur_chapter_name = each2.xpath('string(.)').replace('\r','').replace('\n','').replace('\t','')
				cur_list.append({'chapter_name':cur_chapter_name,'chapter_url':cur_chapter_url})
		if cur_list:
			tmp_lists_content.append(cur_list)
	tmp_lists_name = []
	list_name = selector.xpath('//div[not(@align or @class)]//p')
	#list_name = selector.xpath('//div[@style="border-style: groove"]//p')
	for each in list_name:
		cur_name = each.xpath('string(.)').replace('\r','').replace('\n','').replace('\t','')
		if cur_name:
			# Skip the table-of-contents heading itself.
			if cur_name.endswith(u'目录'):
				continue
			tmp_lists_name.append(cur_name)
			#decoded_str = bytes.decode(category)
			#execute_sql('insert into test(test_string) values('+decoded_str+')')
			#ret.append({'category':category})
	# Pair each list name with its chapter list; truncate to the shorter of the two.
	min_length = min(len(tmp_lists_name), len(tmp_lists_content))
	ret = []
	for i in range(min_length):
		cur_list = {'list_name':tmp_lists_name[i],'list_content':tmp_lists_content[i]}
		ret.append(cur_list)
	return ret
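
A short hedged walk over the structure returned by get_list_to_chapter; the url is a placeholder, not taken from the original project.

# Hypothetical: log every list and chapter found on an index page.
lists = get_list_to_chapter('example.com/bible/index.htm')   # placeholder url, no scheme
for lst in lists:
	my_log('list: ' + lst['list_name'])
	for ch in lst['list_content']:
		my_log('  chapter %s -> %s' % (ch['chapter_name'], ch['chapter_url']))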
Example No. 6
def get_list_id(list_name,db):
	my_log('get list_id:'+list_name)
	# Look up the numeric id for a list by its name; the second argument is ignored by get_info.
	sql = 'select id from list where name="'+list_name+'"'
	return get_info(sql, id, db)
Example No. 7
def get_volume_id(volume_name,db):
	my_log('get volume_id:'+volume_name)
	# Same pattern as get_list_id, but against the volume table.
	sql = 'select id from volume where name="'+volume_name+'"'
	return get_info(sql, id, db)
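
A hedged sketch that reuses one connection for several id lookups; the list name comes from the structure comment in Example No. 5, and the volume name is a placeholder.

# Hypothetical lookups over a shared connection (credentials redacted as above).
db = MySQLdb.connect(host='appdb', user='******', passwd='secret', db='homestead', charset='utf8')
list_id = get_list_id(u'A、《圣经译本》', db)
volume_id = get_volume_id(u'some volume name', db)
db.close()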