def main(argv) :
	press_name = argv[1]
	start_page_index = int(argv[2])
	end_page_index = int(argv[3])
	print press_name
	
	press = press_dict[press_name]
	
	con = db.connect_raw()

	for i in range(start_page_index, end_page_index+1) :
		# get 10-20 url and insert
		url_list = press.get_article_urls_with_pagenum(i)

		print "page: " + str(i)

		for url in url_list :
			print url
			try :
				article = press.parse_article_with_url(url)
			except :
				print 'retry parsing!'
				article = press.parse_article_with_url(url)
			
			query = db.make_insert_query("article", article)
			result = db.do_insert(con, query)
			time.sleep(1.5)
		time.sleep(5)
	con.close()
# --- Example #2 (snippet-site scraper artifact; kept as a comment) ---
def __extract_author(con, email) :
	"""
	Mine the `article` table for the author owning *email* and build an
	author record: {name, email, press_id, added_date}.

	Returns the dict, or None when no article mentions the email or when
	no candidate name occurs often enough (>= 3 times) to be trusted.

	NOTE(review): the *con* parameter is unused -- a fresh raw connection
	is opened instead; kept in the signature for caller compatibility.
	"""
	author = {}

	con_r = db.connect_raw()
	try :
		# NOTE(review): query built by string concatenation is SQL-injectable
		# if *email* is untrusted; switch to a parameterized query if the
		# db helper supports placeholders.
		query = "SELECT URL, author_info FROM article WHERE author_info like \'%" + email + "%\'"
		result = db.do_select(con_r, query)
	finally :
		# Always release the connection (was never closed).
		con_r.close()

	if not result :
		# No article mentions this email (original code raised IndexError here).
		return None

	# e.g. "http://host/path" -> "host"; used below to resolve the press id.
	url = result[0][0].split('/')[2]

	# SUM ALL POSSIBLE NAME: tally candidate names across all matching articles.
	possible_words = {}
	for row in result :
		author_info = row[1]
		possible_words_in_article = __extract_name(author_info)
		for k, v in possible_words_in_article.items() :
			possible_words[k] = possible_words.get(k, 0) + v

	# Pick the most frequent non-empty candidate.
	max_value = 0
	name = ""
	for key, value in possible_words.items() :
		if (value > max_value) :
			if key == u'' :
				continue
			name = key
			max_value = value

	# Require at least 3 occurrences before trusting the extracted name.
	if max_value < 3 :
		return None

	author['name'] = name.encode('utf-8')
	author['email'] = email.encode('utf-8')
	author['press_id'] = __get_press_id_from(url)
	author['added_date'] = str(__get_today()).encode('utf-8')
	
	return author
# --- Example #3 (snippet-site scraper artifact; kept as a comment) ---
def _get_raw_data() :
	"""Fetch every row of the `article` table; returns db.do_select's result set."""
	con_r = db.connect_raw()
	try :
		return db.do_select(con_r, 'SELECT * FROM article')
	finally :
		# Close the connection even if the select fails (was leaked on error).
		con_r.close()