def main(argv):
	"""Crawl a range of listing pages for one press and insert each article.

	Args:
		argv: Argument list in ``sys.argv`` layout: ``argv[1]`` is the key
			used to look the press up in ``press_dict``, ``argv[2]`` is the
			first page number and ``argv[3]`` the last page number
			(inclusive) to crawl.

	Raises:
		IndexError: if fewer than three arguments follow ``argv[0]``.
		ValueError: if a page number is not an integer literal.
		Exception: whatever ``parse_article_with_url`` raises when the single
			retry fails too; the connection is still closed in that case.
	"""
	press_name = argv[1]
	first_page = int(argv[2])
	last_page = int(argv[3])
	# Single-argument print(...) behaves the same under the Python 2 print
	# statement used by this file and under the Python 3 print function.
	print(press_name)

	press = press_dict[press_name]

	con = db.connect_raw()
	try:
		for page in range(first_page, last_page + 1):
			# Each listing page yields a batch of article URLs (the original
			# note says roughly 10-20) which are parsed and inserted.
			article_urls = press.get_article_urls_with_pagenum(page)

			print("page: " + str(page))

			for url in article_urls:
				print(url)
				try:
					article = press.parse_article_with_url(url)
				except Exception:
					# Retry exactly once. Catching Exception (not a bare
					# except) lets Ctrl-C / SystemExit stop the crawl instead
					# of being mistaken for a parse failure.
					print('retry parsing!')
					article = press.parse_article_with_url(url)

				query = db.make_insert_query("article", article)
				db.do_insert(con, query)
				# Pause between articles -- presumably to throttle requests.
				time.sleep(1.5)
			# Longer pause between listing pages.
			time.sleep(5)
	finally:
		# Release the connection even when parsing or inserting fails.
		con.close()
# Example #2
# 0
def main() :
	result = _get_raw_data()

	con = db.connect_dev()

	for row in result :
		# article table
		article = _make_article_info(con, row)
		query = db.make_insert_query('article', article)
		db.do_insert(con, query)

		# hooking keyword table
		raw_content = row[2]
		hook_words = _extract_hook_word(raw_content.encode('utf-8'))
		for word in hook_words.keys() :
			words_in_article = {'article_URL': article['URL'], 'hooking_keyword_id': word, 'count': hook_words[word]}
			query = db.make_insert_query('article_hooking_keyword', words_in_article)
			db.do_insert(con, query)
		
		# author table
		expected_author_string = row[5]
		author_list = _make_author_list(con, expected_author_string)

		for author in author_list :
			if not _is_author_exits(author['email']) :
				query = db.make_insert_query('author', author)
				db.do_insert(con, query)
			# Get author_id
			query = "SELECT id FROM author WHERE email=\'" + author['email'] + "\'"
			author_id = db.do_select(con, query)
			author_id = author_id[0][0]
			print author_id, author['name'], author['email'], author['press_id']

		# article_author table
			article_author = {'article_URL': article['URL'], 'author_id': author_id}
			query = db.make_insert_query('article_author', article_author)
			db.do_insert(con, query)