def main(argv):
	"""Crawl a range of listing pages for one press and store every article.

	Args:
		argv: argument list; argv[1] is the key looked up in ``press_dict``,
			argv[2] and argv[3] are the first and last page numbers
			(both inclusive) and are converted with ``int``.

	Every URL returned for a page is parsed and inserted into the "article"
	table through the connection from ``db.connect_raw()``. The connection is
	closed even when crawling stops with an error.

	Raises:
		IndexError / ValueError when arguments are missing or not integers;
		anything raised by the second parse attempt or by the db helpers.
	"""
	press_name = argv[1]
	start_page_index = int(argv[2])
	end_page_index = int(argv[3])
	print(press_name)

	press = press_dict[press_name]

	con = db.connect_raw()
	try:
		for i in range(start_page_index, end_page_index + 1):
			# each listing page yields a batch of article URLs
			# (the original note says roughly 10-20 per page)
			url_list = press.get_article_urls_with_pagenum(i)

			print("page: " + str(i))

			for url in url_list:
				print(url)
				try:
					article = press.parse_article_with_url(url)
				except Exception:
					# Retry exactly once. Catching Exception (not a bare
					# except) lets Ctrl-C / SystemExit stop the crawl instead
					# of triggering another fetch.
					print('retry parsing!')
					article = press.parse_article_with_url(url)

				query = db.make_insert_query("article", article)
				db.do_insert(con, query)
				# pause between articles
				time.sleep(1.5)
			# longer pause between pages
			time.sleep(5)
	finally:
		con.close()
示例#2
0
def _is_author_exits(email):
	"""Return True when the author table (dev DB) has a row with this email.

	Opens its own connection via ``db.connect_dev()`` and closes it before
	returning, including when the select raises.
	"""
	# NOTE(review): email is spliced into the SQL text unescaped. If the db
	# helper module offers a parameterized select, switch to it.
	query = 'SELECT id FROM author WHERE email=\'' + email + '\''

	con_d = db.connect_dev()
	try:
		result = db.do_select(con_d, query)
	finally:
		con_d.close()

	return len(result) != 0
示例#3
0
def __get_hook_word_dict():
	"""Load the hooking_keyword table into a dict.

	Returns:
		dict mapping each word, encoded as UTF-8, to the first column of its
		row. Every row is unpacked as a 2-tuple ``(id, word)``, so the table
		is expected to have exactly those two columns.

	The dev connection is closed as soon as the rows are fetched, including
	when the select raises.
	"""
	con = db.connect_dev()
	try:
		result = db.do_select(con, "SELECT * FROM hooking_keyword")
	finally:
		con.close()

	keyword_dict = {}
	# keyword_id rather than id: avoids shadowing the builtin
	for (keyword_id, word) in result:
		keyword_dict[word.encode('utf-8')] = keyword_id
	return keyword_dict
示例#4
0
def __get_section_id(con, section_name):
	"""Resolve a section name to a section id.

	The Hangul words in ``section_name`` are tried from the last one back to
	the first; the first word whose ``LIKE '%word%'`` lookup in the section
	table returns rows decides the result.

	Args:
		con: connection handed to ``db.do_select``.
		section_name: text containing zero or more Hangul words.

	Returns:
		The id in the first matching row, or 0 when the name has no Hangul
		word, consists only of the generic word, or nothing matches.
	"""
	section_keywords = re.findall(u"[가-힣]+", section_name)

	# Walk from the last keyword toward the first (the original comments call
	# this moving up to the parent keyword). Iterating with reversed() stops
	# cleanly at the front of the list; the previous index arithmetic skipped
	# the bounds check after a generic keyword and wrapped around to negative
	# indices, ending in an IndexError for names made only of that word.
	for keyword in reversed(section_keywords):
		# the generic word ("general") cannot classify anything, skip it
		if keyword == u'일반':
			continue

		# keyword lookup; the regex above restricts keyword to Hangul
		# syllables, so it cannot contain quote characters
		query = "SELECT id FROM section WHERE name LIKE \'%" + keyword + "%\'"
		result = db.do_select(con, query)

		# found a matching section: return its id
		if len(result) > 0:
			return result[0][0]

	# no keyword at all, or none of them matched
	return 0
0
def __get_if_email_exits(con, email):
	"""Fetch the author stored under ``email`` as a dict, or None if absent.

	The first five columns of the selected row are exposed under the keys
	id, name, email, press_id and added_date, in that order. When several
	rows match, the values of the last row win.
	"""
	rows = db.do_select(con, 'SELECT * from author WHERE email = \'' + email + '\'')
	if len(rows) == 0:
		return None

	columns = ('id', 'name', 'email', 'press_id', 'added_date')
	author = {}
	for row in rows:
		# each later row overwrites what the previous one stored
		for position, column in enumerate(columns):
			author[column] = row[position]
	return author
示例#6
0
def __extract_author(con, email):
	"""
	Build an author record for ``email`` from the raw article table.

	Every raw article whose author_info contains the email contributes name
	candidates (via ``__extract_name``); the scores per candidate are summed
	over all articles and the highest-scoring non-empty candidate becomes
	the name.

	Args:
		con: not used by this function; kept so existing callers still work.
		email: e-mail address to search for (unicode; it is UTF-8 encoded
			for the returned record).

	Returns:
		dict with the keys name, email, press_id, added_date (no id), or
		None when no article mentions the email or the best candidate's
		total score is below 3.
	"""
	# NOTE(review): email is spliced into the SQL text unescaped. If the db
	# helper module offers a parameterized select, switch to it.
	query = "SELECT URL, author_info FROM article WHERE author_info like \'%" + email + "%\'"

	con_r = db.connect_raw()
	try:
		result = db.do_select(con_r, query)
	finally:
		# the raw connection is only needed for this one select
		con_r.close()

	# nothing to derive a name or press from
	if len(result) == 0:
		return None

	# third '/'-separated piece of the first URL; for a URL shaped like
	# scheme://host/path that is the host part
	url = result[0][0].split('/')[2]

	# SUM ALL POSSIBLE NAME
	possible_words = {}
	for row in result:
		author_info = row[1]
		possible_words_in_article = __extract_name(author_info)
		for k, v in possible_words_in_article.items():
			possible_words[k] = possible_words.get(k, 0) + v

	# pick the candidate with the strictly highest total, ignoring the
	# empty string
	max_value = 0
	name = ""
	for key, value in possible_words.items():
		if value > max_value and key != u'':
			name = key
			max_value = value

	# too little evidence to trust the name
	if max_value < 3:
		return None

	author = {}
	author['name'] = name.encode('utf-8')
	author['email'] = email.encode('utf-8')
	author['press_id'] = __get_press_id_from(url)
	author['added_date'] = str(__get_today()).encode('utf-8')

	return author
示例#7
0
from tkinter import *
from tkinter import ttk
import datetime
import time
import DB_connector
from tkinter import messagebox

# Result of DB_connector.db_connect(), created as soon as this module runs.
db = DB_connector.db_connect()
# NOTE(review): -1 looks like a "no record yet" sentinel for an id — confirm
# against the code that assigns ID.
ID = -1

# Main window: titled "Pro_Timer", 930x240 pixels, resizing disabled on both
# axes.
root = Tk()
root.title("Pro_Timer")
root.geometry("930x240")
root.resizable(0, 0)
# Font sizes for tree-view headings (15) and for the "mystyle.Treeview"
# style (12); the font family is left at the default (None).
style = ttk.Style()
style.configure("Treeview.Heading", font=(None, 15))
style.configure("mystyle.Treeview", font=(None, 12))

# Frame 1: packed against the right side of the window; parent of the
# IN / OUT buttons below.
frm1 = ttk.Frame(root)
frm1.pack(padx=15, pady=15, side=RIGHT)
frm1.config(width=200, height=40, relief=RIDGE)
# Frame 2: packed against the bottom of the window.
frm2 = ttk.Frame(root)
frm2.pack(pady=15, side=BOTTOM)
frm2.config(width=200, height=50, relief=RIDGE)

# "IN" button, placed in frame 1's grid and stretched to all four sides of
# its cell.
But_in = ttk.Button(frm1, text="IN")
But_in.grid(row=1, column=0, pady=15, padx=15, sticky='snew')

# "OUT" button; only created here — no geometry call for it appears in this
# excerpt.
But_out = ttk.Button(frm1, text="OUT")
示例#8
0
def _get_raw_data():
	"""Return every row of the article table from the raw DB.

	Opens its own connection via ``db.connect_raw()`` and closes it before
	returning, including when the select raises.
	"""
	con_r = db.connect_raw()
	try:
		return db.do_select(con_r, 'SELECT * FROM article')
	finally:
		con_r.close()
示例#9
0
def __get_press_id_from(url):
	"""Return the id of the press whose ``domain`` column equals ``url``.

	Args:
		url: value compared against press.domain. Despite the parameter
			name, the comparison is against the domain column, so a bare
			host name is presumably expected — verify against callers.

	Raises:
		IndexError: when no press row matches.

	Opens its own connection via ``db.connect_dev()`` and closes it before
	returning, including when the select raises.
	"""
	# NOTE(review): url is spliced into the SQL text unescaped. If the db
	# helper module offers a parameterized select, switch to it.
	query = 'SELECT id FROM press WHERE domain=\'' + url + '\''

	con_d = db.connect_dev()
	try:
		result = db.do_select(con_d, query)
	finally:
		con_d.close()
	return result[0][0]
示例#10
0
def main() :
	result = _get_raw_data()

	con = db.connect_dev()

	for row in result :
		# article table
		article = _make_article_info(con, row)
		query = db.make_insert_query('article', article)
		db.do_insert(con, query)

		# hooking keyword table
		raw_content = row[2]
		hook_words = _extract_hook_word(raw_content.encode('utf-8'))
		for word in hook_words.keys() :
			words_in_article = {'article_URL': article['URL'], 'hooking_keyword_id': word, 'count': hook_words[word]}
			query = db.make_insert_query('article_hooking_keyword', words_in_article)
			db.do_insert(con, query)
		
		# author table
		expected_author_string = row[5]
		author_list = _make_author_list(con, expected_author_string)

		for author in author_list :
			if not _is_author_exits(author['email']) :
				query = db.make_insert_query('author', author)
				db.do_insert(con, query)
			# Get author_id
			query = "SELECT id FROM author WHERE email=\'" + author['email'] + "\'"
			author_id = db.do_select(con, query)
			author_id = author_id[0][0]
			print author_id, author['name'], author['email'], author['press_id']

		# article_author table
			article_author = {'article_URL': article['URL'], 'author_id': author_id}
			query = db.make_insert_query('article_author', article_author)
			db.do_insert(con, query)