def main(argv):
    """Crawl one press's article listing pages and insert every article.

    argv[1] is the key looked up in ``press_dict``; argv[2] and argv[3] are
    the first and last listing page numbers (both inclusive, parsed as int).

    For every page the article URLs are fetched, each URL is parsed (with a
    single retry on failure) and the parsed article is inserted into the
    ``article`` table. A second parse failure propagates to the caller.
    """
    press_name = argv[1]
    start_page_index = int(argv[2])
    end_page_index = int(argv[3])
    print(press_name)

    press = press_dict[press_name]
    con = db.connect_raw()
    try:
        for page_num in range(start_page_index, end_page_index + 1):
            # Original note: each listing page yields roughly 10-20 URLs.
            url_list = press.get_article_urls_with_pagenum(page_num)
            print("page: " + str(page_num))
            for url in url_list:
                print(url)
                try:
                    article = press.parse_article_with_url(url)
                except Exception:
                    # Retry exactly once. Catching Exception (not a bare
                    # except) lets Ctrl-C / SystemExit stop the crawl
                    # instead of being turned into a retry.
                    print('retry parsing!')
                    article = press.parse_article_with_url(url)
                query = db.make_insert_query("article", article)
                db.do_insert(con, query)
                # Pause between articles.
                time.sleep(1.5)
            # Longer pause between listing pages.
            time.sleep(5)
    finally:
        # Release the connection even when parsing or inserting fails.
        con.close()
def main():
    """Populate the article-related tables from the rows of _get_raw_data().

    For every row this inserts:
      * one ``article`` record built by ``_make_article_info``;
      * one ``article_hooking_keyword`` record per entry returned by
        ``_extract_hook_word`` for the row's raw content (``row[2]``);
      * an ``author`` record for each author parsed from ``row[5]`` that
        ``_is_author_exits`` does not report as already present;
      * one ``article_author`` record linking the article to each author.
    """
    result = _get_raw_data()
    con = db.connect_dev()
    try:
        for row in result:
            # article table
            article = _make_article_info(con, row)
            query = db.make_insert_query('article', article)
            db.do_insert(con, query)

            # hooking keyword table
            raw_content = row[2]
            hook_words = _extract_hook_word(raw_content.encode('utf-8'))
            for word, count in hook_words.items():
                words_in_article = {
                    'article_URL': article['URL'],
                    'hooking_keyword_id': word,
                    'count': count,
                }
                query = db.make_insert_query('article_hooking_keyword',
                                             words_in_article)
                db.do_insert(con, query)

            # author table
            expected_author_string = row[5]
            author_list = _make_author_list(con, expected_author_string)
            for author in author_list:
                if not _is_author_exits(author['email']):
                    query = db.make_insert_query('author', author)
                    db.do_insert(con, query)

                # Get author_id
                # NOTE(review): the email is concatenated into the SQL text
                # without escaping; a quote in the value breaks the query.
                # Switch to a parameterized query if db.do_select allows it.
                query = "SELECT id FROM author WHERE email=\'" + author['email'] + "\'"
                author_id = db.do_select(con, query)
                author_id = author_id[0][0]
                print("%s %s %s %s" % (author_id, author['name'],
                                       author['email'], author['press_id']))

                # article_author table
                article_author = {'article_URL': article['URL'],
                                  'author_id': author_id}
                query = db.make_insert_query('article_author', article_author)
                db.do_insert(con, query)
    finally:
        # Release the connection even when an insert or lookup fails.
        # NOTE(review): assumes the object returned by db.connect_dev()
        # exposes close() -- confirm against the db module.
        con.close()