def insert2(user_id, body, device_type, device_no, local_id, created_date, last_update):
    """Insert a row for ``user_id`` unless a recent row already has the same body.

    The 80 rows with the highest ``pk_id`` for this user are inspected; if
    one of them carries exactly the same ``body``, its ``pk_id`` is returned
    and nothing is written.  Otherwise a new row is inserted.

    Args:
        user_id: Value matched against (and stored in) the ``user_id`` column.
        body: Text used for the duplicate check and stored in ``body``.
        device_type: Stored in ``device_type``.
        device_no: Stored in ``device_no``.
        local_id: Stored in ``local_id``.
        created_date: Stored in ``app_created_date``.
        last_update: Stored in ``last_update``.

    Returns:
        The ``pk_id`` of the matching existing row, or whatever
        ``dbw.insert`` returns for the new row (presumably the new primary
        key -- confirm against the web.py version in use).
    """
    recent_rows = list(dbw.select(
        table_name,
        what="pk_id,body",
        where="user_id=$user_id",
        # Pass only the value the query needs instead of every local name.
        vars={"user_id": user_id},
        order="pk_id desc",
        limit=80,
    ))
    for row in recent_rows:
        # Plain equality works on both Python 2 and Python 3.
        if row.body == body:
            return row.pk_id

    # ``subject`` is stored empty; ``created_date`` and ``plan_start_date``
    # are filled in by the database through the SQL literal ``now()``.
    return dbw.insert(
        table_name,
        user_id=user_id,
        subject="",
        body=body,
        device_no=device_no,
        local_id=local_id,
        device_type=device_type,
        app_created_date=created_date,
        last_update=last_update,
        created_date=web.SQLLiteral('now()'),
        plan_start_date=web.SQLLiteral('now()'),
    )
def compute_tf_idf():
    """Recompute count_domain, idf_domain, tf and tf_idf directly in SQL.

    Doing the whole computation inside the database is fine for very small
    data sets; for large data a Hadoop script would have to be written
    instead.

    NOTE(review): a second ``compute_tf_idf`` definition issuing the same
    statements follows this one in the file; the later definition is the one
    the name ends up bound to.
    """
    # terms.count_domain <- number of term_doc rows per term_id.
    count_domain_sql = (
        "update terms as t, (SELECT term_id, count(*) as count FROM term_doc "
        "group by term_id) as tmp set t.count_domain = tmp.count "
        "where t.term_id=tmp.term_id;"
    )
    # terms.idf_domain <- LOG(total / (count_domain + 1)); total is filled in below.
    idf_sql_template = "update terms set idf_domain=LOG(%s/(count_domain+1))"
    # term_doc.tf <- term_count divided by the summed term_count of its doc_id.
    tf_sql = (
        "update term_doc as t, (SELECT doc_id, sum(term_count) as doc_term_count "
        "FROM term_doc group by doc_id) as tmp "
        "set t.tf = t.term_count/tmp.doc_term_count where t.doc_id = tmp.doc_id"
    )
    # term_doc.tf_idf <- tf * idf_domain of the matching term.
    tf_idf_sql = (
        "update term_doc as td, terms as t set td.tf_idf = td.tf*t.idf_domain "
        "where td.term_id = t.term_id "
    )

    dbw.query(count_domain_sql)

    # Equivalent to: select count(*) as count from subjects;
    count_rows = dbw.select('subjects', what="count(*) as count")
    doc_total_count = count_rows[0].count

    dbw.query(idf_sql_template % doc_total_count)
    dbw.query(tf_sql)
    dbw.query(tf_idf_sql)
def compute_tf_idf():
    """Refresh the term statistics (count_domain, idf_domain, tf, tf_idf) via SQL.

    This in-database approach is only suitable when the data volume is very
    small; with a large data set a Hadoop script would need to be written.

    NOTE(review): an earlier ``compute_tf_idf`` definition issuing the same
    statements precedes this one in the file; this later definition is the
    one that takes effect.
    """
    run_sql = dbw.query

    # Step 1: store, per term, how many term_doc rows reference it.
    run_sql(
        "update terms as t, "
        "(SELECT term_id, count(*) as count FROM term_doc group by term_id) as tmp "
        "set t.count_domain = tmp.count where t.term_id=tmp.term_id;"
    )

    # Step 2: update each term's idf from the total number of subjects
    # (select count(*) as count from subjects;).
    doc_total_count = dbw.select('subjects', what="count(*) as count")[0].count
    run_sql("update terms set idf_domain=LOG(%s/(count_domain+1))" % doc_total_count)

    # Steps 3 and 4: update tf per (term, doc) row, then tf-idf = tf * idf.
    remaining_statements = (
        "update term_doc as t, "
        "(SELECT doc_id, sum(term_count) as doc_term_count FROM term_doc group by doc_id) as tmp "
        "set t.tf = t.term_count/tmp.doc_term_count where t.doc_id = tmp.doc_id",
        "update term_doc as td, terms as t "
        "set td.tf_idf = td.tf*t.idf_domain where td.term_id = t.term_id ",
    )
    for statement in remaining_statements:
        run_sql(statement)
def load_last_one(user_id):
    """Return the row with the highest ``pk_id`` stored for ``user_id``.

    The query orders by ``pk_id`` descending and asks for a single row, so
    the result is deterministic and only one row is transferred.  Without an
    explicit ordering the database is free to return rows in any order, and
    every row of the user would be fetched just to use the first one.
    The highest ``pk_id`` is taken to be the most recently inserted row
    (assumes ``pk_id`` is an increasing key -- confirm against the schema).

    Args:
        user_id: Value matched against the ``user_id`` column.

    Returns:
        The row (with ``pk_id``, ``subject``, ``body`` and ``task_status``),
        or ``False`` when the user has no rows.
    """
    rows = list(dbw.select(
        table_name,
        what="pk_id,subject,body,task_status",
        where="user_id=$user_id",
        # Pass only the value the query needs instead of every local name.
        vars={"user_id": user_id},
        order="pk_id desc",
        limit=1,
    ))
    if rows:
        return rows[0]
    return False