import copy

import db
import dt
import duration_event
import get_entity_count

# BEGIN_DATE, threemonth (a roughly three-month timedelta), column_entropy
# (index of the "entropy" column in each row), and the helpers
# get_all_dict, add_dict, sub_dict, cosine_sim, and entropy are defined at
# module level elsewhere in this file.


def get_top_entities_list(ndb, tag):
    print "get_top_entities_list-get_top_entities_list"
    # Connect to DB and create table
    dh = db.DbHandler(ndb)
    # begin_event_date = dh.get_info_dic()['event_begin_date']
    # begin_event_date = dt.str2date(begin_event_date)
    begin_event_date = BEGIN_DATE
    end_event_date = dh.get_info_dic()['end_date']
    end_event_date = dt.str2date(end_event_date)
    # begin_date = dh.get_info_dic()['begin_date']
    # begin_date = dt.str2date(begin_date)
    end_date = dh.get_info_dic()['end_date']
    end_date = dt.str2date(end_date)

    get_data_func = (lambda b, e: dh.get_ent(b, e, tag, 1000000000))

    # Header row: period, ten (Name, #) pairs, then the similarity,
    # entropy, and normalized entropy columns.
    dout = ["period"]
    for i in range(0, 10):
        dout.append("Name")
        dout.append("#")
    dout.append("one_to_before_one_cosine_sim")
    dout.append("one_to_all_cosine_sim")
    dout.append("one_to_all_before_cosine_sim")
    dout.append("one_to_all_future_cosine_sim")
    dout.append("one_to_first_three_mounth_sim")
    dout.append("one_to_peak_three_mounth_sim")
    dout.append("one_to_duration_sim")
    dout.append("entropy")
    dout.append("normalized_" + dout[-1])
    dout = [dout]

    all_dict = get_all_dict(begin_event_date, end_event_date, get_data_func)
    future_dict = dict()
    past_dict = dict()
    before_dict = dict()
    first_dict = dict()
    peak_dict = dict()
    dur_dict = dict()

    # First pass: collect entity counts for the first three-month period,
    # the peak three-month period, and the event duration.
    print "make all_dict"
    b = begin_event_date
    e = end_date
    peak_mention = get_entity_count.peak_period_mention(ndb)
    dur = duration_event.get_duration(ndb)
    while b < e:
        p = get_data_func(b, b + threemonth)
        ds = sorted(map(lambda w: (w[0], p.count(w)), list(set(p))),
                    key=lambda x: x[1], reverse=True)
        # Counts for the period containing the event begin date
        if b <= begin_event_date and begin_event_date < b + threemonth:
            for d in ds:
                if d[0] in first_dict.keys():
                    first_dict[d[0]] += d[1]
                else:
                    first_dict[d[0]] = d[1]
        # Counts for the period containing the mention peak
        if b < peak_mention and peak_mention <= b + threemonth:
            for d in ds:
                peak_dict[d[0]] = d[1]
        # Counts for all periods inside the event duration
        if dur[0] <= b and b + threemonth <= dur[1]:
            for d in ds:
                if d[0] in dur_dict.keys():
                    dur_dict[d[0]] += d[1]
                else:
                    dur_dict[d[0]] = d[1]
        b += threemonth

    # Second pass: one output row per period.
    future_dict = copy.deepcopy(all_dict)
    b = begin_event_date
    e = end_date
    print "get_top_entities_list second loop"
    while b < e:
        do = [dt.date2str(b)]
        p = get_data_func(b, b + threemonth)
        ds = sorted(map(lambda w: (w[0], p.count(w)), list(set(p))),
                    key=lambda x: x[1], reverse=True)
        now_dict = dict()
        for d in ds:
            now_dict[d[0]] = d[1]
        # Top ten entities, padded with blanks when fewer exist
        for d in ds[0:10]:
            do.append(d[0])
            do.append(d[1])
        for i in range(0, 10 - len(ds[0:10])):
            do.append("")
            do.append("")
        # The future counts are whatever remains after removing this period
        future_dict = sub_dict(future_dict, now_dict)
        do.append(cosine_sim(now_dict, before_dict))
        do.append(cosine_sim(now_dict, all_dict))
        do.append(cosine_sim(now_dict, past_dict))
        do.append(cosine_sim(now_dict, future_dict))
        do.append(cosine_sim(now_dict, first_dict))
        do.append(cosine_sim(now_dict, peak_dict))
        do.append(cosine_sim(now_dict, dur_dict))
        do.append(entropy(now_dict))
        do.append(0)  # placeholder for normalized_entropy, filled below
        dout.append(do)
        before_dict = now_dict
        past_dict = add_dict(past_dict, now_dict)
        b += threemonth

    # Calculate normalized_entropy
    maxv = 0
    for i in range(1, len(dout)):
        maxv = max(maxv, dout[i][column_entropy])
    if maxv != 0:
        for i in range(1, len(dout)):
            dout[i][column_entropy + 1] = dout[i][column_entropy] / float(maxv)
    return dout
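# The count-dict helpers used above (add_dict, sub_dict, cosine_sim,
# entropy) are defined elsewhere in this module. As a reference, here is
# a minimal sketch of plausible implementations, assuming plain
# {entity: count} dicts; the real versions may differ.
import math


def add_dict(a, b):
    # Element-wise sum of two {key: count} dicts.
    out = dict(a)
    for k, v in b.items():
        out[k] = out.get(k, 0) + v
    return out


def sub_dict(a, b):
    # Element-wise difference a - b, clamped so counts never go negative.
    out = dict(a)
    for k, v in b.items():
        out[k] = max(out.get(k, 0) - v, 0)
    return out


def cosine_sim(a, b):
    # Cosine similarity of two sparse count vectors; 0 if either is empty.
    dot = sum(v * b.get(k, 0) for k, v in a.items())
    na = math.sqrt(sum(v * v for v in a.values()))
    nb = math.sqrt(sum(v * v for v in b.values()))
    if na == 0 or nb == 0:
        return 0
    return dot / (na * nb)


def entropy(d):
    # Shannon entropy (bits) of the count distribution.
    total = float(sum(d.values()))
    if total == 0:
        return 0
    return -sum((v / total) * math.log(v / total, 2)
                for v in d.values() if v > 0)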
print 'DB: {0}'.format(args.db_file_name)
if not args.xlsx_file_name:
    args.xlsx_file_name = args.db_file_name.split('.')[0] + '.xlsx'
print 'Excel workbook: {0}'.format(args.xlsx_file_name)
print '# of months / period: {0}'.format(args.month_delta)
print '# of top words: {0}'.format(args.lim_word)
print '# of high score mentions: {0}'.format(args.lim_hscore)
print '-' * 40

# Create DB handler and xlsx workbook
xh = xlsx.XlsxHandler(args.db_file_name, args.xlsx_file_name,
                      args.month_delta)
try:
    # Set duration info to database
    duration_event.get_duration(args.db_file_name)
    # Set peak_mention info to database
    set_peak_to_info.set_peak(args.db_file_name)
    if int(xh.info_dic["event_doc_num"]) == 0:
        raise sqlite3.OperationalError(
            'int(xh.info_dic["event_doc_num"]) == 0')

    # Write sheets
    print 'Writing "info" sheet'
    xh.write_info()

    print 'Writing "entity_count" sheet'
    dout = get_entity_count.get_entity_count(args.db_file_name)
    xh.write_csv_date("entity_count", dout)
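# The args object used above comes from an argparse parser defined
# earlier in the script. A minimal sketch consistent with the attributes
# referenced here; the actual flag names and defaults are assumptions.
import argparse

parser = argparse.ArgumentParser(
    description='Export event statistics from a SQLite DB to an xlsx workbook.')
parser.add_argument('db_file_name', help='input SQLite database')
parser.add_argument('-x', '--xlsx_file_name', default=None,
                    help='output workbook (defaults to <db name>.xlsx)')
parser.add_argument('-m', '--month_delta', type=int, default=3,
                    help='number of months per period')
parser.add_argument('-w', '--lim_word', type=int, default=10,
                    help='number of top words per period')
parser.add_argument('-s', '--lim_hscore', type=int, default=10,
                    help='number of high score mentions')
args = parser.parse_args()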
# Nested helper: ndb, begin_event_date, end_date, and cosine_sim come
# from the enclosing scope (or module level).
def make_out_data(get_data_func):
    ti = tf_idf.TF_IDF(get_data_func, begin_event_date, end_date)
    threemonth = datetime.timedelta(days=92)

    dout = ["period"]
    dout.append("tf_idf_one_to_before_one_cosine_sim")
    dout.append("tf_idf_one_to_all_cosine_sim")
    dout.append("tf_idf_one_to_all_before_cosine_sim")
    dout.append("tf_idf_one_to_all_future_cosine_sim")
    dout.append("tf_idf_one_to_first_three_mounth_sim")
    dout.append("tf_idf_one_to_peak_three_mounth_sim")
    dout.append("tf_idf_one_to_duration_sim")
    dout = [dout]

    # First pass: locate the three-month period containing the peak.
    b = begin_event_date
    e = end_date
    peak_mention = get_entity_count.peak_period_mention(ndb)
    peak_begin = 0
    peak_end = 0
    while b < e:
        if b <= peak_mention and peak_mention <= b + threemonth:
            peak_begin = b
            peak_end = b + threemonth
        b += threemonth

    # Second pass: one output row per period.
    b = begin_event_date
    e = end_date
    dur = duration_event.get_duration(ndb)
    while b < e:
        print "tf_idf_sheet-second-loop", b
        do = [dt.date2str(b)]
        now_tiv = ti.get_tf_idf_vector(b, b + threemonth)

        # tf_idf_one_to_before_one_cosine_sim
        if b != begin_event_date:
            do.append(cosine_sim(
                now_tiv, ti.get_tf_idf_vector(b - threemonth, b)))
        else:
            do.append(0)

        # tf_idf_one_to_all_cosine_sim
        do.append(cosine_sim(now_tiv, ti.tf_idf_all))

        # tf_idf_one_to_all_before_cosine_sim
        c = cosine_sim(now_tiv, ti.get_tf_idf_vector(begin_event_date, b))
        print "tf_idf_one_to_all_before_cosine_sim = ", c, begin_event_date, b
        print "now_tiv = "
        print tf_idf.show_tf_idf_dict(now_tiv)
        print "ti.get_tf_idf_vector(begin_event_date, b) = "
        print tf_idf.show_tf_idf_dict(
            ti.get_tf_idf_vector(begin_event_date, b))
        if b != begin_event_date:
            do.append(c)
        else:
            do.append(0)

        # tf_idf_one_to_all_future_cosine_sim
        c = cosine_sim(now_tiv, ti.get_tf_idf_vector(b + threemonth, e))
        print "tf_idf_one_to_all_future_cosine_sim = ", c, b + threemonth, e
        print "now_tiv = "
        print tf_idf.show_tf_idf_dict(now_tiv)
        print "ti.get_tf_idf_vector(b + threemonth, e) = "
        print tf_idf.show_tf_idf_dict(
            ti.get_tf_idf_vector(b + threemonth, e))
        do.append(c)

        # tf_idf_one_to_first_three_mounth_sim
        do.append(cosine_sim(now_tiv, ti.get_tf_idf_vector(
            begin_event_date, begin_event_date + threemonth)))

        # tf_idf_one_to_peak_three_mounth_sim
        do.append(cosine_sim(
            now_tiv, ti.get_tf_idf_vector(peak_begin, peak_end)))

        # tf_idf_one_to_duration_sim
        do.append(cosine_sim(
            now_tiv, ti.get_tf_idf_vector(dur[0], dur[1])))

        dout.append(do)
        b += threemonth
    return dout
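# tf_idf.TF_IDF and tf_idf.show_tf_idf_dict live in a separate module
# that is not shown here. The following is a hypothetical sketch of the
# interface make_out_data relies on, assuming each fixed-length period
# counts as one "document" for the idf statistics; the real module may
# compute its weights differently.
import datetime
import math


class TF_IDF(object):
    def __init__(self, get_data_func, begin, end,
                 period=datetime.timedelta(days=92)):
        self.get_data_func = get_data_func
        self.df = {}        # term -> number of periods mentioning it
        self.num_docs = 0   # number of periods in [begin, end)
        b = begin
        while b < end:
            for t in set(w[0] for w in get_data_func(b, b + period)):
                self.df[t] = self.df.get(t, 0) + 1
            self.num_docs += 1
            b += period
        # Vector over the whole range, used by tf_idf_one_to_all_cosine_sim
        self.tf_idf_all = self.get_tf_idf_vector(begin, end)

    def get_tf_idf_vector(self, b, e):
        # Raw term counts in [b, e), each weighted by log(N / df);
        # terms outside the indexed range are skipped.
        tf = {}
        for w in self.get_data_func(b, e):
            tf[w[0]] = tf.get(w[0], 0) + 1
        return dict((t, c * math.log(float(self.num_docs) / self.df[t]))
                    for t, c in tf.items() if t in self.df)


def show_tf_idf_dict(d, n=10):
    # Render the n highest-weighted terms, one per line.
    top = sorted(d.items(), key=lambda x: x[1], reverse=True)[:n]
    return '\n'.join('%s\t%.4f' % (t, v) for t, v in top)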
import os
import pickle
import sqlite3
import traceback

import db
import duration_event
import get_entity_count
import get_top_entities_list
import query_num_sheet
import set_peak_to_info
import tf_idf_sheet
import various_name


def ndb_to_xlsx_data(ndb):
    pickle_name = ndb + ".pickle"
    if os.path.exists(pickle_name):
        # Results are cached; load and reuse them instead of recomputing
        with open(pickle_name, 'rb') as f:
            return pickle.load(f)

    xlsx_data = []
    try:
        # Set duration info to database
        dur = duration_event.get_duration(ndb)
        # Set peak_mention info to database
        set_peak_to_info.set_peak(ndb)

        dh = db.DbHandler(ndb)
        if int(dh.get_info_dic()["event_doc_num"]) == 0:
            print 'int(dh.get_info_dic()["event_doc_num"]) == 0'
            raise sqlite3.OperationalError(
                'int(dh.get_info_dic()["event_doc_num"]) == 0')

        print "Start for", ndb

        print "Writing entity_count sheet"
        dout = get_entity_count.get_entity_count(ndb)
        xlsx_data.append(("entity_count", (dout, dur)))

        print "Writing various_name sheet"
        dout = various_name.make_csv_data(ndb)
        xlsx_data.append(("various_name", (dout, dur)))

        print "Writing #articles_for_each_query sheet"
        dout = query_num_sheet.make_csv_data(ndb)
        xlsx_data.append(("articles_for_each_query", (dout, dur)))

        print "Writing top_entities_list_person sheet"
        dout = get_top_entities_list.get_sheet(ndb, "PERSON")
        xlsx_data.append(("top_entities_list_person", (dout, dur)))

        print "Writing top_entities_list_location sheet"
        dout = get_top_entities_list.get_sheet(ndb, "LOCATION")
        xlsx_data.append(("top_entities_list_location", (dout, dur)))

        print "Writing top_entities_list_organization sheet"
        dout = get_top_entities_list.get_sheet(ndb, "ORGANIZATION")
        xlsx_data.append(("top_entities_list_organization", (dout, dur)))

        print "Writing top_entities_list_date sheet"
        dout = get_top_entities_list.get_sheet(ndb, "DATE")
        xlsx_data.append(("top_entities_list_date", (dout, dur)))

        print "Writing tf_idf sheet"
        dout = tf_idf_sheet.make_tf_idf_sheet(ndb)
        xlsx_data.append(("tf_idf", (dout, dur)))

        print 'Done', ndb, '\n\n'
    except Exception:
        traceback.print_exc()
        print "Cannot make xlsx data for", ndb, "\n\n"

    with open(pickle_name, 'wb') as f:
        pickle.dump(xlsx_data, f)
    return xlsx_data
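# A hypothetical driver for ndb_to_xlsx_data: precompute (and pickle)
# the sheet data for every database, then write the workbooks. The
# XlsxHandler constructor and the write_info/write_csv_date calls mirror
# the main script above; the glob pattern and month delta are
# assumptions. dur travels alongside each dout for writers that mark the
# event duration.
import glob

import xlsx  # project-local workbook writer


if __name__ == '__main__':
    for ndb in sorted(glob.glob('*.db')):
        data = ndb_to_xlsx_data(ndb)
        if not data:
            continue  # empty DB or a failure already logged above
        xh = xlsx.XlsxHandler(ndb, ndb.split('.')[0] + '.xlsx', 3)
        xh.write_info()
        for sheet_name, (dout, dur) in data:
            xh.write_csv_date(sheet_name, dout)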