def get_duration(ndb):
    """Work out the event's three-month window, store it and return it.

    The window start comes from ``get_begin_date_wikidata`` for the stored
    ``event_name``.  When that yields the 1900-01-01 placeholder, the stored
    ``begin_date``..``end_date`` range is scanned in three-month steps and
    the first step for which ``get_ent_all_tag`` returns something is used.
    If the window is still the placeholder afterwards, 1987-01-01 is used.

    The result is written through ``insert_info`` under the keys
    ``event_begin_date`` and ``event_duration``, printed, and returned as a
    ``(begin, end)`` tuple.
    """
    handler = db.DbHandler(ndb)
    wiki_begin = get_begin_date_wikidata(handler.get_info_dic()["event_name"])

    # The placeholder window doubles as the "not resolved yet" marker.
    placeholder_day = datetime.date(1900, 1, 1)
    unresolved = (placeholder_day, placeholder_day + threemonth)
    if wiki_begin == placeholder_day:
        window = unresolved
    else:
        window = (wiki_begin, wiki_begin + threemonth)

    cursor = dt.str2date(handler.get_info_dic()['begin_date'])
    limit = dt.str2date(handler.get_info_dic()['end_date'])

    if window == unresolved:
        # Fall back to the first three-month step that has any entity.
        while cursor < limit:
            step_end = cursor + threemonth
            if len(handler.get_ent_all_tag(cursor, step_end, 100000)) > 0:
                window = (cursor, step_end)
                break
            cursor = step_end

    if window == unresolved:
        fallback_day = datetime.date(1987, 1, 1)
        window = (fallback_day, fallback_day + threemonth)

    handler.insert_info('event_begin_date', dt.date2str(window[0]))
    handler.insert_info(
        'event_duration',
        '-'.join([dt.date2str(window[0]), dt.date2str(window[1])]))

    # Single-argument print: same output as the Python 2 print statement
    # with comma-separated items (items joined by one space).
    print("%s %s %s %s %s %s" % (
        "duration of ", ndb, " is ", window[0], "-", window[1]))
    return window
def make_csv_data(db_file_name):
    """Build a table of per-period article counts for every query name.

    The first row is a header: an empty cell followed by one
    ``<begin>_<end>`` label per 92-day period between ``BEGIN_DATE`` and
    ``END_DATE``.  Every following row starts with a query name and holds,
    for each period, the value ``count_articles`` reports for that
    ``(begin, end)`` key.

    Returns the table as a list of lists.
    """
    handler = db.DbHandler(db_file_name)
    query_names = handler.select_various_names_of_query()

    step = datetime.timedelta(days=92)
    header = ['']
    periods = []
    start = BEGIN_DATE
    stop_at = END_DATE
    while start < stop_at:
        finish = start + step
        header.append(dt.date2str(start) + '_' + dt.date2str(finish))
        periods.append((start, finish))
        start = finish

    table = [header]
    for name in query_names:
        counts = count_articles(get_articles(name))
        table.append([name] + [counts[period] for period in periods])
    return table
def get_entity_count(fdb):
    """Tabulate mention and entity counts per 92-day period.

    Periods run from ``BEGIN_DATE`` up to the ``end_date`` stored in the
    database *fdb*.  For PERSON, LOCATION and ORGANIZATION the row holds
    three numbers: how many entities ``get_ent`` returned, how many of them
    were not seen in any earlier period, and how many distinct ones there
    are in this period.

    Returns a list of rows; the first row is the header.
    """
    # Connect to DB
    dcon = db.DbHandler(fdb)
    period_start = BEGIN_DATE
    period_limit = dt.str2date(dcon.get_info_dic()['end_date'])
    step = datetime.timedelta(days=92)

    table = [[
        "period", "#mention",
        "#person(all)", "#person(new)", "#person(unique)",
        "#location(all)", "#location(new)", "#location(unique)",
        "#organization(all)", "#organization(new)", "#organization(unique)"
    ]]

    tags = ("PERSON", "LOCATION", "ORGANIZATION")
    # Entities seen in all earlier periods, tracked separately per tag.
    seen = dict((tag, set()) for tag in tags)

    while period_start < period_limit:
        period_end = period_start + step
        row = [dt.date2str(period_start),
               dcon.get_ref_num(period_start, period_end)]
        for tag in tags:
            found = dcon.get_ent(period_start, period_end, tag, 100000000)
            distinct = set(found)
            merged = seen[tag].union(distinct)
            row.extend([
                len(found),
                len(merged) - len(seen[tag]),
                len(distinct),
            ])
            seen[tag] = merged
        table.append(row)
        period_start = period_end
    return table
def make_out_data(get_data_func):
    """Count temporal-expression kinds per 92-day period and normalize them.

    ``get_data_func(begin, end)`` is called for every period between the
    surrounding ``begin_date`` and ``end_date``; the first item of each
    returned row is classified with ``is_date``, ``is_mounth``, ``is_year``,
    ``is_present``, ``is_season`` and ``is_past``.

    Returns a list of rows.  The header row is ``period``, the six raw
    count columns, then the six ``normalized_*`` columns in the same order.
    Each normalized value is the raw count divided by that column's maximum
    over all periods (left at 0 when the maximum is 0).

    Fix: the original built the ``normalized_*`` labels with ``dout[-i]``,
    which walks the raw labels backwards, while the values were filled in
    forwards.  Every normalized column therefore carried the wrong label.
    """
    threemonth = datetime.timedelta(days=92)

    raw_labels = [
        "xxxx-xx-xx",
        "xxxx-xx",
        "xxxx",
        "PRESENT_REF",
        'xxxx-SU/SP/FA/WN',
        "PAST_REF",
    ]
    # Same order as raw_labels, so column k is counted by classifiers[k].
    classifiers = [is_date, is_mounth, is_year, is_present, is_season, is_past]
    width = len(raw_labels)

    header = ["period"] + raw_labels
    header.extend("normalized_" + label for label in raw_labels)
    dout = [header]

    b = begin_date
    e = end_date
    while b < e:
        expressions = [row[0] for row in get_data_func(b, b + threemonth)]
        counts = [
            sum(1 for expr in expressions if check(expr))
            for check in classifiers
        ]
        dout.append([dt.date2str(b)] + counts + [0] * width)
        b += threemonth

    # Raw column `col` is normalized into column `col + width`.
    data_rows = dout[1:]
    for col in range(1, width + 1):
        maxv = max([0] + [row[col] for row in data_rows])
        if maxv != 0:
            for row in data_rows:
                row[col + width] = row[col] / float(maxv)
    return dout
help='begin date (YYYY-MM-DD)') ap.add_argument('-e', '--end-date', type=dt.str2date, default=END_DATE, help='end date (YYYY-MM-DD)') ap.add_argument('-i', '--include-neighbor', type=int, default=INCLUDE_NEIGHBOR, help='# of neighbor sentences to include') ap.add_argument('-m', '--reference-type', default=REFERENCE_TYPE, choices='wpd', help='Type of reference (whole/part/divide)') args = ap.parse_args() print 'DB: {0}'.format(args.db_file_name) print 'Solr URL: {0}'.format(args.solr_url) print 'article begin date: {0}'.format(dt.date2str(args.begin_date)) print 'article end date: {0}'.format(dt.date2str(args.end_date)) print 'neighbor sentences: {0}'.format(args.include_neighbor) print 'reference type: {0}'.format(args.reference_type) print '-' * 40 # Connect to DB and create table dh = db.DbHandler(args.db_file_name) dh.create_table_article() dh.create_table_reference() # Set Info print 'Setting info' dh.insert_info('begin_date', dt.date2str(args.begin_date)) dh.insert_info('end_date', dt.date2str(args.end_date)) dh.insert_info('include_neighbor', str(args.include_neighbor))
from util import wk
import wikidate
import os.path

if __name__ == '__main__':
    # Parse args
    ap = argparse.ArgumentParser(description='Get event info.')
    ap.add_argument('db_file_name', help='SQLite3 DB file name')
    ap.add_argument('event_name', help='event name')
    args = ap.parse_args()

    print('DB: {0}'.format(args.db_file_name))
    print('event name: {0}'.format(args.event_name))
    event_begin_date = dt.date2str(wikidate.get_begin_date(args.event_name))
    end_date = '2007-06-20'
    print('-' * 40)

    # Get black list of words: one word per line, in a file named after the
    # event with its spaces removed.  A missing file means an empty list.
    black_list = []
    black_list_path = "./black_words_list/" + args.event_name.replace(" ", "")
    if os.path.exists(black_list_path):
        # `with` closes the file; the original left the handle open.
        with open(black_list_path, 'r') as f:
            black_list = [line.replace("\n", "") for line in f]
    # Single-argument print keeps the Python 2 output (items joined by a
    # space) and also parses under Python 3.
    print("%s %s" % ("black list is ", black_list))

    # Connect to DB and create table
    dh = db.DbHandler(args.db_file_name)
def _ranked_entity_counts(rows):
    """Return ``(row[0], count)`` pairs for *rows*, most frequent first.

    Every distinct row is counted in one pass; the original called
    ``rows.count(...)`` once per distinct row, which is quadratic.
    The order among equal counts is unspecified (it was set order before).
    """
    from collections import Counter
    tally = Counter(rows)
    return sorted(((row[0], hits) for row, hits in tally.items()),
                  key=lambda pair: pair[1], reverse=True)


def _accumulate_counts(target, ranked):
    """Add each ``(name, count)`` pair of *ranked* into the dict *target*."""
    for name, hits in ranked:
        target[name] = target.get(name, 0) + hits


def get_top_entities_list(ndb, tag):
    """Build the per-period "top entities" sheet for entity type *tag*.

    Periods are three-month windows (module-level ``threemonth``) from
    ``BEGIN_DATE`` up to the ``end_date`` stored in database *ndb*.  Each
    data row holds: the period start, the ten most frequent entities with
    their counts (padded with empty strings), seven cosine similarities of
    the period's counts against reference count dicts (previous period,
    all periods, all earlier periods, all later periods, first window, peak
    window, event duration), the entropy and the normalized entropy.

    Returns a list of rows; the first row is the header.

    Changes against the original:
    * ``first_dict`` is filled only from the window that contains
      ``begin_event_date``.  The original test ``begin_event_date <= b`` is
      true for every window, so the "first three months" reference was the
      sum over all periods.
    * Each window is fetched and counted once and reused by the second
      pass instead of being queried again.
    """
    print("get_top_entitites_list-get_top_entities_list")
    # Connect to DB
    dh = db.DbHandler(ndb)
    begin_event_date = BEGIN_DATE
    end_event_date = dt.str2date(dh.get_info_dic()['end_date'])
    end_date = dt.str2date(dh.get_info_dic()['end_date'])
    get_data_func = (lambda b, e: dh.get_ent(b, e, tag, 1000000000))

    header = ["period"]
    for _ in range(10):
        header.extend(["Name", "#"])
    header.extend([
        "one_to_before_one_cosine_sim",
        "one_to_all_cosine_sim",
        "one_to_all_before_cosine_sim",
        "one_to_all_future_cosine_sim",
        "one_to_first_three_mounth_sim",
        "one_to_peak_three_mounth_sim",
        "one_to_duration_sim",
        "entropy",
    ])
    header.append("normalized_" + header[-1])
    dout = [header]

    all_dict = get_all_dict(begin_event_date, end_event_date, get_data_func)
    past_dict = dict()
    before_dict = dict()
    first_dict = dict()
    peak_dict = dict()
    dur_dict = dict()

    print("make all_dict")
    peak_mention = get_entity_count.peak_period_mention(ndb)
    dur = duration_event.get_duration(ndb)

    # First pass: count every window once and fill the reference dicts.
    windows = []
    b = begin_event_date
    e = end_date
    while b < e:
        window_end = b + threemonth
        ranked = _ranked_entity_counts(get_data_func(b, window_end))
        windows.append((b, ranked))
        if b <= begin_event_date and begin_event_date < window_end:
            _accumulate_counts(first_dict, ranked)
        if b < peak_mention and peak_mention <= window_end:
            # Plain assignment, as before: a later duplicate name wins.
            for name, hits in ranked:
                peak_dict[name] = hits
        if dur[0] <= b and window_end <= dur[1]:
            _accumulate_counts(dur_dict, ranked)
        b = window_end

    # Starts as everything; each window is subtracted as it is reached.
    future_dict = copy.deepcopy(all_dict)

    print("get_top_entities_list second loop")
    for period_start, ranked in windows:
        now_dict = dict(ranked)
        top = ranked[:10]

        do = [dt.date2str(period_start)]
        for name, hits in top:
            do.extend([name, hits])
        do.extend([""] * (2 * (10 - len(top))))

        future_dict = sub_dict(future_dict, now_dict)
        do.append(cosine_sim(now_dict, before_dict))
        do.append(cosine_sim(now_dict, all_dict))
        do.append(cosine_sim(now_dict, past_dict))
        do.append(cosine_sim(now_dict, future_dict))
        do.append(cosine_sim(now_dict, first_dict))
        do.append(cosine_sim(now_dict, peak_dict))
        do.append(cosine_sim(now_dict, dur_dict))
        do.append(entropy(now_dict))
        do.append(0)  # normalized entropy, filled in below
        dout.append(do)

        before_dict = now_dict
        past_dict = add_dict(past_dict, now_dict)

    # Calculate normalized_entropy
    data_rows = dout[1:]
    maxv = max([0] + [row[column_entropy] for row in data_rows])
    if maxv != 0:
        for row in data_rows:
            row[column_entropy + 1] = row[column_entropy] / float(maxv)
    return dout
def make_out_data(get_data_func):
    """Build the per-period TF-IDF cosine-similarity sheet.

    A ``tf_idf.TF_IDF`` object is created from *get_data_func* and the
    surrounding ``begin_event_date`` / ``end_date``.  For every 92-day
    window the window's TF-IDF vector is compared (``cosine_sim``) with:
    the previous window, the whole corpus (``ti.tf_idf_all``), everything
    before the window, everything after it, the first window, the window
    containing the peak mention, and the event duration from
    ``duration_event.get_duration``.  The "before" columns are 0 for the
    first window.

    Returns a list of rows; the first row is the header.

    Changes against the original:
    * The debug dump labelled ``ti.get_tf_idf_vector(b + threemonth, e)``
      printed the "before" vector; it now prints the future vector.
    * Vectors requested several times with identical arguments are
      computed once, and the first/peak/duration vectors are computed once
      before the loop.  NOTE(review): this assumes
      ``get_tf_idf_vector`` returns the same result for the same arguments
      -- confirm against ``tf_idf.TF_IDF``.
    """
    ti = tf_idf.TF_IDF(get_data_func, begin_event_date, end_date)
    threemonth = datetime.timedelta(days=92)
    dout = [[
        "period",
        "tf_idf_one_to_before_one_cosine_sim",
        "tf_idf_one_to_all_cosine_sim",
        "tf_idf_one_to_all_before_cosine_sim",
        "tf_idf_one_to_all_future_cosine_sim",
        "tf_idf_one_to_first_three_mounth_sim",
        "tf_idf_one_to_peak_three_mounth_sim",
        "tf_idf_one_to_duration_sim",
    ]]

    # Locate the window containing the peak; the last match wins and
    # (0, 0) is kept when no window matches, as before.
    peak_mention = get_entity_count.peak_period_mention(ndb)
    peak_begin = 0
    peak_end = 0
    b = begin_event_date
    e = end_date
    while b < e:
        if b <= peak_mention and peak_mention <= b + threemonth:
            peak_begin = b
            peak_end = b + threemonth
        b += threemonth

    dur = duration_event.get_duration(ndb)

    b = begin_event_date
    if not b < e:
        # No window at all: header only, and no vector is requested.
        return dout

    # These three reference vectors do not depend on the current window.
    first_tiv = ti.get_tf_idf_vector(
        begin_event_date, begin_event_date + threemonth)
    peak_tiv = ti.get_tf_idf_vector(peak_begin, peak_end)
    dur_tiv = ti.get_tf_idf_vector(dur[0], dur[1])

    prev_tiv = None
    while b < e:
        window_end = b + threemonth
        is_first = (b == begin_event_date)
        print("%s %s" % ("tf_idf_sheet-second-loop", b))
        do = [dt.date2str(b)]
        now_tiv = ti.get_tf_idf_vector(b, window_end)

        # tf_idf_one_to_before_one_cosine_sim
        # prev_tiv is the vector of (b - threemonth, b): the previous window.
        do.append(0 if is_first else cosine_sim(now_tiv, prev_tiv))

        # tf_idf_one_to_all_cosine_sim
        do.append(cosine_sim(now_tiv, ti.tf_idf_all))

        # tf_idf_one_to_all_before_cosine_sim
        past_tiv = ti.get_tf_idf_vector(begin_event_date, b)
        past_sim = cosine_sim(now_tiv, past_tiv)
        print("%s %s %s %s" % (
            "tf_idf_one_to_all_before_cosine_sim = ",
            past_sim, begin_event_date, b))
        print("now_tiv = ")
        print(tf_idf.show_tf_idf_dict(now_tiv))
        print("ti.get_tf_idf_vector(begin_event_date, b) = ")
        print(tf_idf.show_tf_idf_dict(past_tiv))
        print(past_sim)
        do.append(0 if is_first else past_sim)

        # tf_idf_one_to_all_future_cosine_sim
        future_tiv = ti.get_tf_idf_vector(window_end, e)
        future_sim = cosine_sim(now_tiv, future_tiv)
        print("%s %s %s %s" % (
            "tf_idf_one_to_all_future_cosine_sim = ",
            future_sim, window_end, e))
        print("now_tiv = ")
        print(tf_idf.show_tf_idf_dict(now_tiv))
        print("ti.get_tf_idf_vector(b + threemonth, e) = ")
        print(tf_idf.show_tf_idf_dict(future_tiv))
        print(future_sim)
        do.append(future_sim)

        # tf_idf_one_to_first_three_mounth_sim
        do.append(cosine_sim(now_tiv, first_tiv))
        # tf_idf_one_to_peak_three_mounth_sim
        do.append(cosine_sim(now_tiv, peak_tiv))
        # tf_idf_one_to_duration_sim
        do.append(cosine_sim(now_tiv, dur_tiv))

        dout.append(do)
        prev_tiv = now_tiv
        b = window_end
    return dout
ap = argparse.ArgumentParser(description='Get event info.') ap.add_argument('db_file_name', help='SQLite3 DB file name') ap.add_argument('event_name', help='event name') ap.add_argument('event_begin_date', type=dt.str2date, help='event begin date') ap.add_argument('event_end_date', type=dt.str2date, help='event end date') args = ap.parse_args() print 'DB: {0}'.format(args.db_file_name) print 'event name: {0}'.format(args.event_name) print 'event begin date: {0}'.format(dt.date2str(args.event_begin_date)) print 'event end date: {0}'.format(dt.date2str(args.event_end_date)) print '-' * 40 # Connect to DB and create table dcon = db.DbHandler(args.db_file_name) threemonth = datetime.timedelta(days=92) # b = args.event_begin_date b = BEGIN_DATE e = args.event_end_date output = [[ "period", "#mention", "#person(all)", "#person(new)", "#person(unique)", "#location(all)", "#location(new)", "#location(unique)", "#organization(all)", "#organization(new)", "#organization(unique)"
def __join_text_date(self, text, date):
    '''
    Put a ``[<date>...</date>]`` line holding *date* (formatted with
    ``dt.date2str``) in front of *text* and return the combined string.
    '''
    return '[<date>{0}</date>]\n{1}'.format(dt.date2str(date), text)
def set_peak(ndb):
    """Store the peak mention date of database *ndb* under ``peak_mention``.

    The date comes from ``get_entity_count.peak_period_mention`` and is
    written, as a string, through ``DbHandler.insert_info``.
    """
    peak_day = get_entity_count.peak_period_mention(ndb)
    handler = db.DbHandler(ndb)
    handler.insert_info('peak_mention', dt.date2str(peak_day))