Пример #1
0
def get_duration(ndb):
    dh = db.DbHandler(ndb)
    event_name = dh.get_info_dic()["event_name"]
    r = get_begin_date_wikidata(event_name)
    senti = (datetime.date(1900, 1, 1), datetime.date(1900, 1, 1) + threemonth)
    dur = senti
    if not r == datetime.date(1900, 1, 1):
        dur = (r, r + threemonth)
    b = dh.get_info_dic()['begin_date']
    b = dt.str2date(b)
    e = dh.get_info_dic()['end_date']
    e = dt.str2date(e)
    if dur == senti:
        while b < e:
            if len(dh.get_ent_all_tag(b, b + threemonth, 100000)) > 0:
                dur = (b, b + threemonth)
                break
            b += threemonth
    if dur == senti:
        dur = (datetime.date(1987, 1,
                             1), datetime.date(1987, 1, 1) + threemonth)
    dh.insert_info('event_begin_date', dt.date2str(dur[0]))
    # print dt.date2str(dur[0])
    # begin_event_date = dh.get_info_dic()['event_begin_date']
    # print begin_event_date
    dh.insert_info('event_duration',
                   dt.date2str(dur[0]) + '-' + dt.date2str(dur[1]))
    print "duration of ", ndb, " is ", dur[0], "-", dur[1]
    return dur
Пример #2
0
def make_csv_data(db_file_name):
    """Build a table of article counts per query and per 92-day period.

    The first row is the header: an empty cell followed by one
    ``<begin>_<end>`` label for every period between BEGIN_DATE and END_DATE.
    Each following row holds a query name and, for every period, the value
    that ``count_articles`` reports for that ``(begin, end)`` key.
    """
    handler = db.DbHandler(db_file_name)
    query_names = handler.select_various_names_of_query()

    window_start = BEGIN_DATE
    stop_at = END_DATE
    step = datetime.timedelta(days=92)

    header = ['']
    windows = []
    while window_start < stop_at:
        window_end = window_start + step
        header.append(dt.date2str(window_start) + '_' + dt.date2str(window_end))
        windows.append((window_start, window_end))
        window_start = window_end

    table = [header]
    for name in query_names:
        counts = count_articles(get_articles(name))
        table.append([name] + [counts[window] for window in windows])
    return table
Пример #3
0
def get_entity_count(fdb):
    """Return a per-period table of mention and entity counts.

    Periods are consecutive 92-day windows from BEGIN_DATE up to the
    'end_date' stored in the DB info.  Every data row contains the period
    start, the value of ``get_ref_num`` for the window and, for each of
    PERSON / LOCATION / ORGANIZATION: the number of entities returned, how
    many of them had not been seen in any earlier window, and the number of
    distinct entities in the window.  The first row is the header.
    """
    handler = db.DbHandler(fdb)
    window_start = BEGIN_DATE
    last_day = dt.str2date(handler.get_info_dic()['end_date'])
    step = datetime.timedelta(days=92)

    table = [[
        "period", "#mention", "#person(all)", "#person(new)",
        "#person(unique)", "#location(all)", "#location(new)",
        "#location(unique)", "#organization(all)", "#organization(new)",
        "#organization(unique)"
    ]]

    tags = ("PERSON", "LOCATION", "ORGANIZATION")
    # Entities seen so far, kept separately for every tag.
    seen = dict((tag, set()) for tag in tags)

    while window_start < last_day:
        window_end = window_start + step
        row = [
            dt.date2str(window_start),
            handler.get_ref_num(window_start, window_end),
        ]
        for tag in tags:
            found = handler.get_ent(window_start, window_end, tag, 100000000)
            distinct = set(found)
            merged = seen[tag] | distinct
            row.extend(
                [len(found), len(merged) - len(seen[tag]), len(distinct)])
            seen[tag] = merged
        table.append(row)
        window_start = window_end
    return table
Пример #4
0
    def make_out_data(get_data_func):
        """Build the date-expression sheet for consecutive 92-day periods.

        ``get_data_func(begin, end)`` must return a sequence of items whose
        first element is the expression text.  For each period between
        ``begin_date`` and ``end_date`` (taken from the enclosing scope) the
        expressions are counted per category using the ``is_*`` predicates.

        Returns a list of rows.  Row 0 is the header: "period", the six raw
        category columns, then the six matching "normalized_<category>"
        columns in the same order.  Each data row holds the period start, the
        six raw counts and the six counts divided by that column's maximum
        (left at 0 when the maximum is 0).
        """
        threemonth = datetime.timedelta(days=92)
        raw_columns = [
            "xxxx-xx-xx",
            "xxxx-xx",
            "xxxx",
            "PRESENT_REF",
            'xxxx-SU/SP/FA/WN',
            "PAST_REF",
        ]
        # One predicate per raw column, in the same order as raw_columns.
        predicates = (is_date, is_mounth, is_year, is_present, is_season,
                      is_past)
        n_raw = len(raw_columns)

        # The normalized headers follow the order of the raw headers so that
        # header k + n_raw labels the normalized values of column k.
        header = (["period"] + raw_columns +
                  ["normalized_" + name for name in raw_columns])
        dout = [header]

        # Count the expressions of every category, period by period.
        b = begin_date
        e = end_date
        while b < e:
            expressions = [item[0] for item in get_data_func(b, b + threemonth)]
            counts = [sum(1 for text in expressions if check(text))
                      for check in predicates]
            dout.append([dt.date2str(b)] + counts + [0] * n_raw)
            b += threemonth

        # Normalize each raw column by its maximum over all periods.
        rows = dout[1:]
        for col in range(1, n_raw + 1):
            maxv = 0
            for row in rows:
                maxv = max(maxv, row[col])
            if maxv != 0:
                for row in rows:
                    row[col + n_raw] = row[col] / float(maxv)

        return dout
Пример #5
0
        help='begin date (YYYY-MM-DD)')

    # Optional end of the article date range; the value is parsed with
    # dt.str2date and defaults to END_DATE.
    ap.add_argument('-e', '--end-date', type=dt.str2date, default=END_DATE,
        help='end date (YYYY-MM-DD)')

    # Optional integer count of neighbor sentences, defaulting to
    # INCLUDE_NEIGHBOR.
    ap.add_argument('-i', '--include-neighbor',
        type=int, default=INCLUDE_NEIGHBOR,
        help='# of neighbor sentences to include')

    # NOTE(review): choices is a plain string, so argparse's membership check
    # is a substring test; values such as 'wp' would be accepted as well as
    # 'w', 'p' and 'd'. Confirm whether a tuple of single letters was intended.
    ap.add_argument('-m', '--reference-type', default=REFERENCE_TYPE, choices='wpd',
        help='Type of reference (whole/part/divide)')

    # Parse the command line and echo the effective settings.
    args = ap.parse_args()
    print 'DB: {0}'.format(args.db_file_name)
    print 'Solr URL: {0}'.format(args.solr_url)
    print 'article begin date: {0}'.format(dt.date2str(args.begin_date))
    print 'article   end date:   {0}'.format(dt.date2str(args.end_date))
    print 'neighbor sentences: {0}'.format(args.include_neighbor)
    print 'reference type: {0}'.format(args.reference_type)
    print '-' * 40

    # Connect to DB and create the article and reference tables
    dh = db.DbHandler(args.db_file_name)
    dh.create_table_article()
    dh.create_table_reference()

    # Store the run settings in the DB info; every value is written as a string
    print 'Setting info'
    dh.insert_info('begin_date', dt.date2str(args.begin_date))
    dh.insert_info('end_date', dt.date2str(args.end_date))
    dh.insert_info('include_neighbor', str(args.include_neighbor))
Пример #6
0
from util import wk
import wikidate
import os.path

if __name__ == '__main__':
    # Parse args
    ap = argparse.ArgumentParser(description='Get event info.')

    ap.add_argument('db_file_name', help='SQLite3 DB file name')

    ap.add_argument('event_name', help='event name')

    args = ap.parse_args()
    print 'DB: {0}'.format(args.db_file_name)
    print 'event name: {0}'.format(args.event_name)
    event_begin_date = dt.date2str(wikidate.get_begin_date(args.event_name))
    end_date = '2007-06-20'
    print '-' * 40

    # Get black list of word
    black_list = []
    if os.path.exists("./black_words_list/" +
                      args.event_name.replace(" ", "")):
        f = open("./black_words_list/" + args.event_name.replace(" ", ""), 'r')
        for r in f:
            black_list.append(r.replace("\n", ""))
    print "black list is ", black_list

    # Connect to DB and create table
    dh = db.DbHandler(args.db_file_name)
Пример #7
0
def get_top_entities_list(ndb, tag):
    """Build the per-period "top entities" sheet for one entity tag.

    ndb is passed to db.DbHandler, get_entity_count.peak_period_mention and
    duration_event.get_duration; tag is forwarded to dh.get_ent.

    Returns a list of rows. Row 0 is the header: "period", ten "Name"/"#"
    pairs, seven similarity columns, "entropy" and "normalized_entropy".
    Every following row describes one window of length threemonth, starting
    at BEGIN_DATE and running up to the 'end_date' stored in the DB info.

    threemonth and column_entropy are not defined in this function; they are
    read from an enclosing/module scope.
    """
    print "get_top_entitites_list-get_top_entities_list"
    # Connect to DB
    dh = db.DbHandler(ndb)
    # begin_event_date = dh.get_info_dic()['event_begin_date']
    # begin_event_date = dt.str2date(begin_event_date)
    begin_event_date = BEGIN_DATE
    end_event_date = dh.get_info_dic()['end_date']
    end_event_date = dt.str2date(end_event_date)
    # begin_date = dh.get_info_dic()['begin_date']
    # begin_date = dt.str2date(begin_date)
    # end_date is read from the same 'end_date' info key as end_event_date.
    end_date = dh.get_info_dic()['end_date']
    end_date = dt.str2date(end_date)

    # Fetch the entities with the requested tag for the window [b, e).
    # NOTE(review): whether the window end is exclusive depends on dh.get_ent.
    get_data_func = (lambda b, e: dh.get_ent(b, e, tag, 1000000000))

    # Header row: ten (name, count) pairs, then the similarity columns.
    dout = ["period"]
    for i in range(0, 10):
        dout.append("Name")
        dout.append("#")
    dout.append("one_to_before_one_cosine_sim")
    dout.append("one_to_all_cosine_sim")
    dout.append("one_to_all_before_cosine_sim")
    dout.append("one_to_all_future_cosine_sim")
    dout.append("one_to_first_three_mounth_sim")
    dout.append("one_to_peak_three_mounth_sim")
    dout.append("one_to_duration_sim")

    dout.append("entropy")
    dout.append("normalized_" + dout[-1])

    dout = [dout]
    # Reference dictionaries that each window is compared against below.
    all_dict = get_all_dict(begin_event_date, end_event_date, get_data_func)
    future_dict = dict()
    past_dict = dict()
    before_dict = dict()
    first_dict = dict()
    peak_dict = dict()
    dur_dict = dict()

    # First pass: fill first_dict, peak_dict and dur_dict
    print "make all_dict"
    b = begin_event_date
    e = end_date
    peak_mention = get_entity_count.peak_period_mention(ndb)
    # peak_begin = 0
    # peak_end = 0
    dur = duration_event.get_duration(ndb)

    # print "get_top_entities_list first loop"
    while b < e:
        # print "get_top_entities_list first loop", b
        p = get_data_func(b, b + threemonth)
        # (first element of the item, occurrences of the item in p) for every
        # distinct item, most frequent first.
        ds = sorted(map(lambda w: (w[0], p.count(w)), list(set(p))),
                    key=lambda x: x[1],
                    reverse=True)
        # NOTE(review): b starts at begin_event_date and only grows, so this
        # condition looks true for every window and first_dict accumulates
        # all windows - confirm whether only the first window was intended.
        if begin_event_date <= b and begin_event_date < b + threemonth:
            for d in ds:
                if d[0] in first_dict.keys():
                    first_dict[d[0]] += d[1]
                else:
                    first_dict[d[0]] = d[1]
        # Window containing the peak mention date: counts are assigned, not
        # accumulated.
        if b < peak_mention and peak_mention <= b + threemonth:
            for d in ds:
                peak_dict[d[0]] = d[1]
        # Windows lying completely inside the event duration.
        if dur[0] <= b and b + threemonth <= dur[1]:
            for d in ds:
                if d[0] in dur_dict.keys():
                    dur_dict[d[0]] += d[1]
                else:
                    dur_dict[d[0]] = d[1]
        b += threemonth
    # future_dict starts as a copy of everything; each window is subtracted
    # from it in the second pass before the comparison is made.
    future_dict = copy.deepcopy(all_dict)

    # Second pass: emit one output row per window
    b = begin_event_date
    e = end_date
    print "get_top_entities_list second loop"
    while b < e:
        # print "get_top_entities_list second loop", b
        do = [dt.date2str(b)]
        p = get_data_func(b, b + threemonth)
        ds = sorted(map(lambda w: (w[0], p.count(w)), list(set(p))),
                    key=lambda x: x[1],
                    reverse=True)
        now_dict = dict()
        for d in ds:
            now_dict[d[0]] = d[1]
        # Top ten names with their counts, padded with empty cells.
        for d in ds[0:10]:
            do.append(d[0])
            do.append(d[1])
        for i in range(0, 10 - len(ds[0:10])):
            do.append("")
            do.append("")
        # Remove the current window before comparing against future_dict.
        future_dict = sub_dict(future_dict, now_dict)

        # Similarity columns, in the same order as the header.
        do.append(cosine_sim(now_dict, before_dict))
        do.append(cosine_sim(now_dict, all_dict))
        do.append(cosine_sim(now_dict, past_dict))
        do.append(cosine_sim(now_dict, future_dict))
        do.append(cosine_sim(now_dict, first_dict))
        do.append(cosine_sim(now_dict, peak_dict))
        do.append(cosine_sim(now_dict, dur_dict))

        # Entropy plus a placeholder for the normalized value filled in below.
        do.append(entropy(now_dict))
        do.append(0)

        dout.append(do)

        # The current window becomes "before"/"past" only after it was compared.
        before_dict = now_dict
        past_dict = add_dict(past_dict, now_dict)

        b += threemonth

    # Calculate normalized_entropy: entropy divided by the maximum over all
    # rows; the placeholder 0 stays when that maximum is 0.
    maxv = 0
    for i in range(1, len(dout)):
        maxv = max(maxv, dout[i][column_entropy])
    if maxv != 0:
        for i in range(1, len(dout)):
            dout[i][column_entropy + 1] = dout[i][column_entropy] / float(maxv)

    return dout
Пример #8
0
    def make_out_data(get_data_func):
        ti = tf_idf.TF_IDF(get_data_func, begin_event_date, end_date)

        threemonth = datetime.timedelta(days=92)
        dout = ["period"]

        dout.append("tf_idf_one_to_before_one_cosine_sim")
        dout.append("tf_idf_one_to_all_cosine_sim")
        dout.append("tf_idf_one_to_all_before_cosine_sim")
        dout.append("tf_idf_one_to_all_future_cosine_sim")
        dout.append("tf_idf_one_to_first_three_mounth_sim")
        dout.append("tf_idf_one_to_peak_three_mounth_sim")
        dout.append("tf_idf_one_to_duration_sim")

        dout = [dout]

        b = begin_event_date
        e = end_date
        peak_mention = get_entity_count.peak_period_mention(ndb)
        peak_begin = 0
        peak_end = 0
        while b < e:
            if b <= peak_mention and peak_mention <= b + threemonth:
                peak_begin = b
                peak_end = b + threemonth
            b += threemonth

        b = begin_event_date
        e = end_date
        dur = duration_event.get_duration(ndb)
        while b < e:
            print "tf_idf_sheet-second-loop", b
            do = [dt.date2str(b)]

            now_tiv = ti.get_tf_idf_vector(b, b + threemonth)

            # tf_idf_one_to_before_one_cosine_sim
            if not b == begin_event_date:
                do.append(cosine_sim(
                    now_tiv, ti.get_tf_idf_vector(b - threemonth, b)))
            else:
                do.append(0)

            # tf_idf_one_to_all_cosine_sim
            do.append(cosine_sim(now_tiv, ti.tf_idf_all))

            # tf_idf_one_to_all_before_cosine_sim
            c = cosine_sim(
                now_tiv, ti.get_tf_idf_vector(begin_event_date, b))
            print "tf_idf_one_to_all_before_cosine_sim = ", c, begin_event_date, b
            print "now_tiv = "
            print tf_idf.show_tf_idf_dict(now_tiv)
            print "ti.get_tf_idf_vector(begin_event_date, b) = "
            print tf_idf.show_tf_idf_dict(ti.get_tf_idf_vector(begin_event_date, b))
            print c
            if not b == begin_event_date:
                do.append(cosine_sim(
                    now_tiv, ti.get_tf_idf_vector(begin_event_date, b)))
            else:
                do.append(0)

            # tf_idf_one_to_all_future_cosine_sim
            c = cosine_sim(
                now_tiv, ti.get_tf_idf_vector(b + threemonth, e))
            print "tf_idf_one_to_all_future_cosine_sim = ", c, b + threemonth, e
            print "now_tiv = "
            print tf_idf.show_tf_idf_dict(now_tiv)
            print "ti.get_tf_idf_vector(b + threemonth, e) = "
            print tf_idf.show_tf_idf_dict(ti.get_tf_idf_vector(begin_event_date, b))
            print c
            # print c
            do.append(cosine_sim(
                now_tiv, ti.get_tf_idf_vector(b + threemonth, e)))

            # tf_idf_one_to_first_three_mounth_sim
            do.append(cosine_sim(now_tiv, ti.get_tf_idf_vector(
                begin_event_date, begin_event_date + threemonth)))

            # tf_idf_one_to_peak_three_mounth_sim
            do.append(cosine_sim(
                now_tiv, ti.get_tf_idf_vector(peak_begin, peak_end)))

            # tf_idf_one_to_duration_sim
            do.append(cosine_sim(
                now_tiv, ti.get_tf_idf_vector(dur[0], dur[1])))

            dout.append(do)

            b += threemonth

        return dout
Пример #9
0
    # Four positional arguments: DB file, event name, and the event's begin
    # and end dates (both parsed with dt.str2date).
    ap = argparse.ArgumentParser(description='Get event info.')

    ap.add_argument('db_file_name', help='SQLite3 DB file name')

    ap.add_argument('event_name', help='event name')

    ap.add_argument('event_begin_date',
                    type=dt.str2date,
                    help='event begin date')

    ap.add_argument('event_end_date', type=dt.str2date, help='event end date')

    # Parse the command line and echo the effective settings.
    args = ap.parse_args()
    print 'DB: {0}'.format(args.db_file_name)
    print 'event name: {0}'.format(args.event_name)
    print 'event begin date: {0}'.format(dt.date2str(args.event_begin_date))
    print 'event   end date: {0}'.format(dt.date2str(args.event_end_date))
    print '-' * 40

    # Connect to DB
    dcon = db.DbHandler(args.db_file_name)

    # Period length and range: the range starts at BEGIN_DATE, so the parsed
    # event_begin_date is only echoed above and not used for b.
    threemonth = datetime.timedelta(days=92)
    # b = args.event_begin_date
    b = BEGIN_DATE
    e = args.event_end_date
    output = [[
        "period", "#mention", "#person(all)", "#person(new)",
        "#person(unique)", "#location(all)", "#location(new)",
        "#location(unique)", "#organization(all)", "#organization(new)",
        "#organization(unique)"
Пример #10
0
 def __join_text_date(self, text, date):
     '''
     Return text preceded by a line holding the date in a <date> tag.
     '''
     return '[<date>{0}</date>]\n{1}'.format(dt.date2str(date), text)
Пример #11
0
def set_peak(ndb):
    """Store the peak mention date of ndb under the 'peak_mention' info key."""
    peak_date = get_entity_count.peak_period_mention(ndb)
    db.DbHandler(ndb).insert_info('peak_mention', dt.date2str(peak_date))