예제 #1
0
def get_duration(ndb):
    dh = db.DbHandler(ndb)
    event_name = dh.get_info_dic()["event_name"]
    r = get_begin_date_wikidata(event_name)
    senti = (datetime.date(1900, 1, 1), datetime.date(1900, 1, 1) + threemonth)
    dur = senti
    if not r == datetime.date(1900, 1, 1):
        dur = (r, r + threemonth)
    b = dh.get_info_dic()['begin_date']
    b = dt.str2date(b)
    e = dh.get_info_dic()['end_date']
    e = dt.str2date(e)
    if dur == senti:
        while b < e:
            if len(dh.get_ent_all_tag(b, b + threemonth, 100000)) > 0:
                dur = (b, b + threemonth)
                break
            b += threemonth
    if dur == senti:
        dur = (datetime.date(1987, 1,
                             1), datetime.date(1987, 1, 1) + threemonth)
    dh.insert_info('event_begin_date', dt.date2str(dur[0]))
    # print dt.date2str(dur[0])
    # begin_event_date = dh.get_info_dic()['event_begin_date']
    # print begin_event_date
    dh.insert_info('event_duration',
                   dt.date2str(dur[0]) + '-' + dt.date2str(dur[1]))
    print "duration of ", ndb, " is ", dur[0], "-", dur[1]
    return dur
예제 #2
0
def get_granurality_list(ndb):
    """Build the per-period granularity sheet for HEIDEL_TIME expressions.

    Each data row holds, for one three-month window, the number of temporal
    expressions at day / month / year / present-ref / season / past-ref
    granularity, followed by the same six counts normalized by their
    column-wise maximum over all windows.
    """
    # Connect to DB and create table
    dh = db.DbHandler(ndb)
    begin_event_date = dh.get_info_dic()['event_begin_date']
    begin_event_date = dt.str2date(begin_event_date)
    begin_date = dh.get_info_dic()['begin_date']
    begin_date = dt.str2date(begin_date)
    end_date = dh.get_info_dic()['end_date']
    end_date = dt.str2date(end_date)

    def make_out_data(get_data_func):
        threemonth = datetime.timedelta(days=92)
        dout = ["period"]
        dout.append("xxxx-xx-xx")
        dout.append("xxxx-xx")
        dout.append("xxxx")
        dout.append("PRESENT_REF")
        dout.append('xxxx-SU/SP/FA/WN')
        dout.append("PAST_REF")
        # The normalized headers must mirror the raw columns in the SAME
        # order, because the normalization loop below maps raw column
        # (i - 6) onto column i.  Bug fix: the original appended dout[-i],
        # which listed the names in reverse and mislabelled every
        # normalized column.
        for i in range(-6, 0):
            dout.append("normalized_" + dout[i + 7])
        dout = [dout]

        # Count each granularity class per three-month window.
        b = begin_date
        e = end_date
        while b < e:
            p = get_data_func(b, b + threemonth)
            p = map(lambda x: x[0], p)
            d = len(filter(is_date, p))
            m = len(filter(is_mounth, p))
            y = len(filter(is_year, p))
            n = len(filter(is_present, p))
            s = len(filter(is_season, p))
            o = len(filter(is_past, p))
            dout.append([dt.date2str(b), d, m, y, n, s, o, 0, 0, 0, 0, 0, 0])

            b += threemonth
        # Fill columns 7..12 with columns 1..6 divided by each column's max.
        for i in range(-6, 0):
            maxv = 0
            for j in range(1, len(dout)):
                maxv = max(maxv, dout[j][i - 6])
            if maxv != 0:
                for j in range(1, len(dout)):
                    dout[j][i] = dout[j][i - 6] / float(maxv)

        return dout

    return make_out_data(lambda b, e: dh.get_ent(b, e, 'HEIDEL_TIME', 1000000000))
예제 #3
0
def get_sheet(ndb, tag):
    print "get_top_entities_list.py-get_sheet-get_top_entities_list"
    d1 = get_top_entities_list(ndb, tag)
    # CATION
    # d1 = list()

    print "Connect to DB and create table"
    dh = db.DbHandler(ndb)
    # begin_event_date = dh.get_info_dic()['event_begin_date']
    # begin_event_date = dt.str2date(begin_event_date)
    begin_event_date = BEGIN_DATE
    end_event_date = dh.get_info_dic()['end_date']
    end_event_date = dt.str2date(end_event_date)
    # begin_date = dh.get_info_dic()['begin_date']
    # begin_date = dt.str2date(begin_date)
    # end_date = dh.get_info_dic()['end_date']
    # end_date = dt.str2date(end_date)

    get_data_func = (lambda b, e: dh.get_ent(b, e, tag, 1000000000))

    print "get_sheet-changing_freqeuency_word"
    d2 = changing_freqeuency_word(begin_event_date, end_event_date,
                                  get_data_func)
    for d in d2:
        d1.append(d)

    d3 = score_word(begin_event_date, end_event_date, get_data_func)
    for d in d3:
        d1.append(d)

    return d1
예제 #4
0
def summarize_tf_idf_sheet(list_data):
    """Average several tf-idf sheets into one, aligning each sheet so that
    its first row at/after its own event begin date lines up with row 1.

    list_data: list of (data, dur) pairs, where data is a 2-D sheet whose
    first row is the header and first column the period string, and dur is
    that sheet's (begin, end) event duration.
    """
    data0 = list_data[0][0]
    H = len(data0)
    W = len(data0[0])

    # For each sheet, the first data row whose period is >= its begin date.
    nonzero_starts = []
    for (data, dur) in list_data:
        for i in range(1, H):
            if dur[0] <= dt.str2date(data0[i][0]):
                nonzero_starts.append(i)
                break

    retdata = []
    for i in range(H):
        l = []
        for j in range(W):
            if i == 0 or j == 0:
                # Header row and period column come from the first sheet.
                # Bug fix: the original read list_data[0][i][j], indexing
                # the (data, dur) tuple instead of the sheet itself.
                l.append(data0[i][j])
            else:
                a = 0
                n = 0
                for (k, (d, dur)) in enumerate(list_data):
                    # maneuver on index is shifting: each sheet is offset by
                    # its own start row so the durations align.  Bug fix:
                    # the original tested the shifted index but then read
                    # the unshifted d[i][j].
                    if i + nonzero_starts[k] < H:
                        a += d[i + nonzero_starts[k]][j]
                        n = n + 1
                # Guard against no contributing sheet (the original raised
                # ZeroDivisionError here).
                if n != 0:
                    a /= float(n)
                l.append(a)
        retdata.append(l)
    return retdata
예제 #5
0
def summarize_article_for_each_query(list_data):
    """Average normalized per-period article counts over several queries.

    list_data: list of (data, dur) pairs; data is a sheet whose row 0 holds
    period labels (e.g. "2001-01-01_...") and whose later rows hold
    per-entity counts; dur is that query's (begin, end) event duration.
    Returns [[period, average], ...] with each query's series shifted so
    that its own event begin date aligns with offset 0.
    """
    periods = []
    naverages = []
    list_narts = []       # one normalized count series per query
    nonzero_starts = []   # per query: first column at/after its begin date
    for i in range(1, len(list_data[0][0][0])):
        periods.append(list_data[0][0][0][i])
    for (data, dur) in list_data:
        list_narts.append([])
        nonzero = len(data[0]) - 2
        s = 0  # total count for this query, used as the normalizer

        for i in range(1, len(data[0])):
            na = 0
            # Sum this period's counts over all entity rows.
            for j in range(1, len(data)):
                na += data[j][i]
            a = data[0][i].split("_")[0]  # period label -> its begin date
            b = dur[0]
            if type(b) == datetime.datetime:
                b = b.date()  # compare date with date
            if b <= dt.str2date(a):
                nonzero = min(i, nonzero)
            list_narts[-1].append(na)
            s = s + na

        # This is a process of normalization
        # NOTE(review): raises ZeroDivisionError when a query has no
        # articles at all (s == 0) — confirm that cannot happen upstream.
        for i in range(len(list_narts[-1])):
            list_narts[-1][i] = float(list_narts[-1][i]) / float(s)
        nonzero_starts.append(nonzero)

    for i in range(len(list_narts[0])):
        s = 0
        n = 0
        for j in range(len(list_narts)):
            # maneuver on index is shifting
            if nonzero_starts[j] + i < len(list_narts[j]):
                s += list_narts[j][nonzero_starts[j] + i]
            # NOTE(review): n is incremented even when this query adds
            # nothing, so the `n == 0` branch below never triggers; the
            # sibling summarizers count only contributors — confirm which
            # average is intended.
            n += 1
        if not n == 0:
            naverages.append(float(s) / float(n))
        else:
            naverages.append(0)
    retdata = []
    for (p, n) in zip(periods, naverages):
        retdata.append([p, n])

    return retdata
예제 #6
0
def peak_period_mention(ndb):
    """Return the start date of the three-month window that contains the
    largest number of mentions in *ndb* (the first such window on ties)."""
    handler = db.DbHandler(ndb)
    cursor = BEGIN_DATE
    finish = dt.str2date(handler.get_info_dic()["end_date"])
    threemonth = datetime.timedelta(days=92)
    best_count = 0
    best_start = cursor
    while cursor < finish:
        count = handler.get_ref_num(cursor, cursor + threemonth)
        if best_count < count:
            best_count = count
            best_start = cursor
        cursor += threemonth
    return best_start
예제 #7
0
def get_entity_count(fdb):
    """Tabulate, per three-month window, the mention count plus the
    all / new / unique counts of persons, locations and organizations."""
    handler = db.DbHandler(fdb)
    begin_event_date = BEGIN_DATE
    end_event_date = dt.str2date(handler.get_info_dic()['end_date'])

    threemonth = datetime.timedelta(days=92)
    window_start = begin_event_date
    output = [[
        "period", "#mention", "#person(all)", "#person(new)",
        "#person(unique)", "#location(all)", "#location(new)",
        "#location(unique)", "#organization(all)", "#organization(new)",
        "#organization(unique)"
    ]]
    # Running sets of every entity seen so far, used to count "new" ones.
    seen_persons = set()
    seen_locations = set()
    seen_organizations = set()
    while window_start < end_event_date:
        window_end = window_start + threemonth
        mentions = handler.get_ref_num(window_start, window_end)
        persons = handler.get_ent(window_start, window_end, "PERSON", 100000000)
        locations = handler.get_ent(window_start, window_end, "LOCATION", 100000000)
        organizations = handler.get_ent(window_start, window_end,
                                        "ORGANIZATION", 100000000)
        persons_after = seen_persons.union(set(persons))
        locations_after = seen_locations.union(set(locations))
        organizations_after = seen_organizations.union(set(organizations))
        output.append([
            dt.date2str(window_start), mentions,
            len(persons),
            len(persons_after) - len(seen_persons),       # first-time persons
            len(set(persons)),
            len(locations),
            len(locations_after) - len(seen_locations),   # first-time locations
            len(set(locations)),
            len(organizations),
            len(organizations_after) - len(seen_organizations),
            len(set(organizations))
        ])
        seen_persons = persons_after
        seen_locations = locations_after
        seen_organizations = organizations_after
        window_start += threemonth
    return output
예제 #8
0
def summarize_entity_count(list_data):
    """Average several entity-count sheets (as built by get_entity_count)
    into one, aligning each sheet on its own event begin date.

    list_data: list of (data, dur) pairs; data is a sheet whose row 0 is
    the header and whose column 0 holds the period strings, dur the
    (begin, end) event duration.  Each sheet is first normalized
    column-wise IN PLACE, then the per-sheet-shifted rows are averaged
    into a copy of the first sheet's layout.
    """
    retdata = copy.deepcopy(list_data[0][0])
    # Zero every data cell; header row and period column survive the copy.
    for i in range(1, len(retdata)):
        for j in range(1, len(retdata[i])):
            retdata[i][j] = 0
    nonzero_starts = [-1 for d in list_data]

    for (k, (d, dur)) in enumerate(list_data):
        # First data row whose period is at/after this sheet's begin date.
        for i in range(1, len(d)):
            a = dur[0]
            if type(a) == datetime.datetime:
                a = a.date()  # compare date with date
            if nonzero_starts[k] == -1 and a <= dt.str2date(d[i][0]):
                nonzero_starts[k] = i

        # This is the process of normalization
        # NOTE(review): `d[i]` below reuses i left over from the loop above
        # (the last row index) — presumably just to get the row width;
        # confirm all rows share that width.
        for j in range(1, len(d[i])):
            s = 0.0
            for i in range(1, len(d)):
                s += float(d[i][j])
            for i in range(1, len(d)):
                if not s == 0.0:
                    d[i][j] = float(d[i][j]) / float(s)  # mutates caller's data

    for i in range(1, len(retdata)):
        for j in range(1, len(retdata[i])):
            s = 0
            n = 0
            for (k, (d, dur)) in enumerate(list_data):
                if nonzero_starts[k] + i < len(retdata):
                    # maneuver on index is shifting
                    s += float(d[nonzero_starts[k] + i][j])
                    n += 1
            if n != 0:
                retdata[i][j] = float(s) / float(n)

    return retdata
예제 #9
0
def summarize_top_enetities_list(list_data):
    """Average the metric columns (index 21 onward) of several top-entities
    sheets, aligning each sheet on its own event begin date.

    list_data: list of (data, dur) pairs.  Only the period column and the
    trailing similarity/entropy columns are kept; the top-10 name/count
    columns (indices 1..20) are dropped.
    """
    W = len(list_data[0][0][0])
    H = len(list_data[0][0])
    data0 = list_data[0][0]

    retdata = list()
    l = []
    # Header: period plus the metric column names (skipping Name/# pairs).
    for j in [0] + range(21, W):
        l.append(data0[0][j])
    retdata.append(l)

    # For each sheet, the first data row at/after its begin date.
    nonzero_starts = []
    for (data, dur) in list_data:
        for i in range(1, H):
            if not data0[i][0].replace("-", "").isdigit():
                # A non-date first cell marks the end of the period rows.
                # NOTE(review): if this break fires before any match,
                # nonzero_starts ends up shorter than list_data and the
                # aggregation loop below raises IndexError — confirm every
                # duration falls inside the period range.
                break
            if dur[0] <= dt.str2date(data[i][0]):
                nonzero_starts.append(i)
                break

    # Truncate H at the first non-period row.
    for i in range(1, H):
        if not data0[i][0].replace("-", "").isdigit():
            H = i
            break

    for i in range(0, H):
        l = [data0[i][0]]
        for j in range(21, W):
            a = float(0)
            n = 0
            for (k, (d, dur)) in enumerate(list_data):
                # maneuver on index is shifting
                if i + nonzero_starts[k] < H:
                    a += float(d[i + nonzero_starts[k]][j])
                    n = n + 1
            if n != 0:
                a /= float(n)
            l.append(a)
        retdata.append(l)
    return retdata
예제 #10
0
def summarize_entity_count(list_data):
    """Average several entity-count sheets into one, aligning each sheet on
    its own event begin date.

    list_data: list of (data, dur) pairs; data is a sheet whose row 0 is
    the header and whose column 0 holds the period strings, dur the
    (begin, end) event duration.  Each sheet is normalized column-wise
    IN PLACE, then the per-sheet-shifted rows are averaged into a copy of
    the first sheet's layout.  (Brought in line with the corrected sibling
    implementation of the same name.)
    """
    retdata = copy.deepcopy(list_data[0][0])
    for i in range(1, len(retdata)):
        for j in range(1, len(retdata[i])):
            retdata[i][j] = 0
    nonzero_starts = [-1 for d in list_data]

    for (k, (d, dur)) in enumerate(list_data):
        for i in range(1, len(d)):
            # Bug fix: record only the FIRST row at/after the begin date;
            # the original kept overwriting and ended on the last row.
            if nonzero_starts[k] == -1 and dur[0] <= dt.str2date(d[i][0]):
                nonzero_starts[k] = i

        # This is the process of normalization
        for j in range(1, len(d[i])):
            s = 0.0
            for i in range(1, len(d)):
                s += float(d[i][j])
            for i in range(1, len(d)):
                # Guard against an all-zero column (the original divided
                # unconditionally).
                if not s == 0.0:
                    # Bug fix: the original assigned into d[0][i][j],
                    # indexing a character of the header string; normalize
                    # the data cell itself.
                    d[i][j] = float(d[i][j]) / float(s)

    for i in range(1, len(retdata)):
        for j in range(1, len(retdata[i])):
            s = 0
            n = 0
            for (k, (d, dur)) in enumerate(list_data):
                # maneuver on index is shifting — shift ROWS by each sheet's
                # start, as the sibling does; the original shifted columns.
                if nonzero_starts[k] + i < len(retdata):
                    s += float(d[nonzero_starts[k] + i][j])
                    n += 1
            if n != 0:
                retdata[i][j] = float(s) / float(n)

    return retdata
예제 #11
0
def get_top_entities_list(ndb, tag):
    """Build the per-period top-entities sheet for entity type *tag*.

    For every three-month window the sheet records the 10 most frequent
    entities with their counts, cosine similarities of the window's entity
    distribution against several references (previous window, whole range,
    all past, all future, first window, peak-mention window, event
    duration), and the window's entropy plus a max-normalized copy of it.

    Relies on module-level BEGIN_DATE, threemonth and column_entropy, and
    on the get_entity_count / duration_event modules.
    """
    print "get_top_entitites_list-get_top_entities_list"
    # Connect to DB and create table
    dh = db.DbHandler(ndb)
    begin_event_date = BEGIN_DATE
    end_event_date = dh.get_info_dic()['end_date']
    end_event_date = dt.str2date(end_event_date)
    # end_date duplicates end_event_date; both parse the DB's 'end_date'.
    end_date = dh.get_info_dic()['end_date']
    end_date = dt.str2date(end_date)

    get_data_func = (lambda b, e: dh.get_ent(b, e, tag, 1000000000))

    # Header row: period, ten (Name, #) pairs, then the metric columns.
    dout = ["period"]
    for i in range(0, 10):
        dout.append("Name")
        dout.append("#")
    dout.append("one_to_before_one_cosine_sim")
    dout.append("one_to_all_cosine_sim")
    dout.append("one_to_all_before_cosine_sim")
    dout.append("one_to_all_future_cosine_sim")
    dout.append("one_to_first_three_mounth_sim")
    dout.append("one_to_peak_three_mounth_sim")
    dout.append("one_to_duration_sim")

    dout.append("entropy")
    dout.append("normalized_" + dout[-1])

    dout = [dout]
    # Entity -> count over the whole event range.
    all_dict = get_all_dict(begin_event_date, end_event_date, get_data_func)
    future_dict = dict()
    past_dict = dict()    # accumulated counts of all windows already seen
    before_dict = dict()  # previous window's counts
    first_dict = dict()   # counts of the first window (see NOTE below)
    peak_dict = dict()    # counts of the peak-mention window
    dur_dict = dict()     # counts accumulated over the event duration

    # First pass: fill the first/peak/duration reference dictionaries.
    print "make all_dict"
    b = begin_event_date
    e = end_date
    peak_mention = get_entity_count.peak_period_mention(ndb)
    dur = duration_event.get_duration(ndb)

    while b < e:
        p = get_data_func(b, b + threemonth)
        # (entity, count) pairs sorted by descending count.
        ds = sorted(map(lambda w: (w[0], p.count(w)), list(set(p))),
                    key=lambda x: x[1],
                    reverse=True)
        # NOTE(review): both conditions hold for EVERY window once
        # b >= begin_event_date, so first_dict accumulates all windows; the
        # peak check below suggests `b <= begin_event_date` was intended —
        # confirm.
        if begin_event_date <= b and begin_event_date < b + threemonth:
            for d in ds:
                if d[0] in first_dict.keys():
                    first_dict[d[0]] += d[1]
                else:
                    first_dict[d[0]] = d[1]
        if b < peak_mention and peak_mention <= b + threemonth:
            for d in ds:
                peak_dict[d[0]] = d[1]
        if dur[0] <= b and b + threemonth <= dur[1]:
            for d in ds:
                if d[0] in dur_dict.keys():
                    dur_dict[d[0]] += d[1]
                else:
                    dur_dict[d[0]] = d[1]
        b += threemonth
    # future_dict starts as the full range and has each window subtracted
    # as the second pass walks forward.
    future_dict = copy.deepcopy(all_dict)

    b = begin_event_date
    e = end_date
    print "get_top_entities_list second loop"
    while b < e:
        do = [dt.date2str(b)]
        p = get_data_func(b, b + threemonth)
        ds = sorted(map(lambda w: (w[0], p.count(w)), list(set(p))),
                    key=lambda x: x[1],
                    reverse=True)
        now_dict = dict()
        for d in ds:
            now_dict[d[0]] = d[1]
        # Top-10 (Name, #) pairs, padded with blanks when fewer than 10.
        for d in ds[0:10]:
            do.append(d[0])
            do.append(d[1])
        for i in range(0, 10 - len(ds[0:10])):
            do.append("")
            do.append("")
        future_dict = sub_dict(future_dict, now_dict)

        do.append(cosine_sim(now_dict, before_dict))
        do.append(cosine_sim(now_dict, all_dict))
        do.append(cosine_sim(now_dict, past_dict))
        do.append(cosine_sim(now_dict, future_dict))
        do.append(cosine_sim(now_dict, first_dict))
        do.append(cosine_sim(now_dict, peak_dict))
        do.append(cosine_sim(now_dict, dur_dict))

        do.append(entropy(now_dict))
        do.append(0)  # placeholder; normalized entropy is filled in below

        dout.append(do)

        before_dict = now_dict
        past_dict = add_dict(past_dict, now_dict)

        b += threemonth

    # Calculate normalized_entropy
    maxv = 0
    for i in range(1, len(dout)):
        maxv = max(maxv, dout[i][column_entropy])
    if maxv != 0:
        for i in range(1, len(dout)):
            dout[i][column_entropy + 1] = dout[i][column_entropy] / float(maxv)

    return dout
예제 #12
0
def make_tf_idf_sheet(ndb):
    # Connect to DB and create table
    dh = db.DbHandler(ndb)
    # begin_event_date = dh.get_info_dic()['event_begin_date']
    # begin_event_date = dt.str2date(begin_event_date)
    begin_event_date = BEGIN_DATE
    end_date = dh.get_info_dic()['end_date']
    end_date = dt.str2date(end_date)

    def make_out_data(get_data_func):
        ti = tf_idf.TF_IDF(get_data_func, begin_event_date, end_date)

        threemonth = datetime.timedelta(days=92)
        dout = ["period"]

        dout.append("tf_idf_one_to_before_one_cosine_sim")
        dout.append("tf_idf_one_to_all_cosine_sim")
        dout.append("tf_idf_one_to_all_before_cosine_sim")
        dout.append("tf_idf_one_to_all_future_cosine_sim")
        dout.append("tf_idf_one_to_first_three_mounth_sim")
        dout.append("tf_idf_one_to_peak_three_mounth_sim")
        dout.append("tf_idf_one_to_duration_sim")

        dout = [dout]

        b = begin_event_date
        e = end_date
        peak_mention = get_entity_count.peak_period_mention(ndb)
        peak_begin = 0
        peak_end = 0
        while b < e:
            if b <= peak_mention and peak_mention <= b + threemonth:
                peak_begin = b
                peak_end = b + threemonth
            b += threemonth

        b = begin_event_date
        e = end_date
        dur = duration_event.get_duration(ndb)
        while b < e:
            print "tf_idf_sheet-second-loop", b
            do = [dt.date2str(b)]

            now_tiv = ti.get_tf_idf_vector(b, b + threemonth)

            # tf_idf_one_to_before_one_cosine_sim
            if not b == begin_event_date:
                do.append(cosine_sim(
                    now_tiv, ti.get_tf_idf_vector(b - threemonth, b)))
            else:
                do.append(0)

            # tf_idf_one_to_all_cosine_sim
            do.append(cosine_sim(now_tiv, ti.tf_idf_all))

            # tf_idf_one_to_all_before_cosine_sim
            c = cosine_sim(
                now_tiv, ti.get_tf_idf_vector(begin_event_date, b))
            print "tf_idf_one_to_all_before_cosine_sim = ", c, begin_event_date, b
            print "now_tiv = "
            print tf_idf.show_tf_idf_dict(now_tiv)
            print "ti.get_tf_idf_vector(begin_event_date, b) = "
            print tf_idf.show_tf_idf_dict(ti.get_tf_idf_vector(begin_event_date, b))
            print c
            if not b == begin_event_date:
                do.append(cosine_sim(
                    now_tiv, ti.get_tf_idf_vector(begin_event_date, b)))
            else:
                do.append(0)

            # tf_idf_one_to_all_future_cosine_sim
            c = cosine_sim(
                now_tiv, ti.get_tf_idf_vector(b + threemonth, e))
            print "tf_idf_one_to_all_future_cosine_sim = ", c, b + threemonth, e
            print "now_tiv = "
            print tf_idf.show_tf_idf_dict(now_tiv)
            print "ti.get_tf_idf_vector(b + threemonth, e) = "
            print tf_idf.show_tf_idf_dict(ti.get_tf_idf_vector(begin_event_date, b))
            print c
            # print c
            do.append(cosine_sim(
                now_tiv, ti.get_tf_idf_vector(b + threemonth, e)))

            # tf_idf_one_to_first_three_mounth_sim
            do.append(cosine_sim(now_tiv, ti.get_tf_idf_vector(
                begin_event_date, begin_event_date + threemonth)))

            # tf_idf_one_to_peak_three_mounth_sim
            do.append(cosine_sim(
                now_tiv, ti.get_tf_idf_vector(peak_begin, peak_end)))

            # tf_idf_one_to_duration_sim
            do.append(cosine_sim(
                now_tiv, ti.get_tf_idf_vector(dur[0], dur[1])))

            dout.append(do)

            b += threemonth

        return dout

    data = make_out_data(lambda b, e: dh.get_ent_all_tag(b, e, 1000000000))
    return data
예제 #13
0
                    '--lim-word',
                    type=int,
                    default=LIM_WORD,
                    help='# of top words')
    ap.add_argument('-s',
                    '--lim-hscore',
                    type=int,
                    default=LIM_HSCORE,
                    help='# of high score mentions')
    ap.add_argument('-e', '--top-word', help='create "top N X sheet"')
    args = ap.parse_args()

    # Connect to DB and create table
    dh = db.DbHandler(args.db_file_name)
    event_name = dh.get_info_dic()["event_name"].replace(" ", "")
    event_begin_date = dt.str2date(dh.get_info_dic()["event_begin_date"])
    # end_date = dt.str2date(dh.get_info_dic()["end_date"])

    print 'DB: {0}'.format(args.db_file_name)
    if not args.xlsx_file_name:
        args.xlsx_file_name = args.db_file_name.split('.')[0] + '.xlsx'
    print 'Excel workbook: {0}'.format(args.xlsx_file_name)
    print '# of months / period: {0}'.format(args.month_delta)
    print '# of top word: {0}'.format(args.lim_word)
    print '# of high score mentions: {0}'.format(args.lim_hscore)
    print '-' * 40

    # Create DB handler and xlsx workbook
    xh = xlsx.XlsxHandler(args.db_file_name, args.xlsx_file_name,
                          args.month_delta)
예제 #14
0
    ap.add_argument('-u', '--solr-url', default=KAREN, help='Solr URL')

    args = ap.parse_args()
    print 'DB: {0}'.format(args.db_file_name)
    print 'Solr URL: {0}'.format(args.solr_url)
    print '-' * 40

    # Connect to DB file
    dh = db.DbHandler(args.db_file_name)

    # Get info
    print 'Getting info'
    info_dic = dh.get_info_dic()
    bdate_s = info_dic['begin_date']
    edate_s = info_dic['end_date']
    bdate = dt.str2date(bdate_s)
    edate = dt.str2date(edate_s)
    ev_bdate_s = info_dic['event_begin_date']
    ev_edate_s = info_dic['event_end_date']
    ev_bdate = dt.str2date(ev_bdate_s)
    ev_edate = dt.str2date(ev_edate_s)
    print 'article begin date: {0}'.format(bdate_s)
    print 'article   end date: {0}'.format(edate_s)
    print 'event   begin date: {0}'.format(ev_bdate_s)
    print 'event     end date: {0}'.format(ev_bdate_s)
    print '-' * 40

    # Create Solr NITF handler
    sh = slr.SolrNitfHandler(args.solr_url, bdate, edate)

    # Create tables
예제 #15
0
        # print p
        words = list()
        words = zip(p["ner"], p["tokens"], p["ner"])
        stop = stopwords.words("english")
        words = filter(lambda x: x[1] not in stop, words)
        words = map(lambda x: (x[0], x[1].lower(), x[2]), words)
        # I cannot understand what is most suitable in above line.
        ws = list()
        w = ("", "", "")
        for v in words:
            if v[0] != 'O' and v[0] == w[0]:
                w = (w[0], w[1] + " " + v[1], w[2])
            else:
                ws.append(w)
                w = v
        if w[0] != "":
            ws.append(w)
        words = ws

        return words[1:]


if __name__ == '__main__':
    # Smoke test: run the CoreNLP-backed word extractor on two sample
    # sentences (the second contains a malformed date string) and print
    # the result of the second call.
    kch = kawata_corenlp_handler()
    print 'kch = kawata_corenlp_handler()'
    p = kch.get_words(u"I lived in New York in 2016",
                      dt.str2date(u"2016-02-12"))
    p = kch.get_words(u"I lived in New York in 206/09/18",
                      dt.str2date(u"2016-02-12"))
    print p