Пример #1
0
# Gather every (key, annotation) pair yielded by g_n() into a dict, while
# rewriting a single progress line on stdout for each pair received.
all_anno = {}
for i, (n, anno) in enumerate(g_n()):
    all_anno[n] = anno
    progress = 'Loading ({0}/{1}) : {2}\r'.format(i + 1, oracle_nc, n)
    print(progress, end='')
    # Nothing ends the line, so push the text out explicitly.
    sys.stdout.flush()
print('\nAnnotations loaded')

#~ for a in all_anno.values():
#~ print(len(a.units))
#~ print(len(set(u.id for u in a.units if u.type=='Commitment')))

#~ sys.exit()

### Train on data !
# Load the table stored at `ffinal` and run an evaluation on it.
c_t = cs.TabData(ffinal)
# NOTE(review): presumably marks the 'is_commitment' column as the class to
# predict -- confirm against classify.TabData.new_class.
c_t.new_class('is_commitment')
# Disabled alternative: same trainer with an explicit 'logreg' learner.
#~ end_c = cs.Trainer(c_t, 10, 'dialogue', learner='logreg')
# NOTE(review): 10 and 'dialogue' are positional arguments whose meaning is
# not visible here (likely a fold count and a grouping column) -- confirm
# against classify.Trainer.
end_c = cs.Trainer(c_t, 10, 'dialogue')
end_c.evaluate()

### Create reports...
# Sort the misclassified rows into two lists: fnl for false negatives and
# fpl for false positives. Values are compared as the strings 'True'/'False'.
fnl, fpl = [], []
for pred, row in end_c.pred_rows():
    outcome = (pred.value, row.getclass().value)
    if outcome == ('False', 'True'):
        # False negative case
        fnl.append(row)
    elif outcome == ('True', 'False'):
        # False positive case
        fpl.append(row)
Пример #2
0
# Build the key -> annotation mapping from the pairs yielded by g_n(),
# showing a running "Loading (current/total) : key" counter on one line.
all_anno = {}
for i, (n, anno) in enumerate(g_n()):
    all_anno[n] = anno
    status = 'Loading ({0}/{1}) : {2}\r'.format(i + 1, oracle_nc, n)
    print(status, end='')
    # The status line has no trailing newline; flush so it shows up now.
    sys.stdout.flush()
print('\nAnnotations loaded')


def nsplit(id):
    """Split an underscore-separated identifier into a (head, tail) pair.

    ``head`` joins the first two ``_``-separated fields and ``tail`` joins
    the last two. When the identifier has fewer than four fields the two
    parts overlap (e.g. ``'a_b_c'`` gives ``('a_b', 'b_c')``).
    """
    fields = id.split('_')
    head = '_'.join(fields[:2])
    tail = '_'.join(fields[-2:])
    return (head, tail)


# Group question ids by answer id: for each row of the table at `fxqap`
# whose question head is a key of all_anno, append the question's
# (head, tail) pair under the answer's (head, tail) pair.
ques = defaultdict(list)
x_t = cs.TabData(fxqap)
for row in x_t:
    gi, si = nsplit(row['q_id'].value)
    if gi not in all_anno:
        # No annotation loaded for this question head: ignore the row.
        continue
    answer_key = nsplit(row['a_id'].value)
    ques[answer_key].append((gi, si))

#~ print(ques)
#~ sys.exit()
# Load the table stored at `ffinal` and run an evaluation on it.
c_t = cs.TabData(ffinal)
# NOTE(review): presumably marks the 'is_commitment' column as the class to
# predict -- confirm against classify.TabData.new_class.
c_t.new_class('is_commitment')
# Disabled alternative: same trainer with an explicit 'logreg' learner.
#~ end_c = cs.Trainer(c_t, 10, 'dialogue', learner='logreg')
# NOTE(review): meaning of the positional arguments 10 and 'dialogue' is not
# visible here -- confirm against classify.Trainer.
end_c = cs.Trainer(c_t, 10, 'dialogue')
end_c.evaluate()

# Two counters starting at zero; the code that updates them lies beyond the
# visible part of this script.
tot_ok = 0
tot_in = 0
Пример #3
0
                        break

    # Without counter
    #~ all_anno = dict(g_n())

    # With counter
    # Same result as dict(g_n()), but filled pair by pair so a progress
    # counter can be shown on a single, continuously rewritten line.
    all_anno = {}
    for i, (n, anno) in enumerate(g_n()):
        all_anno[n] = anno
        counter = 'Loading ({0}/{1}) : {2}\r'.format(i + 1, oracle_nc, n)
        print(counter, end='')
        # Output has no newline yet, so flush it by hand.
        sys.stdout.flush()
    print('\nAnnotations loaded')

    # Load the working table from `fnsing`; the line below is the disabled
    # alternative that reads `fsing` instead.
    #~ e_t = cs.TabData(fsing)
    e_t = cs.TabData(fnsing)

    def gsname(row):
        """Return the game section name encoded in the row's ``id``.

        The name is made of the first two ``_``-separated fields of
        ``row['id'].value``, joined back with ``_``.
        """
        head_fields = row['id'].value.split('_', 2)[:2]
        return '_'.join(head_fields)

    # Distinct game section names present in the table (computed before
    # the row selection below).
    snames = {gsname(entry) for entry in e_t}

    def _has_annotations(entry):
        """Tell whether the row's game section is a key of all_anno."""
        return gsname(entry) in all_anno

    # Only keep rows with Commitment annotations
    e_t.sel_row_by(_has_annotations)

    ######### Stat
    #~ count = defaultdict(lambda:0)
    #~ for n, a in all_anno.items():
    #~ if n not in snames:
    #~ continue
    #~ print(n)
Пример #4
0
# Quicker script with already-built data !

from __future__ import print_function
import sys
import os
import annodata as ad
import classify as cs
from collections import defaultdict

# Input / output tables (absolute paths on the original author's machine).
# fcomm and fturns are not referenced in the visible part of this script.
fcomm = '/home/arthur/These/Data/socl-season1.custom-edus.tab'
fmerge = '/home/arthur/These/Data/socl-season1.merged.tab'
fturns = '/home/arthur/These/Data/socl-season1.turns2.tab'
fqap = '/home/arthur/These/Data/socl-season1.qap.tab'
ffinal = '/home/arthur/These/Data/socl-season1.final.tab'

# Build the final table: load the merged table, merge the table from `fqap`
# into it, fuse its rows on 'turn_id' and save the result to `ffinal`.
# NOTE(review): exact merge / fuse semantics live in classify.TabData --
# confirm there before changing the order of these calls.
c_t = cs.TabData(fmerge)
pc_t = cs.TabData(fqap)
c_t.merge(pc_t)
c_t.fuse_rows('turn_id')
c_t.save(ffinal)

# Report how many rows are flagged as commitments. The former expression,
# len(set(<bool> for row in c_t)), could only ever print 0, 1 or 2 because
# a set of booleans holds at most two distinct values.
print(sum(1 for row in c_t if row['is_commitment'].value == 'True'))
#~ sys.exit()

#~ ddd = defaultdict(list)
#~ for row in c_t:
#~ ddd[row['dialogue'].value].append(row['turn_id'].value)
#~ count = defaultdict(int)
#~ for k,v in ddd.items():
#~ count[len(v)] += 1
#~
Пример #5
0
    'position_in_dialogue_DU1', 'position_in_game_DU1',
    'edu_position_in_turn_DU1', 'has_correction_star_DU1',
    'ends_with_bang_DU1', 'ends_with_qmark_DU1', 'has_FOR_np_DU1',
    'is_question_DU1', 'num_tokens_DU2', 'has_player_name_exact_DU2',
    'has_player_name_fuzzy_DU2', 'has_emoticons_DU2', 'is_emoticon_only_DU2',
    'speaker_started_the_dialogue_DU2',
    'speaker_already_spoken_in_dialogue_DU2',
    'speakers_first_turn_in_dialogue_DU2', 'turn_follows_gap_DU2',
    'position_in_dialogue_DU2', 'position_in_game_DU2',
    'edu_position_in_turn_DU2', 'has_correction_star_DU2',
    'ends_with_bang_DU2', 'ends_with_qmark_DU2', 'has_FOR_np_DU2',
    'is_question_DU2'
]
# Non-feature columns passed to sel_col next to the feature list.
meta_sel = ['dialogue', 'id_DU1', 'id_DU2']

# Load the table stored at `fpairs`.
t_r = cs.TabData(fpairs)
# Disabled row selection on CLASS == 'UNRELATED' (with negate=1).
#~ t_r.sel_row({'CLASS':'UNRELATED'}, negate=1)
# NOTE(review): presumably restricts the table to the feature columns, the
# meta columns and the 'CLASS' column -- confirm against TabData.sel_col.
t_r.sel_col(feat_sel, meta_sel, 'CLASS')
# Disabled dump of the reduced table.
#~ t_r.save('res/cut.tab')

# Disabled alternative: trainer without an explicit learner.
#~ c_r = cs.Trainer(t_r, grouper='dialogue')
c_r = cs.Trainer(t_r, learner='logreg', grouper='dialogue')

c_r.evaluate()
# The script stops here: every statement after this call is unreachable
# until the sys.exit() line is removed or commented out.
sys.exit()
with open('../res/gpred.tab', 'w') as f:
    for pred, row in c_r.pred_rows():
        line = '\t'.join([
            k.value
            for k in (pred, row.getclass(), row['id_DU1'], row['id_DU2'])
        ])
Пример #6
0
    'speaker_already_spoken_in_dialogue_DU2',
    'speakers_first_turn_in_dialogue_DU2', 'turn_follows_gap_DU2',
    'position_in_dialogue_DU2', 'position_in_game_DU2',
    'edu_position_in_turn_DU2', 'has_correction_star_DU2',
    'ends_with_bang_DU2', 'ends_with_qmark_DU2', 'lemma_subject_DU2',
    'has_FOR_np_DU2', 'is_question_DU2'
]
# Non-feature column kept next to the features in the master table.
meta_sel = ['dialogue']

# The step size is the first command-line argument when given, else 10.
step_size = int(sys.argv[1]) if len(sys.argv) >= 2 else 10

# Step 1 : master data table
# (disabled; when enabled it writes the selected columns of the table at
# `fpairs` to `fsrc`)
if False:
    t_full = cs.TabData(fpairs)
    t_full.sel_col(feat_sel, meta_sel, 'CLASS')
    t_full.save(fsrc)

# Step 2 : set of all dialogues
# Distinct 'dialogue' values of the master table, in random order.
t_master = cs.TabData(fsrc)
dials = list({entry['dialogue'].value for entry in t_master})
random.shuffle(dials)
print('Data loaded')

# Step 3 : the curve loop
# One step per full group of `step_size` dialogues.
all_scores = []
n = len(dials)
n_steps = int(n / step_size)
for m in range(n_steps):
    t_size = step_size * (m + 1)
Пример #7
0
# Merging custom and attelo sources
# For great justice

# Python 2

import classify as cs

# Table locations: custom table (input), merged table (output) and the
# relations CSV (input).
fcustom = '../res/custom.tab'
fmerge = '../res/merge.tab'
frel = '/home/arthur/These/Master/Stac/data/SNAPSHOTS/2014-06-04/socl-season1.relations.csv'

# Load both inputs, custom table first.
t_c = cs.TabData(fcustom)
t_r = cs.TabData(frel)
# Merge the relations table into the custom one, then save the result.
# NOTE(review): ('id_DU1', 'id_DU2') is presumably the pair of key columns
# used to match rows -- confirm against TabData.newmerge.
t_c.newmerge(t_r, ('id_DU1', 'id_DU2'))
t_c.save(fmerge)