# Count combined "LRB+gesture type" codes per subject, session and project,
# then print a report grouped on the 'Gesture' feature.
from ldp.gesture import LRB, GestureType
from ldp.data import Utterances, Subjects
from util.count import FeatureCounter

lrb = LRB()
gtype = GestureType()
subjects = Subjects()
utterances = Utterances()
count = FeatureCounter('Subject', 'Session', 'Project', 'Gesture')

# Subjects returned by subjects.project(2); any other subject is labelled 3.
P2 = set(subjects.project(2))

columns = 'subject, session, c_lrb, c_g_type'
# Named `where` (not `filter`) so the builtin filter() is not shadowed.
where = 'session in (1,2,3,4,5,8) and c_lrb != ""'


def pprint(args):
    """Print the strings in *args* on one line, separated by tabs."""
    # A single parenthesised argument prints identically on Python 2 and 3.
    print("\t".join(args))


for subj, sess, lrb_raw, gtype_raw in utterances(columns, where, limit=''):
    proj = 2 if subj in P2 else 3
    lrb_codes = lrb.valid_values(lrb_raw.upper())
    gtype_codes = gtype.valid_values(gtype_raw, subcodes=False)
    # Codes are paired positionally; zip() stops at the shorter sequence.
    # NOTE(review): confirm both columns always yield equally many codes.
    for lrb_code, gtype_code in zip(lrb_codes, gtype_codes):
        count(subj, sess, proj, "{0}+{1}".format(lrb_code, gtype_code))

count.print_report('Gesture')
# Search parent and child utterances for the words listed in words.txt,
# print one tab-separated line per matching utterance, then print a report
# grouped on the 'Word' feature.
import re

from nlp import Tokenizer
from ldp.data import Utterances
from util.count import FeatureCounter

count = FeatureCounter('Subject', 'Session', 'Speaker', 'Word')
utterances = Utterances()
parse = Tokenizer()  # not used below; kept so module-level names are unchanged

# One entry per line. The file is closed as soon as it has been read.
with open('words.txt') as word_file:
    words = [line.rstrip() for line in word_file]
# Drop blank lines: an empty alternative would make the pattern match the
# empty string at every word boundary and pollute the counts.
words = [word for word in words if word]

# Entries are inserted verbatim, so regex metacharacters in words.txt are
# interpreted as regex syntax rather than literal text.
if words:
    rgx = re.compile(r'\b(?:' + '|'.join(words) + r')\b')
else:
    rgx = re.compile(r'(?!)')  # empty word list: a pattern that never matches

columns = 'subject, session, row, p_utts, c_utts'
where = 'session < 8'


def pprint(*args):
    """Print *args* on one line, converted with str() and tab-separated."""
    # A single parenthesised argument prints identically on Python 2 and 3.
    print('\t'.join(str(x) for x in args))


pprint(*'SUBJ SESS SPKR ROW UTT MATCH'.split(' '))  # header

for subj, sess, row, p, c in utterances(columns, where, project=2):
    # 'P' labels the p_utts column and 'C' the c_utts column.
    for spkr, utt in [('P', p), ('C', c)]:
        matches = rgx.findall(utt)
        for word in matches:
            count(subj, sess, spkr, word)
        if matches:
            pprint(subj, sess, spkr, row, utt, ', '.join(matches))

# Blank line between the per-utterance listing and the report.
# NOTE(review): the original's bare `print` is read as a separator line here;
# confirm it was not meant as `print count.print_report('Word')`.
print('')
count.print_report('Word')
# Count tokens containing at least one word character, per subject, session
# and speaker, then print a report grouped on the 'Speaker' feature.
import re

from ldp.data import Utterances
from nlp.lex import Tokenizer
from util.count import FeatureCounter

utts = Utterances()
tokenize = Tokenizer()
count = FeatureCounter('Subject', 'Session', 'Speaker')

columns = 'subject, session, key, c_utts, p_utts'

wordchar = re.compile(r'\w')
grandmother = re.compile(r'G')
father = re.compile(r'F|@')


def _parent_label(key):
    """Return the speaker label for the p_utts column of a row.

    The father pattern is tried first, then the grandmother pattern;
    a key matching neither leaves the label as 'MOTHER'.
    """
    if father.search(key):
        return 'FATHER'
    if grandmother.search(key):
        return 'GRANDMOTHER'
    return 'MOTHER'


rows = utts(columns, where='session in ("11", "12")', limit='', project=2)
for subj, sess, key, child_utt, parent_utt in rows:
    for speaker, utterance in (('CHILD', child_utt), ('MOTHER', parent_utt)):
        if speaker == 'MOTHER':
            speaker = _parent_label(key)
        # One count per token that has at least one \w character.
        for token in tokenize(utterance):
            if wordchar.search(token):
                count(subj, sess, speaker)

count.print_report('Speaker')
# Count LRB codes per subject, session and project, then print a report
# grouped on the 'LRB' feature.
from ldp.gesture import LRB
from ldp.data import Utterances, Subjects
from util.count import FeatureCounter

lrb = LRB()
subjects = Subjects()
utterances = Utterances()
count = FeatureCounter('Subject', 'Session', 'Project', 'LRB')

# Subjects returned by subjects.project(2); any other subject is labelled 3.
P2 = set(subjects.project(2))

columns = 'subject, session, c_lrb, c_g_type'
# Named `where` (not `filter`) so the builtin filter() is not shadowed.
where = 'session in (1,2,3,4,5,8) and c_lrb != ""'


def pprint(args):
    """Print the strings in *args* on one line, separated by tabs."""
    # A single parenthesised argument prints identically on Python 2 and 3.
    print("\t".join(args))


# c_g_type is still selected, as in the original query, but is not used here.
for subj, sess, lrb_raw, _gtype_raw in utterances(columns, where, limit=''):
    proj = 2 if subj in P2 else 3
    for lrb_code in lrb.valid_values(lrb_raw.upper()):
        count(subj, sess, proj, lrb_code)

count.print_report('LRB')