def get_language_dicts_quals_turkers(path, qual_cutoff=None, strict=False, filter_list=None, match=True):
	all_dicts = dict()
	tmap = turker_map()
	hitmap = hit_map()
	langids, langcodes = dat.lang_map()
        hitlangs = dat.hits_language()
	quals = qual_map()
	tquals = turker_qual_map()
	for assign in open(path).readlines():
		comps = assign.strip().split('\t')
		aid = comps[0]
		if(aid == '' or aid not in tmap):
			continue
		#if match, filter_list is list of assignments to include, otherwise, filter_list is list of assignments to skip
		if(match):
			list_pass = not(filter_list == None) and (aid not in filter_list)
		else:
			list_pass = not(filter_list == None) and (aid in filter_list)
		if(list_pass):
			continue
		#if strict, assignments must be greater than qual_cutoff to pass, otherwise, assignments can be greater or equal to qual_cutoff
		if(strict):
			qual_pass = not(qual_cutoff == None) and (quals[aid] == 'N/A' or quals[aid] <= qual_cutoff)
		else:
			qual_pass = not(qual_cutoff == None) and (quals[aid] == 'N/A' or quals[aid] < qual_cutoff)
		if(qual_pass):
			print 'below cutoff', aid, quals[aid]
			continue
		alang =  langids[hitlangs[hitmap[aid]]]
		if not(alang in all_dicts):
			all_dicts[alang] = {'num':0, 'denom':0}
		all_dicts[alang]['num'] += tquals[tmap[aid]]
		all_dicts[alang]['denom'] += 1
	return all_dicts
def get_language_dicts_quals(path,
                             qual_cutoff=None,
                             strict=False,
                             filter_list=None):
    all_dicts = dict()
    hitmap = hit_map()
    langids, langcodes = dat.lang_map()
    hitlangs = dat.hits_language()
    quals = qual_map()
    for assign in open(path).readlines():
        comps = assign.strip().split('\t')
        aid = comps[0]
        if (aid == ''):
            continue
        if (not (filter_list == None) and (aid not in filter_list)):
            continue
        if (not (qual_cutoff == None) and strict
                and (quals[aid] == 'N/A' or quals[aid] <= qual_cutoff)):
            print 'below cutoff', aid, quals[aid]
            continue
        if (not (qual_cutoff == None) and not (strict)
                and (quals[aid] == 'N/A' or quals[aid] < qual_cutoff)):
            print 'below cutoff', aid, quals[aid]
            continue
        alang = langids[hitlangs[hitmap[aid]]]
        if not (alang in all_dicts):
            all_dicts[alang] = {'num': 0, 'denom': 0}
        all_dicts[alang]['num'] += quals[aid]
        all_dicts[alang]['denom'] += 1
    return all_dicts
def read_valid_clpairs():
    """Read the country/language-pairs CSV and return, per language, the
    list of countries named by at least two annotator rows (counting both
    the primary country and the extra '|'-separated countries)."""
    codes, words = dat.lang_map()
    cmap = dat.code_map()
    counts = dict()
    for row in csv.DictReader(open(CLPAIRS)):
        lang = words[row['Input.language'].lower()]
        per_lang = counts.setdefault(lang, dict())
        # The primary country always counts once.
        primary = cmap[row['Answer.primary_country'].lower()]
        per_lang[primary] = per_lang.get(primary, 0) + 1
        # Additional countries are '|'-separated; empty entries are skipped.
        for code in row['Answer.countries'].split('|'):
            if code == '':
                continue
            ctry = cmap[code.lower()]
            per_lang[ctry] = per_lang.get(ctry, 0) + 1
    # Keep only countries seen at least twice for a language.
    ret = dict()
    for lang in counts:
        ret[lang] = [ctry for ctry in counts[lang] if counts[lang][ctry] >= 2]
    return ret
def get_language_dicts(path,
                       qual_cutoff=None,
                       strict=False,
                       filter_list=None,
                       match=True):
    all_dicts = dict()
    tmap = turker_map()
    hitmap = hit_map()
    langids, langcodes = dat.lang_map()
    hitlangs = dat.hits_language()
    quals = qual_map()
    tquals = turker_qual_map()
    filtered = 0
    for cnt, assign in enumerate(open(path).readlines()):
        comps = assign.strip().split('\t')
        aid = comps[0]
        if cnt % 1000 == 0:
            print aid
        if (aid == ''):  # or aid not in tmap):
            continue
        #if match, filter_list is list of assignments to include, otherwise, filter_list is list of assignments to skip
        if (match):
            list_pass = not (filter_list == None) and (aid not in filter_list)
        else:
            list_pass = not (filter_list == None) and (aid in filter_list)
        if (list_pass):
            filtered += 1
            continue
        #if strict, assignments must be greater than qual_cutoff to pass, otherwise, can be greater or equal to qual_cutoff
        if (strict):
            qual_pass = not (qual_cutoff == None) and (
                quals[aid] == 'N/A' or quals[aid] <= qual_cutoff)
        else:
            qual_pass = not (qual_cutoff == None) and (
                quals[aid] == 'N/A' or quals[aid] < qual_cutoff)
        if (qual_pass):
            print 'below cutoff', aid, quals[aid]
            continue
        alang = langids[hitlangs[hitmap[aid]]]
        if not alang in all_dicts:
            all_dicts[alang] = dict()
        for pair in comps[1:]:
            try:
                word = pair[:pair.index(':')]
                trans = pair[pair.index(':') + 1:]
            except ValueError:
                print word
                continue
            if not (trans.strip() == ''):
                if not word in all_dicts[alang]:
                    all_dicts[alang][word] = [trans.strip().lower()]
                else:
                    all_dicts[alang][word].append(trans.strip().lower())
    print 'FILTERED %d ASSIGNMENTS' % filtered
    return all_dicts
# --- Example #5 boundary (scraped-snippet separator; original text: "示例#5" / "0") ---
def write_all_syns():
    """Group every (original word, translated word) pair by language and
    hand the grouping to write_control_dicts for output.

    The original body allocated an unused `data = {}` local; removed.
    """
    words = word_map(get_lang=True)
    syns = dict()
    numlangmap, langmap = dat.lang_map()
    for word_id in words:
        word, orig, lang = words[word_id]
        lang = numlangmap[lang]  # numeric language id -> language name
        syns.setdefault(lang, list()).append((orig, word))
    write_control_dicts(syns)
# --- Example #6 boundary (scraped-snippet separator; original text: "示例#6" / "0") ---
def get_good_and_bad_translations(path):
    controls = dict()
    numlangmap, langmap = dat.lang_map()
    gooda = get_syns_quality_by_assign('%s/syn_hits_results' % RAW_DIR)
    good = get_syns_quality_by_turker('%s/syn_hits_results' % RAW_DIR, gooda)
    syns = read_all_syns(filter_list=good, exact_match_only=False)
    words = word_map(get_lang=True)
    data = {}
    for line in csv.DictReader(open(path)):
        assign = line['assignment_id']
        translation = line['translation'].strip().lower()
        word_id = line['word_id']
        if (assign not in data):
            data[assign] = {'total': 'N/A', 'syns': 'N/A'}
        if word_id in words:
            word, orig, lang = words[word_id]
            word = word.strip().lower()
            lang = numlangmap[lang]
            if word in syns:
                if lang not in controls:
                    controls[lang] = dict()
                if orig not in controls[lang]:
                    controls[lang][orig] = {'pos': set(), 'neg': set()}
                if translation in syns[word]:
                    controls[lang][orig]['pos'].add(translation)
                else:
                    controls[lang][orig]['neg'].add(translation)
            else:
                print 'Could not find', word, 'in synonym dictionary. Skipping.'
        else:
            print 'Could not find', word_id, 'in word dictionary. Skipping'

    pos = dict()
    neg = dict()
    for lang in controls:
        pos[lang] = [(orig, string.join(controls[lang][orig]['pos'], ','))
                     for orig in controls[lang]
                     if len(controls[lang][orig]['pos']) > 0]
        neg[lang] = [(orig, string.join(controls[lang][orig]['neg'], ','))
                     for orig in controls[lang]
                     if len(controls[lang][orig]['neg']) > 0]
    write_control_dicts(pos, file_prefix='new-output/poscontrols/dictionary')
    write_control_dicts(neg, file_prefix='new-output/negcontrols/dictionary')
# --- Example #7 boundary (scraped-snippet separator; original text: "示例#7" / "0") ---
def get_language_dicts(path, filter_list=None):
    """Return {language: (earliest start, latest end)} over the surviving
    assignments in the tab-delimited file at *path*.

    filter_list, when given, is a whitelist of assignment ids.
    """
    spans = dict()
    hitmap = dictionaries.hit_map()
    langids, langcodes = dat.lang_map()
    hitlangs = dat.hits_language()
    timemap = time_map()
    for row in csv.DictReader(open(path), delimiter='\t'):
        aid = row['id']
        if aid == '':
            continue
        if filter_list is not None and aid not in filter_list:
            continue
        lang = langids[hitlangs[hitmap[aid]]]
        tstart, tend = timemap[aid]
        if lang not in spans:
            spans[lang] = (tstart, tend)
        else:
            lo, hi = spans[lang]
            spans[lang] = (min(lo, tstart), max(hi, tend))
    return spans
# --- Example #8 boundary (scraped-snippet separator; original text: "示例#8" / "0") ---
def format_for_time_series(data):
    print "More formatting data"
    hitmap = dictionaries.hit_map()
    langids, langcodes = dat.lang_map()
    hitlangs = dat.hits_language()
    all_times = dict()
    for aid, complete, start, total in data:
        lang = langids[hitlangs[hitmap[aid]]]
        if lang not in all_times:
            all_times[lang] = list()
        if lang == 'ru' or lang == 'ur':
            all_times[lang].append(total + datetime.timedelta(days=2))
        else:
            all_times[lang].append(total)
    try:
        all_times.pop('en')
    except KeyError:
        pass
    return all_times
def count_turkers_verbose(path,
                          qual_cutoff=None,
                          strict=False,
                          filter_list=None,
                          match=True):
    all_dicts = dict()
    hitmap = hit_map()
    langids, langcodes = dat.lang_map()
    hitlangs = dat.hits_language()
    tmap = turker_map()
    quals = qual_map()
    assignmap = assign_dict()
    for assign in open(path).readlines():
        comps = assign.strip().split('\t')
        aid = comps[0]
        if aid == '' or aid not in assignmap:
            continue
#if match, filter_list is list of assignments to include, otherwise, filter_list is list of assignments to skip
        if (match):
            list_pass = not (filter_list == None) and (aid not in filter_list)
        else:
            list_pass = not (filter_list == None) and (aid in filter_list)
        if (list_pass):
            continue
#if strict, assignments must be greater than qual_cutoff to pass, otherwise, assignments can be greater or equal to qual_cutoff
        if (strict):
            qual_pass = not (qual_cutoff == None) and (
                quals[aid] == 'N/A' or quals[aid] <= qual_cutoff)
        else:
            qual_pass = not (qual_cutoff == None) and (
                quals[aid] == 'N/A' or quals[aid] < qual_cutoff)
        if qual_pass:
            print 'below cutoff', aid, quals[aid]
            continue
        alang = langids[hitlangs[hitmap[aid]]]
        country = assignmap[aid]['country']
        if not (alang in all_dicts):
            all_dicts[alang] = dict()
        if country not in all_dicts[alang]:
            all_dicts[alang][country] = list()
        all_dicts[alang][country].append(tmap[aid])
    return all_dicts
# --- Example #10 boundary (scraped-snippet separator; original text: "示例#10" / "0") ---
def get_goog_match_by_assign(path):
    """Score each assignment by how often its translations exactly match
    Google's translation for the same word.

    Intended return: {assignment_id: (total, matches, match_ratio)} where
    ratio is 'N/A' when the assignment saw no scorable words.

    NOTE(review): the pickle.dump + exit(0) two lines below is debugging
    leftover that terminates the whole PROCESS, making everything after it
    dead code -- confirm intent before removing.
    """
    matches = get_goog_translations()
    pickle.dump(matches, open('googmatch-old.pkl', 'w'))
    exit(0)
    words = word_map()
    data = {}
    numlangmap, langmap = dat.lang_map()
    hitlangs = dat.hits_language()
    hits = dictionaries.hit_map()
    for line in csv.DictReader(open(path)):
        assign = line['assignment_id']
        translation = line['translation']
        word_id = line['word_id']
        # 'N/A' marks assignments seen but not yet scored on any word.
        if (assign not in data):
            data[assign] = {'total': 'N/A', 'syns': 'N/A'}
        # goog_langs is presumably a module-level set of Google-supported
        # language codes defined elsewhere in this file -- verify.
        assign_lang = numlangmap[hitlangs[hits[assign]]]
        if (assign_lang not in goog_langs):
            print "Skipping lang", assign_lang, "not supported by google"
            continue
        if word_id in matches:
            # First scorable word flips the counters from 'N/A' to numeric.
            if data[assign]['total'] == 'N/A':
                data[assign] = {'total': 0, 'syns': 0}
            if translation.strip().lower() == matches[word_id].strip().lower():
                data[assign]['syns'] += 1
            data[assign]['total'] += 1
#                else:
#                       print 'Could not find', assign, word_id, 'in google dictionary. Skipping'
    # Fold counts into (total, matches, ratio); ratio is 0 for zero totals
    # and 'N/A' when nothing was scorable.
    ret = dict()
    for a in data:
        if data[a]['total'] == 0:
            ret[a] = (data[a]['total'], data[a]['syns'], 0)
        elif data[a]['total'] == 'N/A':
            print a, "is N/A"
            ret[a] = (data[a]['total'], data[a]['syns'], 'N/A')
        else:
            ret[a] = (data[a]['total'], data[a]['syns'],
                      float(data[a]['syns']) / data[a]['total'])
    return ret