def get_language_dicts_quals_turkers(path, qual_cutoff=None, strict=False, filter_list=None, match=True): all_dicts = dict() tmap = turker_map() hitmap = hit_map() langids, langcodes = dat.lang_map() hitlangs = dat.hits_language() quals = qual_map() tquals = turker_qual_map() for assign in open(path).readlines(): comps = assign.strip().split('\t') aid = comps[0] if(aid == '' or aid not in tmap): continue #if match, filter_list is list of assignments to include, otherwise, filter_list is list of assignments to skip if(match): list_pass = not(filter_list == None) and (aid not in filter_list) else: list_pass = not(filter_list == None) and (aid in filter_list) if(list_pass): continue #if strict, assignments must be greater than qual_cutoff to pass, otherwise, assignments can be greater or equal to qual_cutoff if(strict): qual_pass = not(qual_cutoff == None) and (quals[aid] == 'N/A' or quals[aid] <= qual_cutoff) else: qual_pass = not(qual_cutoff == None) and (quals[aid] == 'N/A' or quals[aid] < qual_cutoff) if(qual_pass): print 'below cutoff', aid, quals[aid] continue alang = langids[hitlangs[hitmap[aid]]] if not(alang in all_dicts): all_dicts[alang] = {'num':0, 'denom':0} all_dicts[alang]['num'] += tquals[tmap[aid]] all_dicts[alang]['denom'] += 1 return all_dicts
def get_language_dicts_quals(path, qual_cutoff=None, strict=False, filter_list=None): all_dicts = dict() hitmap = hit_map() langids, langcodes = dat.lang_map() hitlangs = dat.hits_language() quals = qual_map() for assign in open(path).readlines(): comps = assign.strip().split('\t') aid = comps[0] if (aid == ''): continue if (not (filter_list == None) and (aid not in filter_list)): continue if (not (qual_cutoff == None) and strict and (quals[aid] == 'N/A' or quals[aid] <= qual_cutoff)): print 'below cutoff', aid, quals[aid] continue if (not (qual_cutoff == None) and not (strict) and (quals[aid] == 'N/A' or quals[aid] < qual_cutoff)): print 'below cutoff', aid, quals[aid] continue alang = langids[hitlangs[hitmap[aid]]] if not (alang in all_dicts): all_dicts[alang] = {'num': 0, 'denom': 0} all_dicts[alang]['num'] += quals[aid] all_dicts[alang]['denom'] += 1 return all_dicts
def read_valid_clpairs():
    """Read the country/language pairs CSV (CLPAIRS) and return, per language,
    the countries that received at least two votes.

    Each CSV row contributes one vote for the primary country and one vote for
    every country in the '|'-separated 'Answer.countries' field.
    Returns {language: [country_code, ...]} keeping only countries with >= 2 votes.
    """
    codes, words = dat.lang_map()
    cmap = dat.code_map()
    valid_pairs = dict()
    # 'with' closes the CSV file (the original leaked the handle)
    with open(CLPAIRS) as clfile:
        for line in csv.DictReader(clfile):
            lang = words[line['Input.language'].lower()]
            if lang not in valid_pairs:
                valid_pairs[lang] = dict()
            # the primary country counts as one vote
            main = cmap[line['Answer.primary_country'].lower()]
            if main not in valid_pairs[lang]:
                valid_pairs[lang][main] = 0
            valid_pairs[lang][main] += 1
            # each additional listed country also counts as one vote
            for ctry in [cmap[c.lower()] for c in line['Answer.countries'].split('|') if not c == '']:
                if ctry not in valid_pairs[lang]:
                    valid_pairs[lang][ctry] = 0
                valid_pairs[lang][ctry] += 1
    ret = dict()
    for p in valid_pairs:
        ret[p] = list()
        for pp in valid_pairs[p]:
            if valid_pairs[p][pp] >= 2:
                ret[p].append(pp)
    return ret
def get_language_dicts(path, qual_cutoff=None, strict=False, filter_list=None, match=True): all_dicts = dict() tmap = turker_map() hitmap = hit_map() langids, langcodes = dat.lang_map() hitlangs = dat.hits_language() quals = qual_map() tquals = turker_qual_map() filtered = 0 for cnt, assign in enumerate(open(path).readlines()): comps = assign.strip().split('\t') aid = comps[0] if cnt % 1000 == 0: print aid if (aid == ''): # or aid not in tmap): continue #if match, filter_list is list of assignments to include, otherwise, filter_list is list of assignments to skip if (match): list_pass = not (filter_list == None) and (aid not in filter_list) else: list_pass = not (filter_list == None) and (aid in filter_list) if (list_pass): filtered += 1 continue #if strict, assignments must be greater than qual_cutoff to pass, otherwise, can be greater or equal to qual_cutoff if (strict): qual_pass = not (qual_cutoff == None) and ( quals[aid] == 'N/A' or quals[aid] <= qual_cutoff) else: qual_pass = not (qual_cutoff == None) and ( quals[aid] == 'N/A' or quals[aid] < qual_cutoff) if (qual_pass): print 'below cutoff', aid, quals[aid] continue alang = langids[hitlangs[hitmap[aid]]] if not alang in all_dicts: all_dicts[alang] = dict() for pair in comps[1:]: try: word = pair[:pair.index(':')] trans = pair[pair.index(':') + 1:] except ValueError: print word continue if not (trans.strip() == ''): if not word in all_dicts[alang]: all_dicts[alang][word] = [trans.strip().lower()] else: all_dicts[alang][word].append(trans.strip().lower()) print 'FILTERED %d ASSIGNMENTS' % filtered return all_dicts
def write_all_syns():
    """Group every (original, translated) word pair by language and write the
    groups out via write_control_dicts."""
    words = word_map(get_lang=True)
    syns = dict()
    numlangmap, langmap = dat.lang_map()
    for word_id in words:
        word, orig, lang = words[word_id]
        # lang comes back as a numeric id; map it to the language name
        lang = numlangmap[lang]
        if lang not in syns:
            syns[lang] = list()
        syns[lang].append((orig, word))
    # NOTE: removed the unused local `data = {}` from the original
    write_control_dicts(syns)
def get_good_and_bad_translations(path): controls = dict() numlangmap, langmap = dat.lang_map() gooda = get_syns_quality_by_assign('%s/syn_hits_results' % RAW_DIR) good = get_syns_quality_by_turker('%s/syn_hits_results' % RAW_DIR, gooda) syns = read_all_syns(filter_list=good, exact_match_only=False) words = word_map(get_lang=True) data = {} for line in csv.DictReader(open(path)): assign = line['assignment_id'] translation = line['translation'].strip().lower() word_id = line['word_id'] if (assign not in data): data[assign] = {'total': 'N/A', 'syns': 'N/A'} if word_id in words: word, orig, lang = words[word_id] word = word.strip().lower() lang = numlangmap[lang] if word in syns: if lang not in controls: controls[lang] = dict() if orig not in controls[lang]: controls[lang][orig] = {'pos': set(), 'neg': set()} if translation in syns[word]: controls[lang][orig]['pos'].add(translation) else: controls[lang][orig]['neg'].add(translation) else: print 'Could not find', word, 'in synonym dictionary. Skipping.' else: print 'Could not find', word_id, 'in word dictionary. Skipping' pos = dict() neg = dict() for lang in controls: pos[lang] = [(orig, string.join(controls[lang][orig]['pos'], ',')) for orig in controls[lang] if len(controls[lang][orig]['pos']) > 0] neg[lang] = [(orig, string.join(controls[lang][orig]['neg'], ',')) for orig in controls[lang] if len(controls[lang][orig]['neg']) > 0] write_control_dicts(pos, file_prefix='new-output/poscontrols/dictionary') write_control_dicts(neg, file_prefix='new-output/negcontrols/dictionary')
def get_language_dicts(path, filter_list=None):
    """Compute, per language, the earliest start and latest end timestamp over
    the assignments in a results file.

    NOTE(review): this redefines get_language_dicts from earlier in the file
    (the later definition wins at import time) -- confirm both versions are
    not needed under the same name.

    path -- tab-separated CSV with an 'id' column of assignment ids.
    filter_list -- if given, only assignment ids in the list are considered.
    Returns {language: (min_start, max_end)} using values from time_map().
    """
    all_dicts = dict()
    hitmap = dictionaries.hit_map()
    langids, langcodes = dat.lang_map()
    hitlangs = dat.hits_language()
    timemap = time_map()
    # 'with' closes the results file (the original leaked the handle)
    with open(path) as results:
        for assign in csv.DictReader(results, delimiter='\t'):
            aid = assign['id']
            if aid == '':
                continue
            if filter_list is not None and aid not in filter_list:
                continue
            alang = langids[hitlangs[hitmap[aid]]]
            tstart, tend = timemap[aid]
            if alang not in all_dicts:
                # first sighting: the window is this assignment's own window
                # (the original set it and then redundantly min/max-ed with itself)
                all_dicts[alang] = (tstart, tend)
            else:
                all_dicts[alang] = (min(all_dicts[alang][0], tstart), max(all_dicts[alang][1], tend))
    return all_dicts
def format_for_time_series(data): print "More formatting data" hitmap = dictionaries.hit_map() langids, langcodes = dat.lang_map() hitlangs = dat.hits_language() all_times = dict() for aid, complete, start, total in data: lang = langids[hitlangs[hitmap[aid]]] if lang not in all_times: all_times[lang] = list() if lang == 'ru' or lang == 'ur': all_times[lang].append(total + datetime.timedelta(days=2)) else: all_times[lang].append(total) try: all_times.pop('en') except KeyError: pass return all_times
def count_turkers_verbose(path, qual_cutoff=None, strict=False, filter_list=None, match=True): all_dicts = dict() hitmap = hit_map() langids, langcodes = dat.lang_map() hitlangs = dat.hits_language() tmap = turker_map() quals = qual_map() assignmap = assign_dict() for assign in open(path).readlines(): comps = assign.strip().split('\t') aid = comps[0] if aid == '' or aid not in assignmap: continue #if match, filter_list is list of assignments to include, otherwise, filter_list is list of assignments to skip if (match): list_pass = not (filter_list == None) and (aid not in filter_list) else: list_pass = not (filter_list == None) and (aid in filter_list) if (list_pass): continue #if strict, assignments must be greater than qual_cutoff to pass, otherwise, assignments can be greater or equal to qual_cutoff if (strict): qual_pass = not (qual_cutoff == None) and ( quals[aid] == 'N/A' or quals[aid] <= qual_cutoff) else: qual_pass = not (qual_cutoff == None) and ( quals[aid] == 'N/A' or quals[aid] < qual_cutoff) if qual_pass: print 'below cutoff', aid, quals[aid] continue alang = langids[hitlangs[hitmap[aid]]] country = assignmap[aid]['country'] if not (alang in all_dicts): all_dicts[alang] = dict() if country not in all_dicts[alang]: all_dicts[alang][country] = list() all_dicts[alang][country].append(tmap[aid]) return all_dicts
def get_goog_match_by_assign(path): matches = get_goog_translations() pickle.dump(matches, open('googmatch-old.pkl', 'w')) exit(0) words = word_map() data = {} numlangmap, langmap = dat.lang_map() hitlangs = dat.hits_language() hits = dictionaries.hit_map() for line in csv.DictReader(open(path)): assign = line['assignment_id'] translation = line['translation'] word_id = line['word_id'] if (assign not in data): data[assign] = {'total': 'N/A', 'syns': 'N/A'} assign_lang = numlangmap[hitlangs[hits[assign]]] if (assign_lang not in goog_langs): print "Skipping lang", assign_lang, "not supported by google" continue if word_id in matches: if data[assign]['total'] == 'N/A': data[assign] = {'total': 0, 'syns': 0} if translation.strip().lower() == matches[word_id].strip().lower(): data[assign]['syns'] += 1 data[assign]['total'] += 1 # else: # print 'Could not find', assign, word_id, 'in google dictionary. Skipping' ret = dict() for a in data: if data[a]['total'] == 0: ret[a] = (data[a]['total'], data[a]['syns'], 0) elif data[a]['total'] == 'N/A': print a, "is N/A" ret[a] = (data[a]['total'], data[a]['syns'], 'N/A') else: ret[a] = (data[a]['total'], data[a]['syns'], float(data[a]['syns']) / data[a]['total']) return ret