def evaluate_crubadan_odin(filename):
    ''' Trains models from the Crubadan data and runs them on Odin '''
    # NOTE(review): this function is re-defined verbatim immediately below;
    # the later definition is the one that takes effect at import time.
    print "Loading character features..."
    trainsetchar = get_features('crubadan', option='char')
    print "Loading word features..."
    trainsetword = get_features('crubadan', option='word')
    print "Loading test data..."
    # Gold label (language code) of each Odin sentence.
    labels = [x[0] for x in odin.source_sents()]
    # Per sentence, keep two feature bags: character n-grams of all orders
    # (with word boundaries) and whitespace-token counts.
    test = [(Counter(
        sentence2ngrams(x[1], with_word_boundary=True, option='allgrams')),
        Counter(x[1].split())) for x in odin.source_sents()]
    print "Calculating results..."
    with open(filename, 'w') as f:
        # First output line: the gold labels, space-separated.
        f.write(' '.join(labels) + '\n')
        labels = None  # release the label list; no longer needed
        for lang in sorted(trainsetchar.keys()):
            print lang
            charresult = lang
            wordresult = lang
            # pop() frees each language's training counts once its model
            # is built, keeping peak memory down.
            modelchar = SGT(trainsetchar.pop(lang))
            modelword = SGT(trainsetword.pop(lang))
            for sentence in test:
                # float.hex gives an exact, round-trippable score string.
                charresult += ' ' + float.hex(modelchar.estimate(sentence[0]))
                wordresult += ' ' + float.hex(modelword.estimate(sentence[1]))
            # Two lines per language: char-model scores, then word-model.
            f.write(charresult + '\n')
            f.write(wordresult + '\n')
    print "Done!"
def evaluate_crubadan_odin(filename): ''' Trains models from the Crubadan data and runs them on Odin ''' print "Loading character features..." trainsetchar = get_features('crubadan', option='char') print "Loading word features..." trainsetword = get_features('crubadan', option='word') print "Loading test data..." labels = [x[0] for x in odin.source_sents()] test = [(Counter(sentence2ngrams(x[1], with_word_boundary=True, option='allgrams')), Counter(x[1].split())) for x in odin.source_sents()] print "Calculating results..." with open(filename,'w') as f: f.write(' '.join(labels)+'\n') labels = None for lang in sorted(trainsetchar.keys()): print lang charresult = lang wordresult = lang modelchar = SGT(trainsetchar.pop(lang)) modelword = SGT(trainsetword.pop(lang)) for sentence in test: charresult += ' ' + float.hex(modelchar.estimate(sentence[0])) wordresult += ' ' + float.hex(modelword.estimate(sentence[1])) f.write(charresult+'\n') f.write(wordresult+'\n') print "Done!"
def tfidfize(data_source, option='3gram'):
    # Builds (and caches on disk) a per-language TF-IDF weighting of the
    # raw frequency features returned by get_features().
    # see http://timtrueman.com/a-quick-foray-into-linear-algebra-and-python-tf-idf/
    # see http://scikit-learn.org/stable/modules/preprocessing.html
    # NOTE(review): this function is re-defined verbatim immediately below;
    # the later definition is the one that takes effect at import time.
    from collections import defaultdict
    import math, os, io
    import cPickle as pickle
    # Cache file name, e.g. "crubadan-3gram-tfidf.pk".
    tfidf_pickle = ''.join([data_source,'-',option,'-tfidf','.pk'])
    if os.path.exists(tfidf_pickle):
        # Cached result exists; load it instead of recomputing.
        with io.open(tfidf_pickle,'rb') as fin:
            featureset = pickle.load(fin)
    else:
        featureset = defaultdict(dict)
        _featureset = get_features(data_source, option=option)
        for lang in _featureset:
            for gram in _featureset[lang]:
                # Term frequency: this gram's count relative to all counts
                # in the language.
                tf = _featureset[lang][gram] / float(sum(_featureset[lang].values()))
                # NOTE(review): this computes log(N) / df, but standard idf
                # is log(N / df) — likely a bug; confirm intended formula.
                idf = math.log(len(_featureset)) / len([i for i in _featureset if gram in _featureset[i]])
                featureset[lang][gram] = tf * idf
                # NOTE(review): these prints run once per gram (inner loop),
                # flooding stdout; the status line was probably meant to
                # print once per data_source.
                print 'Calculating TF-IDF for %s please wait patiently...' % data_source
                print lang, gram, _featureset[lang][gram], tf, idf, tf * idf
        with io.open(tfidf_pickle,'wb') as fout:
            pickle.dump(featureset, fout)
    return featureset
def tfidfize(data_source, option='3gram'): # see http://timtrueman.com/a-quick-foray-into-linear-algebra-and-python-tf-idf/ # see http://scikit-learn.org/stable/modules/preprocessing.html from collections import defaultdict import math, os, io import cPickle as pickle tfidf_pickle = ''.join([data_source, '-', option, '-tfidf', '.pk']) if os.path.exists(tfidf_pickle): with io.open(tfidf_pickle, 'rb') as fin: featureset = pickle.load(fin) else: featureset = defaultdict(dict) _featureset = get_features(data_source, option=option) for lang in _featureset: for gram in _featureset[lang]: tf = _featureset[lang][gram] / float( sum(_featureset[lang].values())) idf = math.log(len(_featureset)) / len( [i for i in _featureset if gram in _featureset[i]]) featureset[lang][gram] = tf * idf print 'Calculating TF-IDF for %s please wait patiently...' % data_source print lang, gram, _featureset[lang][gram], tf, idf, tf * idf with io.open(tfidf_pickle, 'wb') as fout: pickle.dump(featureset, fout) return featureset
def check_data_integrity(data_source="all", remove=True):
    """ Remove and repickle the extracted feature files and count:
    i. no. of languages in original source data
    ii. no. of languages in extracted features """
    # NOTE(review): this function is re-defined verbatim immediately below;
    # the later definition is the one that takes effect at import time.
    import os, glob
    from extractfeature import get_features
    from universalcorpus import odin, omniglot, udhr, wikipedia
    if remove:
        # Remove all/selected pickled files
        toremove = '*.pk' if data_source == "all" else data_source+"*"
        for i in glob.glob(toremove):
            os.remove(i)
    # Rebuild pickled files.
    torebuild = ['odin','omniglot','udhr','crubadan','wikipedia'] \
        if data_source == 'all' else [data_source]
    for i in torebuild:
        print "Accessing features from %s, please wait ..." % (i)
        charngrams,wordfreq = get_features(i, option=None, shutup=True)
        print "%s-word.pk contains data for %d Languages." % (i,len(wordfreq))
        # locals()[i] resolves the corpus module imported above by name.
        # NOTE(review): 'crubadan' is in the rebuild list but is never
        # imported, so locals()['crubadan'] would raise KeyError — confirm.
        print "Original source contains data for %d Languages" % \
            locals()[i].num_languages()
        # Languages present in the source corpus but absent from the
        # extracted word features.
        missing = set(locals()[i].languages()) - set(wordfreq.keys())
        print "Thrown languages:",missing, "\n"
def check_data_integrity(data_source="all", remove=True): """ Remove and repickle the extracted feature files and count: i. no. of languages in original source data ii. no. of languages in extracted features """ import os, glob from extractfeature import get_features from universalcorpus import odin, omniglot, udhr, wikipedia if remove: # Remove all/selected pickled files toremove = '*.pk' if data_source == "all" else data_source + "*" for i in glob.glob(toremove): os.remove(i) # Rebuild pickled files. torebuild = ['odin','omniglot','udhr','crubadan','wikipedia'] \ if data_source == 'all' else [data_source] for i in torebuild: print "Accessing features from %s, please wait ..." % (i) charngrams, wordfreq = get_features(i, option=None, shutup=True) print "%s-word.pk contains data for %d Languages." % (i, len(wordfreq)) print "Original source contains data for %d Languages" % \ locals()[i].num_languages() missing = set(locals()[i].languages()) - set(wordfreq.keys()) print "Thrown languages:", missing, "\n"
def features2numpy(data_source, option="3gram"):
    ''' Vectorizes the per-language feature dicts into numpy arrays.

    Returns (data, target, all_features): a (num_langs x num_features)
    count matrix, the matching array of language codes, and the list of
    grams defining the column order.

    NOTE: this function is re-defined below with an extra tfidf flag;
    the later definition is the one that takes effect at import time.
    '''
    featureset = get_features(data_source, option=option)
    # Shared feature axis: every gram seen in any language.
    all_features = list(set(chain(*[i.keys() for i in featureset.values()])))
    data, target = [], []
    for lang in featureset:
        # FIX: indexing featureset[lang][j] raised KeyError for grams this
        # language never saw (the union axis guarantees such misses when
        # the values are plain dicts); missing grams now count as 0.
        data.append([featureset[lang].get(j, 0) for j in all_features])
        target.append(lang)
    return np.array(data), np.array(target), all_features
def classify_odin(sentence, verbose=True):
    ''' Given an input string, classifies it based on Odin character n-grams.
    Effectively an informal test. '''
    # NOTE(review): this function is re-defined verbatim immediately below;
    # the later definition is the one that takes effect at import time.
    # Bag of character n-grams (all orders, with word boundaries) for the
    # query sentence.
    test = Counter(sentence2ngrams(sentence, with_word_boundary=True, option='allgrams'))
    trainset = get_features('odin', option='char')
    sgt_results = []
    for lang in trainset:
        train = trainset[lang]
        # One SGT model per language scores the query; min=6000 is
        # presumably a minimum count/vocabulary floor — confirm in SGT.
        sgt_results.append((SGT(train, min=6000).estimate(test),lang))
    # Best-scoring language first.
    sgt_results.sort(reverse=True)
    if verbose:
        # Show the top-10 (score, language) pairs.
        for i in sgt_results[:10]:
            print i
    return sgt_results
def classify_odin(sentence, verbose=True): ''' Given an input string, classifies it based on Odin character n-grams. Effectively an informal test. ''' test = Counter( sentence2ngrams(sentence, with_word_boundary=True, option='allgrams')) trainset = get_features('odin', option='char') sgt_results = [] for lang in trainset: train = trainset[lang] sgt_results.append((SGT(train, min=6000).estimate(test), lang)) sgt_results.sort(reverse=True) if verbose: for i in sgt_results[:10]: print i return sgt_results
def features2numpy(data_source, option="3gram", tfidf=False):
    ''' Vectorizes the per-language feature dicts into numpy arrays.

    When tfidf is True the features are TF-IDF weighted via tfidfize();
    otherwise raw counts from get_features() are used.

    Returns (data, target, all_features): a (num_langs x num_features)
    matrix, the matching array of language codes, and the list of grams
    defining the column order.
    '''
    if tfidf:
        featureset = tfidfize(data_source, option=option)
    else:
        featureset = get_features(data_source, option=option)
    # Shared feature axis: every gram seen in any language.
    # FIX: sorted so the column order is reproducible across runs
    # (set iteration order is not); callers still interpret columns via
    # the returned all_features list, so this is safe.
    all_features = sorted(set(chain(*[i.keys() for i in featureset.values()])))
    data, target = [], []
    for lang in featureset:
        # Grams this language never saw count as 0.
        data.append([featureset[lang].get(j, 0) for j in all_features])
        target.append(lang)
    return np.array(data), np.array(target), all_features
def sugarlid_cosine(text, option='3gram', data_source='crubadan'):
    """ Cosine Vector based sugarlid.

    Scores the input text against every language's n-gram profile with
    cosine similarity; returns (score, lang) pairs, best first, keeping
    only strictly positive scores.

    NOTE(review): this function is re-defined verbatim immediately below;
    the later definition is the one that takes effect at import time.
    """
    from cosine import cosine_similarity
    char_ngrams = get_features(data_source, option=option)
    ##for i in char_ngrams:
    ##    print char_ngrams[i]
    #print sentence2ngrams(text, option=option)
    try:
        # Grams are plain strings (character options): join directly.
        query_vector = " ".join(sentence2ngrams(text, option=option))
    except TypeError:
        # Grams are tuples (e.g. word n-grams): flatten each with '_'.
        query_vector = " ".join(["_".join(i) for i in \
            sentence2ngrams(text, option=option)])
    print query_vector
    results = []
    for i in char_ngrams:
        # Expand each language's counts into a token string, repeating
        # each gram `count` times, so cosine_similarity can consume text.
        lang_vector = " ".join([str(j+" ")*char_ngrams[i][j] \
            for j in char_ngrams[i]])
        score = cosine_similarity(query_vector, lang_vector)
        if score > 0:
            results.append((score,i))
    return sorted(results, reverse=True)
def sugarlid_cosine(text, option='3gram', data_source='crubadan'): """ Cosine Vector based sugarlid. """ from cosine import cosine_similarity char_ngrams = get_features(data_source, option=option) ##for i in char_ngrams: ## print char_ngrams[i] #print sentence2ngrams(text, option=option) try: query_vector = " ".join(sentence2ngrams(text, option=option)) except TypeError: query_vector = " ".join(["_".join(i) for i in \ sentence2ngrams(text, option=option)]) print query_vector results = [] for i in char_ngrams: lang_vector = " ".join([str(j+" ")*char_ngrams[i][j] \ for j in char_ngrams[i]]) score = cosine_similarity(query_vector, lang_vector) if score > 0: results.append((score, i)) return sorted(results, reverse=True)
train = Counter({'a':1,'b':5,'c':2}) test = Counter({'b':1,'a':1}) langSGT = SGT(train) langMLE = MLE(train) print SGTestimate(langSGT,test) print MLEestimate(langMLE,test) ''' from extractfeature import sentence2ngrams, get_features s = "ich bin schwanger" test = Counter(sentence2ngrams(s, with_word_boundary=True)) print test trainset = get_features('odin', option='3gram') sgt_results = [] mle_results = [] ''' german = SGT(trainset['deu']) wakawaka = SGT(trainset['wkw']) for x in test: print x, trainset['deu'][x], SGTestimate(german, Counter({x:1})) print x, trainset['wkw'][x], SGTestimate(wakawaka, Counter({x:1})) print len(trainset['wkw']) ''' for lang in trainset: