def vector_of_language(source_file):
    """Build the n-gram probability vector for a whole text file.

    The file is read in one go as UTF-8 and the full text is handed to the
    project's ``ngrams`` helpers for n = 1, 2 and 3.

    NOTE(review): this module defines ``vector_of_language`` a second time
    further down (a line-by-line variant); that later definition replaces
    this one at import time -- confirm which one is meant to be live.

    Args:
        source_file: Path of the text file to analyse.

    Returns:
        A three-element list ``[unigram, bigram, trigram]`` holding the
        results of ``ngrams.probability``, ``ngrams.probability_of_bigram``
        and ``ngrams.probability_of_trigram`` respectively.
    """
    # The context manager closes the handle even if reading or decoding
    # raises; the original left the file open.
    with open(source_file, encoding="utf-8") as opened_file:
        text = opened_file.read()

    unigram_probability = ngrams.probability(ngrams.count_ngrams(text, 1))
    bigram_probability = ngrams.probability_of_bigram(
        ngrams.count_ngrams(text, 2)
    )
    trigram_probability = ngrams.probability_of_trigram(
        ngrams.count_ngrams(text, 3)
    )
    return [unigram_probability, bigram_probability, trigram_probability]
def get_ngrams_rel(filter):
    """Count 2- to 5-grams of the corpus text, grouped by reliability level.

    Every regular file directly inside the directory returned by
    ``get_data_file()`` is parsed as JSON, turned into text with
    ``get_string`` and, when ``filter`` is truthy, passed through
    ``remove_noise``.  The text is then appended to the bucket whose index
    is the value ``dl.getReliability(files)`` maps that file name to.

    Args:
        filter: When truthy, ``remove_noise`` is applied to each document's
            text before it is added to its bucket.  (The name shadows the
            builtin but is kept because it is part of the signature.)

    Returns:
        A list of 10 dicts, one per level 0..9, each with the keys
        ``string`` (concatenated text of the level), ``ngrams`` (result of
        ``ng.count_ngrams`` over that text with ``min_length=2`` and
        ``max_length=5``) and ``rel`` (the level index).

    Raises:
        KeyError: If a file has no entry in the reliability mapping.
        IndexError: If a file's level does not index one of the 10 buckets.
    """
    level_count = 10

    path = get_data_file() + "/"
    files = [f for f in listdir(path) if isfile(join(path, f))]
    dictRel = dl.getReliability(files)

    # Collect text fragments per level and join once at the end; repeated
    # ``str +=`` re-copies the accumulated text for every file.
    fragments = [[] for _ in range(level_count)]
    for file in files:
        # JSON is UTF-8 by specification; do not rely on the locale default.
        with open(os.path.join(path, file), encoding="utf-8") as f:
            data = json.load(f)
        data = get_string(data)
        if filter:
            data = remove_noise(data)
        fragments[dictRel[file]].append(data)

    listRel = []
    for rel, parts in enumerate(fragments):
        text = "".join(parts)
        counts = ng.count_ngrams(io.StringIO(text), min_length=2, max_length=5)
        listRel.append(dict(string=text, ngrams=counts, rel=rel))
    return listRel
def get_ngrams_bias(filter):
    """Count 2- to 5-grams of the corpus text, grouped by bias class.

    Every regular file directly inside the directory returned by
    ``get_data_file()`` is parsed as JSON, turned into text with
    ``get_string`` and, when ``filter`` is truthy, passed through
    ``remove_noise``.  The text is then appended to the bucket whose index
    is the value ``dl.getBias(files)`` maps that file name to.

    Files mapped to bias ``0`` are read but contribute no text, so bucket 0
    always ends up with an empty string.
    NOTE(review): presumably 0 marks "no / unknown bias" -- confirm against
    ``dl.getBias``.

    Args:
        filter: When truthy, ``remove_noise`` is applied to each document's
            text before it is added to its bucket.  (The name shadows the
            builtin but is kept because it is part of the signature.)

    Returns:
        A list of 5 dicts, one per class 0..4, each with the keys
        ``string`` (concatenated text of the class), ``ngrams`` (result of
        ``ng.count_ngrams`` over that text with ``min_length=2`` and
        ``max_length=5``) and ``bias`` (the class index).

    Raises:
        KeyError: If a file has no entry in the bias mapping.
        IndexError: If a file's class does not index one of the 5 buckets.
    """
    class_count = 5

    path = get_data_file() + "/"
    files = [f for f in listdir(path) if isfile(join(path, f))]
    dictBias = dl.getBias(files)

    # Collect text fragments per class and join once at the end; repeated
    # ``str +=`` re-copies the accumulated text for every file.
    fragments = [[] for _ in range(class_count)]
    for file in files:
        # JSON is UTF-8 by specification; do not rely on the locale default.
        with open(os.path.join(path, file), encoding="utf-8") as f:
            data = json.load(f)
        data = get_string(data)
        if filter:
            data = remove_noise(data)
        bias = dictBias[file]
        if bias == 0:
            continue
        fragments[bias].append(data)

    listBias = []
    for bias, parts in enumerate(fragments):
        text = "".join(parts)
        counts = ng.count_ngrams(io.StringIO(text), min_length=2, max_length=5)
        listBias.append(dict(string=text, ngrams=counts, bias=bias))
    return listBias
def vector_of_language(source_file):
    """Build the n-gram probability vector for a text file, line by line.

    The file is decoded as UTF-8 and streamed one line at a time.  For each
    line ``ngrams.count_ngrams`` is called with n = 1, 2 and 3, and the
    per-line counts are summed into one running total per n, so n-grams
    are never counted across a line boundary.

    NOTE(review): this module also defines ``vector_of_language`` earlier
    (a whole-file variant); being the later definition, this one is the
    one that stays bound after import -- confirm that is intended.

    Args:
        source_file: Path of the text file to analyse.

    Returns:
        A three-element list ``[unigram, bigram, trigram]`` holding the
        results of ``ngrams.probability``, ``ngrams.probability_of_bigram``
        and ``ngrams.probability_of_trigram`` applied to the summed counts.
    """
    unigrams = collections.Counter()
    bigrams = collections.Counter()
    trigrams = collections.Counter()

    # The context manager closes the handle even if counting raises; the
    # original left the file open.
    with open(source_file, encoding="utf-8") as opened_file:
        for line in opened_file:
            # Add each line's counts in place instead of rebuilding the
            # whole running total for every line.  ``dict(...)`` is kept
            # from the original so pair sequences are accepted as well as
            # mappings.
            unigrams.update(dict(ngrams.count_ngrams(line, 1)))
            bigrams.update(dict(ngrams.count_ngrams(line, 2)))
            trigrams.update(dict(ngrams.count_ngrams(line, 3)))

    unigram_probability = ngrams.probability(unigrams)
    bigram_probability = ngrams.probability_of_bigram(bigrams)
    trigram_probability = ngrams.probability_of_trigram(trigrams)
    return [unigram_probability, bigram_probability, trigram_probability]