def test_get_indices_1grams(unigram_indices):
    """Indices produced for 1-grams are unique and equal the fixture set."""
    produced = list(get_indices(1))
    distinct = set(produced)

    # A size mismatch here means some index was yielded more than once.
    assert len(distinct) == len(produced)
    # The produced indices must be exactly those in ``unigram_indices``.
    assert distinct == unigram_indices
def test_get_indices_4grams_coverage_1m(quadgram_indices_1m):
    """4-gram indices at "1M" coverage are unique and equal the fixture set."""
    produced = list(get_indices(4, coverage="1M"))
    distinct = set(produced)

    # A size mismatch here means some index was yielded more than once.
    assert len(distinct) == len(produced)
    # The produced indices must be exactly those in ``quadgram_indices_1m``.
    assert distinct == quadgram_indices_1m
def test_get_indices_5grams_coverage_1m(fivegram_indices_1m):
    """5-gram indices at "1M" coverage are unique and equal the fixture set."""
    produced = list(get_indices(5, coverage="1M"))
    distinct = set(produced)

    # A size mismatch here means some index was yielded more than once.
    assert len(distinct) == len(produced)
    # The produced indices must be exactly those in ``fivegram_indices_1m``.
    assert distinct == fivegram_indices_1m
def test_get_indices_2grams_coverage_1m(bigram_indices_1m):
    """2-gram indices at "1M" coverage are unique and equal the fixture set."""
    produced = list(get_indices(2, coverage="1M"))
    distinct = set(produced)

    # A size mismatch here means some index was yielded more than once.
    assert len(distinct) == len(produced)
    # The produced indices must be exactly those in ``bigram_indices_1m``.
    assert distinct == bigram_indices_1m
def test_get_indices_3grams_coverage_1m(trigram_indices_1m):
    """3-gram indices at "1M" coverage are unique and equal the fixture set."""
    produced = list(get_indices(3, coverage="1M"))
    distinct = set(produced)

    # A size mismatch here means some index was yielded more than once.
    assert len(distinct) == len(produced)
    # The produced indices must be exactly those in ``trigram_indices_1m``.
    assert distinct == trigram_indices_1m
def test_get_indices_5grams(bigrams_indices):
    """Verify the 5-gram indices are the bigram indices without "qk"."""
    produced = list(get_indices(5))

    # Exactly one index fewer than the distinct entries of the bigram fixture.
    assert len(produced) == len(set(bigrams_indices)) - 1
    # The produced indices equal the bigram fixture with 'qk' removed.
    assert set(produced) == bigrams_indices - {'qk'}
def test_get_indices_1grams_coverage_1m(unigram_indices_1m):
    """1-gram indices at "1M" coverage are unique and equal the fixture set."""
    produced = list(get_indices(1, coverage="1M"))
    distinct = set(produced)

    # A size mismatch here means some index was yielded more than once.
    assert len(distinct) == len(produced)
    # The produced indices must be exactly those in ``unigram_indices_1m``.
    assert distinct == unigram_indices_1m
def test_get_indices_manygrams(bigrams_indices):
    """Indices produced by ``get_indices(2)`` are unique and equal the fixture set."""
    # NOTE(review): the name says "manygrams" but only length 2 is exercised
    # here -- confirm whether other n-gram lengths were meant to be covered.
    produced = list(get_indices(2))
    distinct = set(produced)

    # A size mismatch here means some index was yielded more than once.
    assert len(distinct) == len(produced)
    # The produced indices must be exactly those in ``bigrams_indices``.
    assert distinct == bigrams_indices
import csv import os import string from google_ngram_downloader import readline_google_store, util list_not = [ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '_ADJ_', '_ADP_', '_ADV_', '_CONJ_', '_DET_', '_NOUN_', '_NUM_', '_PRON_', '_PRT_', '_VERB_' ] ngrams = 3 result = {} list_indices = util.get_indices(ngrams) dict_ngram = {} for item in list_indices: if not (item in list_not): list_tmp = [] list_tmp.append(item) try: fnames, urls, records = next( readline_google_store(ngram_len=ngrams, indices=list_tmp, lang='spa')) for i in records: try: ngram = str(i.ngram).lower() # print(i) if ngram.find('_') == -1: if ngram in dict_ngram: temp = dict_ngram.get(ngram) freq = float(temp['freq'] + i.match_count) count = temp['count'] + 1