def count_frequencies(lang_id, corpus_dir):
    """
    Count the word frequency and document frequency of each word in the corpus.

    :param lang_id: spaCy model/language identifier to load
    :param corpus_dir: directory with text files
    :return: (word counts, document counts, tokenizer)
    """
    nlp = spacy.load(lang_id)
    tokenizer = nlp.tokenizer
    counts = PreshCounter()
    doccounts = PreshCounter()
    for filename in os.listdir(corpus_dir):
        with codecs.open(os.path.join(corpus_dir, filename), encoding='utf-8') as f:
            data = f.read()
        doc = tokenizer(data)
        doc.count_by(ORTH, counts=counts)
        doccount = doc.count_by(ORTH)
        # Each word counts once per document, however often it occurs in it.
        for k, v in doccount.items():
            doccounts.inc(k, 1)
    return counts, doccounts, tokenizer
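# A minimal usage sketch for count_frequencies above (an illustration, not part
# of the original script): the model name 'en_core_web_sm', the 'corpus'
# directory, and the output format are assumptions; substitute whatever model
# and corpus you actually use.
def write_frequencies(out_loc, lang_id='en_core_web_sm', corpus_dir='corpus'):
    counts, doccounts, tokenizer = count_frequencies(lang_id, corpus_dir)
    with codecs.open(out_loc, 'w', encoding='utf-8') as f:
        # counts and doccounts share ORTH keys, so they can be joined on orth.
        for orth, freq in counts:
            word = tokenizer.vocab.strings[orth]
            f.write('%d\t%d\t%s\n' % (freq, doccounts[orth], word))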
class Corpus(object):
    def __init__(self, directory, min_freq=10):
        self.directory = directory
        self.counts = PreshCounter()
        self.strings = {}
        self.min_freq = min_freq

    def count_doc(self, words):
        # Get counts for this document
        doc_counts = PreshCounter()
        doc_strings = {}
        for word in words:
            key = hash_string(word)
            doc_counts.inc(key, 1)
            doc_strings[key] = word
        n = 0
        for key, count in doc_counts:
            self.counts.inc(key, count)
            # TODO: Why doesn't inc return this? =/
            corpus_count = self.counts[key]
            # Remember the string when we exceed min count
            if corpus_count >= self.min_freq and (corpus_count - count) < self.min_freq:
                self.strings[key] = doc_strings[key]
            n += count
        return n

    def __iter__(self):
        for text_loc in iter_dir(self.directory):
            with io.open(text_loc, 'r', encoding='utf8') as file_:
                sent_strs = list(file_)
            random.shuffle(sent_strs)
            for sent_str in sent_strs:
                yield sent_str.split()
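# A rough driver for the Corpus class above, assuming the same helpers the
# snippet itself relies on (iter_dir, hash_string, PreshCounter). A first pass
# accumulates corpus counts and retains the strings of words that cross
# min_freq; iteration can then feed a word2vec-style trainer with shuffled,
# pre-split sentences. The directory path is hypothetical.
corpus = Corpus('data/sentences', min_freq=10)
total = 0
for words in corpus:
    total += corpus.count_doc(words)
print('%d tokens counted, %d strings retained' % (total, len(corpus.strings)))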
def countTheWords(doc):
    counts = PreshCounter()
    for word in doc:
        counts.inc(word.orth, 1)
    for word_id, count in counts:
        # Resolve the hash through the doc's own vocab instead of a global nlp.
        print(count, doc.vocab.strings[word_id])
    return counts
def test_unsmooth_prob():
    counter = PreshCounter()
    assert counter.prob(12) == 0.0
    counter.inc(12, 1)
    assert counter.prob(12) == 1.0
    counter.inc(14, 10)
    assert counter.prob(14) == 10.0 / 11
    assert counter.prob(12) == 1.0 / 11
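# The unsmoothed probability asserted above is simply count / total. A
# self-contained restatement (preshed is the library providing PreshCounter):
from preshed.counter import PreshCounter

counter = PreshCounter()
counter.inc(12, 1)   # one observation of event 12
counter.inc(14, 10)  # ten observations of event 14
assert counter.total == 11
assert abs(counter.prob(14) - counter[14] / float(counter.total)) < 1e-12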
def merge_counts(locs, out_loc):
    string_map = StringStore()
    counts = PreshCounter()
    for loc in locs:
        with codecs.open(loc, 'r', 'utf8') as file_:
            for line in file_:
                freq, word = line.strip().split('\t', 1)
                orth = string_map[word]
                counts.inc(orth, int(freq))
    with codecs.open(out_loc, 'w', 'utf8') as file_:
        for orth, count in counts:
            string = string_map[orth]
            file_.write('%d\t%s\n' % (count, string))
def merge_counts(locs, out_loc):
    string_map = StringStore()
    counts = PreshCounter()
    for loc in locs:
        with io.open(loc, 'r', encoding='utf8') as file_:
            for line in file_:
                freq, word = line.strip().split('\t', 1)
                orth = string_map[word]
                counts.inc(orth, int(freq))
    with io.open(out_loc, 'w', encoding='utf8') as file_:
        for orth, count in counts:
            string = string_map[orth]
            file_.write('%d\t%s\n' % (count, string))
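# Hypothetical invocation of merge_counts: each input file holds
# "<freq>\t<word>" lines (the format the count_freqs scripts below write), and
# the merged totals go to a single output file. The shard paths are made up.
import glob

merge_counts(sorted(glob.glob('counts/shard*.freq')), 'counts/merged.freq')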
def get_features(docs, n_docs, max_length=100):
    Xs = np.zeros((n_docs, max_length), dtype=np.int32)
    counts = PreshCounter()
    for i, doc in enumerate(docs):
        for j, token in enumerate(doc[:max_length]):
            if token.has_vector:
                Xs[i, j] = token.rank
                counts.inc(token.rank, 1)
            else:
                Xs[i, j] = 0
    return Xs, counts
def _read_probs_from_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200):
    if not loc.exists():
        print("Warning: Frequencies file not found")
        return {}, 0.0
    counts = PreshCounter()
    total = 0
    if str(loc).endswith('gz'):
        file_ = gzip.open(str(loc))
    else:
        file_ = loc.open()
    for i, line in enumerate(file_):
        freq, doc_freq, key = line.rstrip().split('\t', 2)
        freq = int(freq)
        counts.inc(i + 1, freq)
        total += freq
    counts.smooth()
    log_total = math.log(total)
    # Re-open the file for the second pass.
    if str(loc).endswith('gz'):
        file_ = gzip.open(str(loc))
    else:
        file_ = loc.open()
    probs = {}
    for line in file_:
        freq, doc_freq, key = line.rstrip().split('\t', 2)
        doc_freq = int(doc_freq)
        freq = int(freq)
        if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
            word = literal_eval(key)
            smooth_count = counts.smoother(int(freq))
            probs[word] = math.log(smooth_count) - log_total
    oov_prob = math.log(counts.smoother(0)) - log_total
    return probs, oov_prob
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
    print("Counting frequencies...")
    counts = PreshCounter()
    total = 0
    with freqs_loc.open() as f:
        for i, line in enumerate(f):
            freq, doc_freq, key = line.rstrip().split('\t', 2)
            freq = int(freq)
            counts.inc(i + 1, freq)
            total += freq
    counts.smooth()
    log_total = math.log(total)
    probs = {}
    with freqs_loc.open() as f:
        for line in tqdm(f):
            freq, doc_freq, key = line.rstrip().split('\t', 2)
            doc_freq = int(doc_freq)
            freq = int(freq)
            if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
                word = literal_eval(key)
                smooth_count = counts.smoother(int(freq))
                probs[word] = math.log(smooth_count) - log_total
    oov_prob = math.log(counts.smoother(0)) - log_total
    return probs, oov_prob
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
    counts = PreshCounter()
    total = 0
    with freqs_loc.open() as f:
        for i, line in enumerate(f):
            freq, doc_freq, key = line.rstrip().split("\t", 2)
            freq = int(freq)
            counts.inc(i + 1, freq)
            total += freq
    counts.smooth()
    log_total = math.log(total)
    probs = {}
    with freqs_loc.open() as f:
        for line in tqdm(f):
            freq, doc_freq, key = line.rstrip().split("\t", 2)
            doc_freq = int(doc_freq)
            freq = int(freq)
            if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
                try:
                    word = literal_eval(key)
                except SyntaxError:
                    # Take odd strings literally.
                    word = literal_eval("'%s'" % key)
                smooth_count = counts.smoother(int(freq))
                probs[word] = math.log(smooth_count) - log_total
    oov_prob = math.log(counts.smoother(0)) - log_total
    return probs, oov_prob
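# One way the returned (probs, oov_prob) pair is typically consumed (a sketch
# under assumed paths and API behavior, not the definitive pipeline): attach
# log probabilities to vocabulary entries, keeping oov_prob as the default for
# unseen words.
from pathlib import Path
import spacy

nlp = spacy.blank('en')
probs, oov_prob = read_freqs(Path('freqs.tsv'))  # hypothetical frequency file
for word, prob in probs.items():
    nlp.vocab[word].prob = prob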
class Corpus(object):
    def __init__(self, directory):
        self.directory = directory
        self.counts = PreshCounter()

    def count_doc(self, doc):
        for word in doc:
            self.counts.inc(word.orth, 1)

    def __iter__(self):
        for text_loc in self.directory.rglob("*.txt"):
            with io.open(text_loc, "r", encoding="utf8") as file_:
                text = file_.read()
            yield text
def _read_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200):
    if not loc.exists():
        print("Warning: Frequencies file not found")
        return {}, 0.0
    counts = PreshCounter()
    total = 0
    if str(loc).endswith('gz'):
        file_ = gzip.open(str(loc))
    else:
        file_ = loc.open()
    for i, line in enumerate(file_):
        freq, doc_freq, key = line.split('\t', 2)
        freq = int(freq)
        counts.inc(i + 1, freq)
        total += freq
    counts.smooth()
    log_total = math.log(total)
    probs = {}
    for line in loc.open():
        freq, doc_freq, key = line.split('\t', 2)
        doc_freq = int(doc_freq)
        freq = int(freq)
        if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
            word = literal_eval(key)
            smooth_count = counts.smoother(int(freq))
            log_smooth_count = math.log(smooth_count)
            probs[word] = log_smooth_count - log_total
    oov_prob = math.log(counts.smoother(0)) - log_total
    return probs, oov_prob
def main(patterns_loc, text_loc, counts_loc, n=10000000):
    nlp = English(parser=False, tagger=False, entity=False)
    print("Make matcher")
    phrases = read_gazetteer(nlp.tokenizer, patterns_loc, n=n)
    counts = PreshCounter()
    t1 = time.time()
    for mwe in get_matches(nlp.tokenizer, phrases, read_text(text_loc)):
        counts.inc(hash_string(mwe.text), 1)
    t2 = time.time()
    print("10m tokens in %d s" % (t2 - t1))
    with codecs.open(counts_loc, 'w', 'utf8') as file_:
        for phrase in read_gazetteer(nlp.tokenizer, patterns_loc, n=n):
            text = phrase.string
            key = hash_string(text)
            count = counts[key]
            if count != 0:
                file_.write('%d\t%s\n' % (count, text))
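# The counts here are keyed by hash_string(mwe.text), so the reporting loop
# must hash the gazetteer phrases the same way to look them up. A hypothetical
# invocation, with made-up paths:
main('data/patterns.txt', 'data/corpus.txt', 'data/phrase_counts.tsv')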
def test_large_freqs():
    if 'TEST_FILE_LOC' in os.environ:
        loc = os.environ['TEST_FILE_LOC']
    else:
        return None
    counts = PreshCounter()
    for i, line in enumerate(open(loc)):
        line = line.strip()
        if not line:
            continue
        freq = int(line.split()[0])
        counts.inc(i + 1, freq)
    oov = i + 2
    assert counts.prob(oov) == 0.0
    assert counts.prob(1) < 0.1
    counts.smooth()
    assert counts.prob(oov) > 0
    assert counts.prob(oov) < counts.prob(i)
class Corpus(object):
    def __init__(self, directory, min_freq=10):
        self.directory = directory
        self.counts = PreshCounter()
        self.strings = {}
        self.min_freq = min_freq

    def count_doc(self, doc):
        # Get counts for this document
        for word in doc:
            self.counts.inc(word.orth, 1)
        return len(doc)

    def __iter__(self):
        for text_loc in iter_dir(self.directory):
            with text_loc.open("r", encoding="utf-8") as file_:
                text = file_.read()
            yield text
class Corpus(object):
    def __init__(self, directory, min_freq=10):
        self.directory = directory
        self.counts = PreshCounter()
        self.strings = {}
        self.min_freq = min_freq

    def count_doc(self, doc):
        # Get counts for this document
        for word in doc:
            self.counts.inc(word.orth, 1)
        return len(doc)

    def __iter__(self):
        for text_loc in iter_dir(self.directory):
            with io.open(text_loc, 'r', encoding='utf8') as file_:
                text = file_.read()
            yield text
def test_count():
    counter = PreshCounter()
    assert counter[12] == 0
    counter.inc(12, 1)
    assert counter[12] == 1
    counter.inc(14, 10)
    counter.inc(9, 10)
    counter.inc(12, 4)
    assert counter[12] == 5
    assert counter[14] == 10
    assert counter[9] == 10
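# PreshCounter keys are unsigned 64-bit integers, which is why the examples
# here count token hashes (word.orth) or hash_string(word) rather than the
# strings themselves. A minimal sketch, assuming spaCy is installed for its
# hash_string helper:
from preshed.counter import PreshCounter
from spacy.strings import hash_string

counter = PreshCounter()
for word in 'the quick brown fox jumps over the lazy dog the end'.split():
    counter.inc(hash_string(word), 1)
assert counter[hash_string('the')] == 3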
def count_doc(self, words):
    # Get counts for this document
    doc_counts = PreshCounter()
    doc_strings = {}
    for word in words:
        key = hash_string(word)
        doc_counts.inc(key, 1)
        doc_strings[key] = word
    n = 0
    for key, count in doc_counts:
        self.counts.inc(key, count)
        # TODO: Why doesn't inc return this? =/
        corpus_count = self.counts[key]
        # Remember the string when we exceed min count
        if corpus_count >= self.min_freq and (corpus_count - count) < self.min_freq:
            self.strings[key] = doc_strings[key]
        n += count
    return n
class Corpus(object):
    def __init__(self, directory, min_freq=10):
        self.directory = directory
        self.counts = PreshCounter()
        self.strings = {}
        self.min_freq = min_freq

    def count_doc(self, words):
        doc_counts = PreshCounter()
        doc_strings = {}
        for word in words:
            key = hash_string(word)
            doc_counts.inc(key, 1)
            doc_strings[key] = word
        n = 0
        for key, count in doc_counts:
            self.counts.inc(key, count)
            corpus_count = self.counts[key]
            if corpus_count >= self.min_freq and (corpus_count - count) < self.min_freq:
                self.strings[key] = doc_strings[key]
            n += count
        return n

    def __iter__(self):
        for text_loc in trainModel.iterDir(self.directory):
            with io.open(text_loc, 'r', encoding='utf8') as file_:
                sent_strs = list(file_)
            random.shuffle(sent_strs)
            for sent_str in sent_strs:
                yield sent_str.split()
def process(batch_id, inputs, output_dir, lang, n_threads, batch_size,
            min_ngram, max_ngram):
    logging.info('Processing batch_id: {}'.format(batch_id))
    subtrees = PreshCounter()
    subtrees_string_map = StringStore()
    noun_chunks = PreshCounter()
    noun_chunks_string_map = StringStore()
    if lang.lower() == "en":
        from spacy.en import English
        NLU = English()
        NLU.matcher = None
    elif lang.lower() == "id":
        from spacy.id import Indonesian
        NLU = Indonesian()
        NLU.matcher = None
    for i, doc in enumerate(NLU.pipe(inputs, batch_size=batch_size,
                                     n_threads=n_threads)):
        phrases = set()
        for tok in doc:
            st_len = len(list(tok.subtree))
            if min_ngram <= st_len <= max_ngram:
                st = ''.join([rep_text(t.text_with_ws) for t in tok.subtree]).strip()
                orth = subtrees_string_map[st]
                subtrees.inc(orth, 1)
        for np in doc.noun_chunks:
            if min_ngram <= len(np) <= max_ngram:
                st = ''.join([rep_text(t.text_with_ws) for t in np]).strip()
                orth = noun_chunks_string_map[st]
                noun_chunks.inc(orth, 1)
        if i % batch_size == 0:
            logging.info('Processing batch_id: {}, doc: {}'.format(batch_id, i))
    output_fname = path.join(output_dir, 'batch{}.st.freq'.format(batch_id))
    with io.open(output_fname, 'w', encoding='utf-8') as out:
        for orth, count in subtrees:
            st = subtrees_string_map[orth]
            if count >= 5 and '!LONGWORD!' not in st:
                out.write('{}\t{}\n'.format(count, st))
    output_fname = path.join(output_dir, 'batch{}.np.freq'.format(batch_id))
    with io.open(output_fname, 'w', encoding='utf-8') as out:
        for orth, count in noun_chunks:
            if count >= 5:
                st = noun_chunks_string_map[orth]
                out.write('{}\t{}\n'.format(count, st))
def count_freqs(Language, input_loc, output_loc):
    tokenizer = Language.Defaults.create_tokenizer()
    counts = PreshCounter()
    for json_comment in iter_comments(input_loc):
        doc = tokenizer(json_comment['body'])
        doc.count_by(ORTH, counts=counts)
    # io.open takes encoding as a keyword argument; passing 'utf8' positionally
    # would be interpreted as the buffering parameter.
    with io.open(output_loc, 'w', encoding='utf8') as file_:
        for orth, freq in counts:
            string = tokenizer.vocab.strings[orth]
            if not string.isspace():
                file_.write('%d\t%s\n' % (freq, string))
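# iter_comments is not shown in these examples; in the Reddit word-counting
# scripts it streams one JSON object per line from a dump file. A plausible
# sketch under that assumption (plain-text JSON lines; a real dump may well be
# compressed):
import io
import json

def iter_comments(loc):
    with io.open(loc, encoding='utf8') as file_:
        for line in file_:
            yield json.loads(line)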
def count_freqs(input_loc, output_loc):
    print(output_loc)
    nlp = spacy.en.English(Parser=None, Tagger=None, Entity=None,
                           load_vectors=False)
    nlp.vocab.lexeme_props_getter = null_props
    counts = PreshCounter()
    tokenizer = nlp.tokenizer
    for json_comment in iter_comments(input_loc):
        doc = tokenizer(json_comment['body'])
        doc.count_by(ORTH, counts=counts)
    with codecs.open(output_loc, 'w', 'utf8') as file_:
        for orth, freq in counts:
            string = nlp.vocab.strings[orth]
            file_.write('%d\t%s\n' % (freq, repr(string)))
def count_freqs(input_loc, output_loc):
    print(output_loc)
    vocab = English.default_vocab(get_lex_attr=None)
    tokenizer = Tokenizer.from_dir(
        vocab, path.join(English.default_data_dir(), 'tokenizer'))
    counts = PreshCounter()
    for json_comment in iter_comments(input_loc):
        doc = tokenizer(json_comment['body'])
        doc.count_by(ORTH, counts=counts)
    with codecs.open(output_loc, 'w', 'utf8') as file_:
        for orth, freq in counts:
            string = tokenizer.vocab.strings[orth]
            if not string.isspace():
                file_.write('%d\t%s\n' % (freq, string))
def _tokenize(language, path, output_path):
    '''Tokenize a file, write the whitespace-joined tokens to output_path, and
    return corpus-level token counts.'''
    counter = PreshCounter()
    defaults = spacy.blank(language).Defaults
    rules = defaults.tokenizer_exceptions
    token_match = defaults.token_match
    prefix_search = (spacy.util.compile_prefix_regex(defaults.prefixes).search
                     if defaults.prefixes else None)
    suffix_search = (spacy.util.compile_suffix_regex(defaults.suffixes).search
                     if defaults.suffixes else None)
    # Correct for the fact that spaCy does not preserve infix hyphens by default.
    punct = r'?";:=,.'
    ignore_infix = r'(?<=[{a}])[{p}]*(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS, p=punct)
    # Correctly handle sentence-final punctuation: 1) start of token followed by
    # a punct char, 2) punct followed by punct, 3) two consecutive non-punct
    # chars followed by a punct char, 4) start of token followed by a non-punct
    # char, followed by punct.
    punct_infix = r'^[{p}]|(?<=[{p}])[{p}]|(?<=[^{p}]{{2}})[{p}]|(?<=^[^{p}])[{p}]$'.format(p=punct)
    infixes = ([punct_infix]
               + [infix for infix in defaults.infixes if infix != ignore_infix])
    infix_finditer = spacy.util.compile_infix_regex(infixes).finditer
    tokenizer = spacy.tokenizer.Tokenizer(
        defaults.create_vocab(),
        rules=rules,
        prefix_search=prefix_search,
        suffix_search=suffix_search,
        infix_finditer=infix_finditer,
        token_match=token_match,
    )
    with ExitStack() as stack:
        input_file = stack.enter_context(open(path, 'rt'))
        output_file = stack.enter_context(open(output_path, 'wt'))
        for text in tokenizer.pipe(input_file):
            tokenized = ' '.join([token.text for token in text if not token.is_space])
            text.count_by(spacy.attrs.ORTH, counts=counter)
            output_file.write(tokenized + '\n')
    return Counter({
        tokenizer.vocab[i].text: c
        for i, c in counter
        if not tokenizer.vocab[i].is_space
    })
def count_freqs(input_loc, output_loc, LangClass):
    start = time.time()
    print('INFO: Processing ', input_loc)
    vocab = LangClass.Defaults.create_vocab()
    tokenizer = LangClass.Defaults.create_tokenizer()
    # Tokenizer(vocab, path.join(LangClass.default_data_dir(), 'tokenizer'))
    counts = PreshCounter()
    for text in iter_comments(input_loc):
        doc = tokenizer(text)
        doc.count_by(ORTH, counts=counts)
    with io.open(output_loc, 'w', encoding='utf8') as file_:
        for orth, freq in counts:
            string = tokenizer.vocab.strings[orth]
            if not string.isspace():
                file_.write('%d\t%s\n' % (freq, string))
    end = time.time() - start
    print('INFO: File {} took {} min '.format(input_loc, end / 60))
def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200):
    counts = PreshCounter()
    total = 0
    freqs_file = check_unzip(freqs_path)
    for i, line in enumerate(freqs_file):
        freq, doc_freq, key = line.rstrip().split('\t', 2)
        freq = int(freq)
        counts.inc(i + 1, freq)
        total += freq
    counts.smooth()
    log_total = math.log(total)
    freqs_file = check_unzip(freqs_path)
    probs = {}
    for line in freqs_file:
        freq, doc_freq, key = line.rstrip().split('\t', 2)
        doc_freq = int(doc_freq)
        freq = int(freq)
        if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
            word = literal_eval(key)
            smooth_count = counts.smoother(int(freq))
            probs[word] = math.log(smooth_count) - log_total
    oov_prob = math.log(counts.smoother(0)) - log_total
    return probs, oov_prob
def __init__(self, directory, min_freq=10):
    self.directory = directory
    self.counts = PreshCounter()
    self.strings = {}
    self.min_freq = min_freq
def __init__(self, directory):
    self.directory = directory
    self.counts = PreshCounter()
def test_smooth_prob():
    p = PreshCounter()
    # Counts of counts:
    # 1 10
    # 2 6
    # 3 4
    # 5 2
    # 8 1
    for i in range(10):
        p.inc(100 - i, 1)  # 10 items of freq 1
    for i in range(6):
        p.inc(90 - i, 2)  # 6 items of freq 2
    for i in range(4):
        p.inc(80 - i, 3)  # 4 items of freq 3
    for i in range(2):
        p.inc(70 - i, 5)  # 2 items of freq 5
    for i in range(1):
        p.inc(60 - i, 8)  # 1 item of freq 8
    assert p.total == (10 * 1) + (6 * 2) + (4 * 3) + (2 * 5) + (1 * 8)
    assert p.prob(100) == 1.0 / p.total
    assert p.prob(200) == 0.0
    assert p.prob(60) == 8.0 / p.total
    p.smooth()
    assert p.smoother(1) < 1.0
    assert p.smoother(8) < 8.0
    assert p.prob(1000) < p.prob(100)
    for event, count in reversed(sorted(p, key=lambda it: it[1])):
        assert p.smoother(count) < count
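# smooth() fits a Good-Turing-style estimator over the counts-of-counts
# tabulated above, so smoother(c) discounts each raw count c and smoother(0)
# reserves probability mass for unseen events. A minimal, self-contained
# demonstration of the same effect (the keys and counts are arbitrary):
from preshed.counter import PreshCounter

p = PreshCounter()
for key in range(1, 11):
    p.inc(key, 1)        # ten singletons
for key in range(101, 106):
    p.inc(key, 2)        # five events seen twice
for key in range(201, 203):
    p.inc(key, 4)        # two events seen four times
p.smooth()
assert p.smoother(1) < 1.0  # raw counts are discounted...
assert p.prob(9999) > 0.0   # ...freeing mass for unseen events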