def generate_vocabulary(corpus, methods, vocab_filename):
    """Count term and document frequencies over a preprocessed corpus and pickle the result.

    Args:
        corpus: non-empty corpus name (key into ``arg.corpus_text_dict``).
        methods: list of preprocessing treatments applied to the corpus.
        vocab_filename: non-empty path the pickled vocabulary is written to.

    Returns:
        The populated vocabulary object (``HashVocabInfo`` when hashing is
        among the methods, plain ``VocabInfo`` otherwise).

    Raises:
        TypeError: on non-string ``corpus``/``vocab_filename`` or non-list ``methods``.
        AttributeError: on empty ``corpus`` or ``vocab_filename``.
        ValueError: when the preprocessed corpus file does not exist yet.
    """
    if type(corpus) is not str:
        raise TypeError("Invalid non-string type for parameter 'corpus'")
    if corpus == '':
        raise AttributeError("Invalid empty string for parameter 'corpus'")
    if type(methods) is not list:
        raise TypeError("Invalid non-list type for parameter 'methods'")
    if type(vocab_filename) is not str:
        raise TypeError(
            "Invalid non-string type for parameter 'vocab_filename'")
    if vocab_filename == '':
        raise AttributeError(
            "Invalid empty string for parameter 'vocab_filename'")
    corpus_filename = arg.retrieve_corpus_file(corpus, methods)
    if not os.path.isfile(corpus_filename):
        raise ValueError(
            f'{corpus_filename} does not exist, must generate before building vocabulary'
        )
    is_json = check_json(corpus_filename)
    text_tag = arg.corpus_text_dict[corpus]
    # Hashing methods bucket words into a fixed-size hashed vocabulary;
    # otherwise a plain vocabulary records every word.
    if perform_hashing(methods):
        v = HashVocabInfo(hashing_value(methods))
    else:
        v = VocabInfo()
    import gzip
    # Reuse the already-resolved corpus_filename instead of re-deriving it.
    with gzip.open(corpus_filename, 'rb') as f:
        for line in f:
            words = retrieve_text(line.decode('utf-8'), is_json,
                                  tag=text_tag).split()
            # Each document contributes at most once to a word's document frequency.
            for word in set(words):
                v.increment_doc_frequency(word)
            for word in words:
                v.increment_term_frequency(word)
    with open(vocab_filename, 'wb') as f:
        pickle.dump(v, f, protocol=4)
    return v
def preprocess_corpus(corpus, treatments, offset=None, run_number=None):
    """Open a preprocessed corpus, generating one shard of it if absent.

    If the treated corpus file already exists, return an open gzip handle to
    it. Otherwise ``offset`` and ``run_number`` select a slice of the base
    corpus (lines ``run_number*offset`` up to but excluding
    ``(run_number+1)*offset``) which is preprocessed and written out.

    Args:
        corpus: non-empty corpus name.
        treatments: list of preprocessing treatments to apply.
        offset: shard size in lines; must be positive when given.
        run_number: zero-based shard index; must be nonnegative when given.

    Raises:
        AttributeError: on empty/None corpus, non-positive offset, negative
            run_number, or when the corpus is missing and offset/run_number
            were not supplied.
    """
    if corpus == '':
        raise AttributeError('Invalid empty corpus name')
    if corpus is None:
        raise AttributeError('Invalid corpus NoneType')
    if offset is not None and offset <= 0:
        raise AttributeError('Invalid offset value must be positive')
    if run_number is not None and run_number < 0:
        raise AttributeError('Invalid run_number value must be nonnegative')
    from preprocess import argmanager
    import os
    import gzip
    corpus_file = argmanager.retrieve_corpus_file(corpus, treatments)
    if os.path.isfile(corpus_file):
        return gzip.open(corpus_file, 'rb')
    if offset is None or run_number is None:
        raise AttributeError('Insufficient data to retrieve corpus')
    new_corpus_file = argmanager.retrieve_corpus_file(corpus, treatments,
                                                     run_number)
    base_corpus_file = argmanager.retrieve_corpus_file(corpus, [])
    text_dict = argmanager.corpus_text_dict
    preprocessor = methods.create_preprocessor(treatments)
    extractor = retrieve_extractor(new_corpus_file)(preprocessor,
                                                    tag=text_dict[corpus])
    # NOTE(review): this branch writes the shard but returns None, while the
    # cached branch above returns an open handle — confirm callers expect that.
    with gzip.open(new_corpus_file, 'wb') as f, \
            gzip.open(base_corpus_file, 'rb') as g:
        start_line = run_number * offset
        end_line = start_line + offset
        for i, line in enumerate(g):
            if i < start_line:
                continue
            if i >= end_line:
                break
            # i is guaranteed within [start_line, end_line) here; the original
            # re-checked that range redundantly in a third branch.
            line = line.decode('utf-8')
            f.write(f'{extractor(line)}\n'.encode('utf-8'))
def retrieve_preprocessed_corpus(corpus, methods):
    """Read the treated corpus file and return its documents.

    Args:
        corpus: non-empty corpus name.
        methods: list of preprocessing treatments identifying the file.

    Returns:
        A list with one decoded, newline-stripped string per corpus line.

    Raises:
        TypeError: on non-string ``corpus`` or non-list ``methods``.
        AttributeError: on empty ``corpus``.
    """
    if type(corpus) is not str:
        raise TypeError("Non-string value invalid for parameter 'corpus'")
    if not corpus:
        raise AttributeError("Parameter 'corpus' must be non-empty")
    if type(methods) is not list:
        raise TypeError("Non-list value invalid for parameter 'methods'")
    corpus_file = arg.retrieve_corpus_file(corpus, methods)
    with gzip.open(corpus_file, 'rb') as handle:
        return [raw.decode('utf-8').strip('\n') for raw in handle]
def retrieve_bpe_set(corpus, methods):
    """Return the BPE token set for a corpus, loading the cached pickle when present.

    On a cache miss the set is built from the corpus via ``create_bpe_set``.
    """
    cache_path = create_bpe_set_filename(corpus, methods)
    if not os.path.isfile(cache_path):
        # No cached set yet: derive it from the corpus contents.
        corpus_filename = arg.retrieve_corpus_file(corpus, methods)
        return create_bpe_set(
            corpus,
            methods,
            check_json(corpus_filename),
            arg.corpus_text_dict[corpus],
            bpe_value(methods),
        )
    with open(cache_path, 'rb') as handle:
        return pickle.load(handle)
def create_bpe_set(corpus, methods, is_json, tag, vocab_size, reduce_vocab=True):
    """Learn a byte-pair-encoding token set from a corpus and pickle it.

    Words are counted, optionally truncated to the most frequent terms
    covering 95% of tokens, then character pairs are greedily merged until
    the token inventory reaches ``vocab_size``.

    Args:
        corpus: non-empty corpus name.
        methods: list of preprocessing treatments identifying the corpus file.
        is_json: whether corpus lines are JSON records.
        tag: text field tag passed to ``retrieve_text``.
        vocab_size: target number of BPE tokens.
        reduce_vocab: when True, keep only terms covering 95% of token mass
            before merging; when False, use the full vocabulary.

    Returns:
        The set of learned BPE tokens (also pickled to disk).

    Raises:
        TypeError: on wrongly-typed parameters.
        AttributeError: on empty ``corpus``.
    """
    if type(corpus) is not str:
        raise TypeError("Invalid non-string type for parameter 'corpus'")
    if corpus == '':
        raise AttributeError("Invalid empty string for parameter 'corpus'")
    if type(methods) is not list:
        raise TypeError("Invalid non-list type for parameter 'methods'")
    if type(is_json) is not bool:
        raise TypeError('Invalid non-bool value for parameter is_json')
    if type(tag) is not str:
        raise TypeError('Invalid non-string value for parameter tag')
    if type(vocab_size) is not int:
        raise TypeError('Invalid non-int value for parameter vocab_size')
    partial_vocab = 0.95
    full_vocab = 1.0
    # Original imported gzip twice and re was unused at this point; dedupe.
    import gzip
    import re
    vocab = Counter()
    with gzip.open(arg.retrieve_corpus_file(corpus, methods), 'rb') as f:
        for line in f:
            words = retrieve_text(line.decode('utf-8'), is_json,
                                  tag=tag).split()
            for word in words:
                # Store each word as space-separated characters so merges
                # operate on adjacent symbol pairs.
                vocab.update([' '.join([c for c in word])])
    percentage_break = partial_vocab if reduce_vocab else full_vocab
    reduced_vocab = dict()
    total_tokens = np.sum(list(vocab.values()))
    tokens_so_far = 0
    # Keep the most frequent terms until they cover percentage_break of all
    # tokens (the term crossing the threshold is included, as before).
    for term, frequency in vocab.most_common():
        tokens_so_far += frequency
        reduced_vocab[term] = frequency
        if tokens_so_far / total_tokens > percentage_break:
            break

    def get_stats(vocab):
        # Count frequencies of adjacent symbol pairs across the vocabulary.
        pairs = defaultdict(int)
        for word, freq in vocab.items():
            symbols = word.split()
            for i in range(len(symbols) - 1):
                pairs[symbols[i], symbols[i + 1]] += freq
        return pairs

    def merge_vocab(pair, v_in):
        # Rewrite every word, fusing occurrences of the chosen symbol pair.
        v_out = {}
        bigram = re.escape(' '.join(pair))
        p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
        for word in v_in:
            w_out = p.sub(''.join(pair), word)
            v_out[w_out] = v_in[word]
        return v_out

    char_set = set(''.join(' '.join(reduced_vocab.keys()).split()))
    num_merges = vocab_size - len(char_set)
    for _ in range(num_merges):
        pairs = get_stats(reduced_vocab)
        if not pairs:
            # Every word is fully merged; the original would crash on
            # max() of an empty mapping here.
            break
        best = max(pairs, key=pairs.get)
        reduced_vocab = merge_vocab(best, reduced_vocab)
    bpe_set = set(' '.join(reduced_vocab.keys()).split())
    bpe_set_filename = create_bpe_set_filename(corpus, methods)
    with open(bpe_set_filename, 'wb') as f:
        pickle.dump(bpe_set, f, protocol=4)
    return bpe_set