def common_crawl_unsupervised(path, k=None, one_sent=False):
    """
    Prepares a file for unsupervised learning based on a Common Crawl WARC file.

    :param path: path to the WARC file
    :param k: keep only the first <k> training samples
    :param one_sent: write one sentence per line
    """
    data = read_warc(path, clean_html=True)
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    tmp = os.path.split(path)
    opath = os.path.join(tmp[0], os.path.splitext(tmp[1])[0] + '.txt')

    samples = 0
    out = []
    for el in tqdm(data, mininterval=1.0):
        if k is not None and samples >= k:
            break
        if one_sent:
            content = []
            for entry in el[1]:
                content += tokenizer.tokenize(entry)
        else:
            content = el[1]
        content = tokenize_sentences(content)
        samples += len(content)
        out += content

    for i in range(len(out)):
        out[i] += '\n'
    with open(opath, 'w+') as f:
        f.writelines(out)
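# A minimal usage sketch (the WARC path below is hypothetical): convert a crawl
# snapshot into a plain-text corpus, one tokenized sentence per line, stopping
# after roughly 100k sentences.
# >>> common_crawl_unsupervised('data/commoncrawl/sample.warc', k=100000, one_sent=True)
# The output is written next to the input as 'data/commoncrawl/sample.txt'.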
def alquist_unsupervised(path, preprocess=False):
    """
    Prepares a file for unsupervised learning based on the Alquist csv file.

    :param path: path to the Alquist train file
    :param preprocess: lowercase and tokenize the sentences with the Tweet tokenizer
    """
    data = load_alquist(path)
    sentences = data['X']
    tmp = os.path.split(path)
    if preprocess:
        sentences = tokenize_sentences(sentences)
        opath = os.path.join(tmp[0], 'unsupervised_training',
                             os.path.splitext(tmp[1])[0] + '-prep.txt')
    else:
        opath = os.path.join(tmp[0], 'unsupervised_training',
                             os.path.splitext(tmp[1])[0] + '.txt')

    endings = '.!?'
    for i, s in enumerate(sentences):
        s = ' '.join(s.split())
        if s and s[-1] not in endings:  # guard against empty sentences
            end = ' .' if preprocess else '.'
            s = s + end
        sentences[i] = s + '\n'

    with open(opath, 'w+') as f:
        f.writelines(sentences)
def classify_sentences(self, sentences):
    """
    Classify sentences into the classes trained in the FastText model.

    :param sentences: list of sentences to classify
    :return: list of classes
    """
    labels = self.model.predict(tokenize_sentences(sentences))[0]
    return [w[0].replace('__label__', '') for w in labels]
def transform_sentences(self, sentences):
    """
    Transform a list of sentences into their vector representations.

    :param sentences: list of sentences
    :return: list of numpy vectors
    """
    return [
        self.model.embed_sentence(s) for s in tokenize_sentences(sentences)
    ]
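# Usage sketch for the two methods above (the wrapper class name and model path are
# hypothetical; classify_sentences assumes a supervised fastText model trained with
# '__label__' prefixes, transform_sentences a sent2vec-style embedding model):
# >>> m = SentenceModel('models/alquist-intents.bin')
# >>> m.classify_sentences(['play some music'])    # e.g. ['request_music']
# >>> m.transform_sentences(['play some music'])   # list with one numpy vector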
def sts_starspace(path, mode='train'):
    """
    Prepares STS Benchmark files in the format needed by StarSpace trainMode 3.

    :param path: path to the folder where the STS files are located
    :param mode: train / dev / test
    """
    assert mode in ['train', 'dev', 'test']
    data = load_sts(os.path.join(path, f'sts-{mode}.csv'), lower=True)
    x1 = tokenize_sentences(data['X1'])
    x2 = tokenize_sentences(data['X2'])

    out = []
    for s1, s2, y in zip(x1, x2, data['y']):
        # For train/dev, keep only highly similar pairs (STS score > 4) as positives.
        if mode not in ['train', 'dev'] or y > 4:
            out.append(convert_numbers(s1) + '\t' + convert_numbers(s2) + '\n')

    with open(os.path.join(path, f'starspace/sts-{mode}.txt'), 'w+') as f:
        f.writelines(out)
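# Each line of the resulting file holds one positive pair for StarSpace trainMode 3:
# two tab-separated tokenized sentences that should embed close together, e.g.
# (illustrative pair):
#
#   a man is playing a guitar .\ta man plays the guitar .
#
# For train/dev, only pairs with gold similarity above 4 (on the 0-5 STS scale) are
# kept; the test file keeps all pairs.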
def alquist_starspace(path):
    """
    Prepares Alquist files in the format needed by StarSpace.

    :param path: path to the Alquist train file
    """
    data = load_alquist(path)
    sentences = data['X']
    intents = data['y']
    sentences = tokenize_sentences(sentences)

    out = []
    for s, i in zip(sentences, intents):
        out.append(s + '\t' + '__label__' + i + '\n')

    tmp = os.path.split(path)
    opath = os.path.join(tmp[0], 'StarSpace_preprocessed',
                         os.path.splitext(tmp[1])[0] + '.txt')
    with open(opath, 'w+') as f:
        f.writelines(out)
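# The resulting file follows StarSpace's supervised-classification format, one example
# per line: tokenized text, a tab, then the prefixed label, e.g. (illustrative intent):
#
#   play some music please\t__label__request_music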
def sts_unsupervised(path, preprocess=False):
    """
    Prepares a file for unsupervised learning based on the STS csv file.

    :param path: path to the STS train file
    :param preprocess: lowercase and tokenize the sentences with the Tweet tokenizer
    """
    data = load_sts(path)
    sentences = data['X1'] + data['X2']
    tmp = os.path.split(path)
    if preprocess:
        sentences = tokenize_sentences(sentences)
        opath = os.path.join(tmp[0], 'unsupervised_training',
                             os.path.splitext(tmp[1])[0] + '-prep.txt')
    else:
        opath = os.path.join(tmp[0], 'unsupervised_training',
                             os.path.splitext(tmp[1])[0] + '.txt')

    for i in range(len(sentences)):
        sentences[i] += '\n'
    with open(opath, 'w+') as f:
        f.writelines(sentences)
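# Usage sketch (hypothetical path; note that the 'unsupervised_training' subdirectory
# must already exist, since open() does not create directories):
# >>> sts_unsupervised('data/stsbenchmark/sts-train.csv', preprocess=True)
# writes 'data/stsbenchmark/unsupervised_training/sts-train-prep.txt' with one
# lowercased, tweet-tokenized sentence per line.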
for rects in [rects1, rects2]:
    for rect in rects:
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width() / 2 - 0.1, height,
                '%d' % int(height), ha='left', va='bottom', rotation=70)


DATA_PATH = DATA_DIR + 'alquist/dm-data-snapshot-uniq.csv'
STOPWORDS = stopwords.words('english')
PUNCTUATION = ['.', '?', '!']
# BLACKLIST = ['no', 'yes', 'yeah', 'okay', 'sure', 'right']
BLACKLIST = []

if __name__ == '__main__':
    sents, intents = load_file_raw(DATA_PATH)
    sents = tokenize_sentences(sents)
    int_uniq = sorted(set(intents))

    print('\n===SENTENCES (full / unique)===')
    cnt = Counter(intents)
    cnt_uniq = count_unique_sentences(sents, intents)
    print('Total:', len(sents), '/', cnt_uniq['total'])
    c1, c2 = [], []
    for intent in int_uniq:
        c1.append(cnt[intent])
        c2.append(cnt_uniq[intent])
        print(intent + ':', c1[-1], '/', c2[-1])
    plot_bar(c1, c2, int_uniq,
             f"Sentence counts (total: {len(sents)} / {cnt_uniq['total']})",
             ('full', 'unique'))

    print('\n===VOCABULARY (full / without stopwords)===')
    voc = build_vocabulary(sents, remove=PUNCTUATION)
def compress(emb_path, emb_dim=300, prune_freq=None, prune_norm=None, trn_path=None,
             reduce_dim=None, quantize=False, normalize=False, distinct=False,
             d_sv=5, d_cb=256, qnt_trn=10000, out_name='compressed',
             pickle_output=False, precision=5):
    """
    Main model compression function.

    :param emb_path: path to the embedding model
    :param emb_dim: input embedding dimension
    :param prune_freq: number of words to keep after pruning by vector frequency
    :param prune_norm: number of words to keep after pruning by vector norm
    :param trn_path: path to a training file - keep words present in this file
    :param reduce_dim: embedding dimension after dimensionality reduction
    :param quantize: use vector quantization
    :param normalize: normalize the vectors to unit length before quantization
    :param distinct: create a distinct codebook for each sub-vector position
    :param d_sv: size of sub-vectors the embeddings are split into
    :param d_cb: codebook size
    :param qnt_trn: maximum number of randomly picked vectors for computing the codebook
    :param out_name: name of the output model (without extension)
    :param pickle_output: create also a pickled version of the quantized model
    :param precision: maximum number of decimals used in the output model
    """
    if not quantize:
        normalize, distinct = False, False
    if reduce_dim is not None and reduce_dim >= emb_dim:
        reduce_dim = None
    out = f'{out_name}.txt'
    out_cb = f'{out_name}.cb.txt'

    trn_words = None
    if trn_path:
        trn_words = []
        with open(trn_path) as f:
            for line in f:
                trn_words += line.strip().split()
        trn_words = set(trn_words)

    print('Loading data (+ pruning vocabulary by frequency)...')
    if emb_path.endswith('.bin'):
        vocab, vecs, sizes = load_model_ft_bin(emb_path, k=prune_freq,
                                               normalize=normalize, keep=trn_words)
    else:
        vocab, vecs, sizes = load_model_txt(emb_path, k=prune_freq, normalize=normalize,
                                            dim=emb_dim, header=True, keep=trn_words)

    if prune_norm:  # TODO: Possibility to prune by any training set, not just STS.
        print('Pruning vocabulary by norm...')
        sts = load_sts(DATA_DIR + 'stsbenchmark/sts-train.csv')
        sts = tokenize_sentences(sts['X1'] + sts['X2'], to_lower=True)
        vocab, vecs, sizes = prune_by_norm(vocab, vecs, sizes, trn=sts, keep=prune_norm)
        # vocab, vecs, sizes = prune_by_trn(vocab, vecs, sizes, trn=sts)
        print('- pruned vocabulary size:', len(vocab))

    if reduce_dim:
        print('Reducing dimension...')
        emb_dim = reduce_dim
        # pca = PCA(n_components=reduce_dim, copy=False)
        # vecs = pca.fit_transform(vecs)
        vecs = vecs[:, :reduce_dim]

    if quantize:
        print('Computing codebook...')
        cb_out = []
        lbg_data = split_vecs(vecs, n=d_sv, limit=qnt_trn, distinct=distinct)
        if distinct:
            cb = dict()
            for pos in lbg_data:
                print('--- position:', pos, '---')
                cb[pos] = generate_codebook(lbg_data[pos], cb_size=d_cb)[0]
            for pos in cb:
                codebook_to_strings(cb[pos].round(precision), cb_out)
        else:
            cb = generate_codebook(lbg_data, cb_size=d_cb)[0]
            codebook_to_strings(cb.round(precision), cb_out)

        print('Writing codebook...')
        with open(out_cb, 'w', encoding='utf-8') as file:
            header = f'{d_cb} {d_sv}\n'
            file.write(header)
            file.writelines(cb_out)

        print('Quantizing vectors...')
        convert_func = convert_vec_distinct if distinct else convert_vec
        vecs = np.asarray([convert_func(vec, d_sv, cb) for vec in vecs])

    print('Preparing compressed model...')
    emb_out = []
    if not quantize:
        vecs = vecs.round(precision)
    for idx, word in enumerate(vocab):
        s = word
        for num in vecs[idx]:
            s += f' {num}'
        if normalize:
            s += f' {round(sizes[idx], precision)}'
        emb_out.append(s + '\n')

    print('Writing compressed model...')
    dim = int(emb_dim / d_sv) if quantize else emb_dim
    with open(out, 'w', encoding='utf-8') as file:
        header = f'{len(emb_out)} {dim}'
        if normalize:
            header += ' NORM'
        if distinct:
            header += ' DIST'
        header += '\n'
        file.write(header)
        file.writelines(emb_out)

    if pickle_output and quantize:
        print('Pickling...')
        pickle_compressed_model(out, out_cb, f'{out_name}.pickle')
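# Usage sketch (hypothetical paths and settings): keep the 100k most frequent words
# of a 300-d fastText model, truncate the vectors to 150 dimensions, and quantize
# them with 5-d sub-vectors against a single 256-entry codebook:
# >>> compress('models/cc.en.300.bin', emb_dim=300, prune_freq=100000,
# ...          reduce_dim=150, quantize=True, d_sv=5, d_cb=256,
# ...          out_name='models/compressed', pickle_output=True)
# This writes 'models/compressed.txt' (quantized vectors as codebook indices),
# 'models/compressed.cb.txt' (the codebook) and 'models/compressed.pickle'.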
else:
    trn_words = None

print('Loading data (+ pruning vocabulary by frequency)...')
if params.emb_path.endswith('.bin'):
    vocab, vecs, sizes = load_model_ft_bin(params.emb_path, k=params.prune_freq,
                                           normalize=params.normalize, keep=trn_words)
else:
    vocab, vecs, sizes = load_model_txt(params.emb_path, k=params.prune_freq,
                                        normalize=params.normalize, dim=params.emb_dim,
                                        header=True, keep=trn_words)

if params.prune_norm:  # TODO: Possibility to prune by any training set, not just STS.
    print('Pruning vocabulary by norm...')
    sts = load_sts('data/stsbenchmark/sts-train.csv')
    sts = tokenize_sentences(sts['X1'] + sts['X2'], to_lower=True)
    vocab, vecs, sizes = prune_by_norm(vocab, vecs, sizes, trn=sts,
                                       keep=params.prune_norm)
    # vocab, vecs, sizes = prune_by_trn(vocab, vecs, sizes, trn=sts)
    print('- pruned vocabulary size:', len(vocab))

if params.reduce_dim:
    print('Reducing dimension...')
    params.emb_dim = params.reduce_dim
    # pca = PCA(n_components=params.reduce_dim, copy=False)
    # vecs = pca.fit_transform(vecs)
    vecs = vecs[:, :params.reduce_dim]

if params.quantize:
    # TODO: Quantize also the vector sizes after normalization?
    print('Computing codebook...')
    cb_out = []