def test_md_split_1():
    """A single article should split into more than one markdown paragraph."""
    doc = articles()[0]
    chain = CleanText(doc).markdown_split_paragraphs()
    paras = chain.value()
    assert len(paras) > 1
    print(paras)
def test_md_split_all():
    """Splitting every article should yield more paragraphs than input docs."""
    docs = articles()
    paras = CleanText(docs).markdown_split_paragraphs().value()
    assert len(paras) > 0
    assert len(docs) < len(paras)
    print(paras)
def gen_entries(self):
    """Build paragraph-grouped article fixtures and persist them via self.save.

    Returns:
        Box: maps each fixture key to ``dict(text=<full text>,
        paras=<text split on blank lines>)``.

    Raises:
        Exception: when ``ml_tools.fixtures`` is not importable, i.e. we are
            on the server container instead of the GPU container.
    """
    try:
        # This generates article-fixtures on the GPU container, which then
        # become available in /storage to the server container. So run tests
        # on GPU first, then on server. TODO decouple this!
        from ml_tools.fixtures import articles
    except ImportError as e:
        # Narrowed from a bare `except:` so unrelated failures
        # (KeyboardInterrupt, SystemExit, genuine bugs) are not swallowed;
        # chain the cause so the original import error stays visible.
        raise Exception("Can't generate entries from server container, must do from GPU container first.") from e
    entries = articles(group_by='paragraph')
    entries = Box({
        k: dict(text=v, paras=v.split('\n\n'))
        for k, v in entries.items()
    })
    self.save("entries", entries)
    return entries
def test_normalize(fmt, coverage, mode):
    """Run the cleaning pipeline end-to-end: expect lemmas and non-trivial output."""
    chain = CleanText(articles(fmt=fmt))
    if coverage == "basic":
        chain = chain.keywords(mode=mode)
    else:
        # Revisit this list as cleantext.py grows
        steps = (
            "unmark",
            "strip_html",
            "normalize_numbers",
            "fix_punct",
            "only_english",
            "only_ascii",
            "remove_apos",
            "multiple_whitespace",
        )
        for step in steps:
            chain = getattr(chain, step)()
        chain = chain.keywords(mode=mode)
    clean = chain.join().value()
    assert len(chain.data.lemmas) > 10
    print(chain.data.lemmas[:5])
    assert len(clean) > 10
    print(clean[0])
from ml_tools import Similars
from ml_tools.fixtures import articles
import numpy as np

corpus = articles()


def test_ae():
    """Autoencoded embeddings keep the row count and hit the target dimension."""
    chain = Similars(corpus).embed()
    vecs = chain.value()
    orig_cosines = chain.normalize().cosine().value()
    orig_cosines = np.argsort(orig_cosines, axis=1)
    # Bug fix: `dims` was the scalar 20 while autoencode() got an unrelated
    # hard-coded [400, 20]; `dims[-1]` below then raised TypeError. Keep the
    # layer sizes in one place so the assertion tracks the actual config.
    dims = [400, 20]
    reduced = chain.autoencode(dims=dims).value()
    assert vecs.shape[0] == reduced.shape[0]
    assert reduced.shape[1] == dims[-1]
    # TODO do some comparison between original cosines & new cosines
def test_group_none():
    """Ungrouped fixtures come back as a flat sequence of raw strings."""
    res = articles()
    assert len(res) > 10
    # isinstance over `type(...) == str` (PEP 8 / E721): also accepts str
    # subclasses and reads as the idiomatic type check.
    assert isinstance(res[0], str)
    print(res[0])
def test_group_paragraph():
    """Paragraph grouping yields a keyed mapping whose values are strings."""
    res = articles(group_by='paragraph')
    assert len(res.keys()) > 10
    # isinstance over `type(...) == str` (PEP 8 / E721).
    assert isinstance(res.vr_0, str)
def test_group_article():
    """Article grouping yields per-key lists of paragraph strings."""
    res = articles(group_by='article')
    assert len(res.vr) > 10
    # isinstance over `type(...) == str` (PEP 8 / E721).
    assert isinstance(res.vr[0], str)
from box import Box
from ml_tools import CosineEstimator, Similars
from ml_tools.fixtures import articles
import numpy as np
import pandas as pd
import optuna
import argparse

parser = argparse.ArgumentParser()
# Bug fix: without type=int a CLI-supplied value arrives as a str while the
# default is an int; downstream thread-count math would then break.
parser.add_argument('--jobs', type=int, help='Number of threads', default=1)
parser.add_argument('--init', action='store_true', help='initialize starter trials')
args_p = parser.parse_args()

lhs = articles()
# NOTE(review): 'agglomorative' misspells "agglomerative", but it is the exact
# key the project's cluster() API is called with here — confirm the library's
# accepted spelling before renaming.
lhs = Similars(lhs).embed().cluster(algo='agglomorative').value()
rhs = np.load('/storage/libgen/testing.npy')  #, mmap_mode='r')
books = pd.read_feather('/storage/libgen/testing.df')

# don't use cook(.?book)? , it's used in too many programming books
food_re = "gluten.?free|vegan|vegetarian"
# these should be really specific (think about edge-cases)
votes = Box(
    mine_up=r"(tensorflow|keras)",
    other_up=rf"({food_re}|republican)",
    mine_down=rf"({food_re})",
    other_down=r"(artificial|\bai\b|python|java|css|html|cbt|cognitive.?behav)"
)