def minipop(request, mini_greek_metadata, mini_latin_metadata):
    """Populate a throwaway 'minitess' database with the mini corpora.

    Ingests every mini Greek and mini Latin text with multitext support
    enabled, yields the live connection to the test, then wipes the
    database on teardown.
    """
    conn = TessMongoConnection('localhost', 27017, None, None, 'minitess')
    conn.create_indices()
    # Both corpora go through the same decode-then-ingest path.
    for entry in [*mini_greek_metadata, *mini_latin_metadata]:
        ingest_text(conn, Text.json_decode(entry), enable_multitext=True)
    yield conn
    obliterate(conn)
def minipop(request, mini_greek_metadata, mini_latin_metadata):
    """Populate a throwaway 'minitess' database with the mini corpora.

    Yields the live connection to the test; on teardown every collection
    in the database is dropped so later runs start from a clean state.
    """
    conn = TessMongoConnection('localhost', 27017, None, None, 'minitess')
    for entry in [*mini_greek_metadata, *mini_latin_metadata]:
        ingest_text(conn, Text.json_decode(entry))
    yield conn
    # Teardown: drop all collections rather than the database itself.
    for coll_name in conn.connection.list_collection_names():
        conn.connection.drop_collection(coll_name)
def unit_tessfiles(mini_greek_metadata, mini_latin_metadata):
    """Create text entities for the test texts.

    Fixtures
    --------
    test_data
        A small set of sample texts and other entities.
    """
    decoded = [
        Text.json_decode(entry)
        for entry in [*mini_greek_metadata, *mini_latin_metadata]
    ]
    # Deterministic ordering by file path keeps downstream tests stable.
    return sorted(decoded, key=lambda text: text.path)
def g2lpop(request, mini_g2l_metadata):
    """Populate a throwaway 'g2ltest' database with the Greek-to-Latin texts.

    Yields the live connection to the test, then wipes the database on
    teardown.
    """
    conn = TessMongoConnection('localhost', 27017, None, None, 'g2ltest')
    for entry in mini_g2l_metadata:
        ingest_text(conn, Text.json_decode(entry))
    yield conn
    obliterate(conn)
def lucvergpop(request, lucverg_metadata):
    """Populate a throwaway 'lucvergtest' database with the Lucan/Vergil texts.

    For each text: tokenize with the Latin tokenizer, upsert the resulting
    Feature entities, unitize the tokens into lines, and store the lines.
    Yields the live connection to the test, then wipes the database on
    teardown.
    """
    def _upsert_features(conn, features, language):
        # Match incoming features against what is already stored, keyed on
        # (feature type, token form): unseen ones are inserted, known ones
        # receive the stored id and are updated in place.
        feature_cache = {
            (f.feature, f.token): f
            for f in conn.find(Feature.collection, language=language)
        }
        features_for_insert = []
        features_for_update = []
        for f in features:
            key = (f.feature, f.token)
            if key not in feature_cache:
                features_for_insert.append(f)
                feature_cache[key] = f
            else:
                f.id = feature_cache[key].id
                features_for_update.append(f)
        conn.insert(features_for_insert)
        conn.update(features_for_update)

    conn = TessMongoConnection('localhost', 27017, None, None, 'lucvergtest')
    for metadata in lucverg_metadata:
        text = Text.json_decode(metadata)
        tessfile = TessFile(text.path, metadata=text)
        conn.insert(text)
        tokens, tags, features = LatinTokenizer(conn).tokenize(
            tessfile.read(), text=tessfile.metadata)
        # Features must be in the database before the lines that refer
        # to them are stored.
        _upsert_features(conn, features, text.language)
        unitizer = Unitizer()
        lines, _ = unitizer.unitize(tokens, tags, tessfile.metadata)
        conn.insert_nocheck(lines)
    yield conn
    obliterate(conn)
def main():
    """Ingest a batch of texts into the Tesserae database.

    Reads database credentials and a JSON manifest of texts from the
    paths given on the command line, then ingests each text in turn,
    logging (and skipping past) any text that fails.
    """
    args = parse_args()
    logger = build_logger(args.lfn, args.log)

    with open(args.db_cred) as ifh:
        db_cred = json.load(ifh)

    conn = TessMongoConnection(
        db_cred['host'], db_cred['port'], db_cred['user'],
        db_cred['password'], db=db_cred['database'])
    conn.create_indices()

    with open(args.ingest) as ifh:
        texts = [Text.json_decode(t) for t in json.load(ifh)]

    for text in tqdm(texts):
        logger.info(f'Starting ingest: {text.author}\t{text.title}')
        try:
            ingest_text(conn, text)
        except KeyboardInterrupt:
            logger.info('KeyboardInterrupt')
            sys.exit(1)
        except Exception:
            # Log the failure (logger.exception appends the traceback
            # automatically) and continue with the remaining texts.
            # Narrowed from a bare `except:` so SystemExit/GeneratorExit
            # are no longer swallowed; the duplicate format_exc log line
            # was redundant and has been removed.
            logger.exception(f'Failed to ingest: {text.author}\t{text.title}')
def test_retrieve_units(self, request, populate):
    """Every text's line units must match the populated fixture data."""
    conf = request.config
    conn = TessMongoConnection(
        conf.getoption('db_host'),
        conf.getoption('db_port'),
        conf.getoption('db_user'),
        password=conf.getoption('db_passwd', default=None),
        db=conf.getoption('db_name', default=None))
    matcher = DefaultMatcher(conn)
    for raw_text in populate['texts']:
        text = Text.json_decode(raw_text)
        # Trim the path down to be relative to the language directory.
        if text.language == 'latin':
            start = text.path.find('la/')
        elif text.language == 'greek':
            start = text.path.find('grc/')
        else:
            start = -1
        if start >= 0:
            text.path = text.path[start:]
        expected = sorted(
            (u for u in populate['units']
             if u['text'] == text.path and u['unit_type'] == 'line'),
            key=lambda u: u['index'])
        units = matcher.retrieve_units([text], 'line')
        assert len(units[0]) > 0
        assert len(units[0]) == len(expected)
        for unit in units[0]:
            assert unit.json_encode() == expected[unit.index]
def test_retrieve_frequencies(self, request, populate):
    """Every populated token form must appear in the computed frequencies."""
    conf = request.config
    conn = TessMongoConnection(
        conf.getoption('db_host'),
        conf.getoption('db_port'),
        conf.getoption('db_user'),
        password=conf.getoption('db_passwd', default=None),
        db=conf.getoption('db_name', default=None))
    matcher = DefaultMatcher(conn)
    for raw_text in populate['texts']:
        text = Text.json_decode(raw_text)
        # Trim the path down to be relative to the language directory.
        if text.language == 'latin':
            start = text.path.find('la/')
        elif text.language == 'greek':
            start = text.path.find('grc/')
        else:
            start = -1
        if start >= 0:
            text.path = text.path[start:]
        tokens = [t for t in populate['tokens'] if t['text'] == text.path]
        expected = [
            f for f in populate['frequencies'] if f['text'] == text.path
        ]
        frequencies, _ = matcher.retrieve_frequencies([text], tokens, [text])
        assert len(frequencies) > 0
        assert len(frequencies) == len(expected)
        for entry in expected:
            assert entry['form'] in frequencies
def engpop(request, eng_metadata, v3checker):
    """Populate a throwaway 'engtest' database with the English texts.

    The v3checker parameter is kept for its fixture side effects. Yields
    the live connection to the test, then wipes the database on teardown.
    """
    conn = TessMongoConnection('localhost', 27017, None, None, 'engtest')
    for entry in eng_metadata:
        ingest_text(conn, Text.json_decode(entry))
    yield conn
    obliterate(conn)
def test_match(self, request, populate, reference_matches):
    """Matching each reference source/target pair must yield the expected
    number of matches.
    """
    conf = request.config
    conn = TessMongoConnection(
        conf.getoption('db_host'),
        conf.getoption('db_port'),
        conf.getoption('db_user'),
        password=conf.getoption('db_passwd', default=None),
        db=conf.getoption('db_name', default=None))
    for t in populate['texts']:
        # Trim the path down to be relative to the language directory.
        start = -1
        if t['language'] == 'latin':
            start = t['path'].find('la/')
        if t['language'] == 'greek':
            start = t['path'].find('grc/')
        # Fixed: was `start > 0`, which skipped trimming when the prefix
        # sat at index 0; `>= 0` matches the sibling tests' handling.
        if start >= 0:
            t['path'] = t['path'][start:]
    m = DefaultMatcher(conn)
    for metadata, correct in reference_matches:
        source = [
            t for t in populate['texts']
            if re.search(metadata['source'], t['path'])
        ]
        target = [
            t for t in populate['texts']
            if re.search(metadata['target'], t['path'])
        ]
        texts = [Text.json_decode(source[0]), Text.json_decode(target[0])]
        matches = m.match(texts, metadata['unit'], metadata['feature'],
                          stopwords=metadata['stopsize'],
                          stopword_basis=metadata['stbasis'],
                          score_basis=metadata['scorebase'],
                          frequency_basis=metadata['freqbasis'],
                          max_distance=metadata['max_dist'],
                          distance_metric=metadata['dibasis'])
        # Leftover debug print(matches) removed.
        assert len(matches) == len(correct)
def main():
    """Update Text metadata entries in the Tesserae database from a JSON file.

    NOTE(review): the original source for this function was garbled — a
    credential-scrubbing pass replaced a span of code with '******'. The
    connection setup below is a reconstruction from the surrounding code;
    confirm the attribute names (host, port, user, database, updates)
    against parse_args() before relying on it.
    """
    args = parse_args()
    # Prompt interactively so the password never appears in shell history.
    if args.password:
        password = getpass.getpass(prompt='Tesserae MongoDB Password: ')
    else:
        password = None
    connection = TessMongoConnection(
        args.host, args.port, args.user, password, db=args.database)
    with open(args.updates, encoding='utf-8') as ifh:
        raw_updates = json.load(ifh)
    connection.update([Text.json_decode(t) for t in raw_updates])
def main():
    """Ingest a batch of texts into the Tesserae database.

    Reads database credentials and a JSON manifest of texts from the
    paths given on the command line, then ingests each text in turn,
    logging (and skipping past) any text that fails.

    NOTE(review): unlike the sibling ingest script, this variant does not
    call conn.create_indices() — confirm whether that is intentional.
    """
    args = parse_args()
    logger = build_logger(args.lfn, args.log)

    with open(args.db_cred) as ifh:
        db_cred = json.load(ifh)

    conn = TessMongoConnection(
        db_cred['host'], db_cred['port'], db_cred['user'],
        db_cred['password'], db=db_cred['database'])

    with open(args.ingest) as ifh:
        texts = [Text.json_decode(t) for t in json.load(ifh)]

    for text in tqdm(texts):
        logger.info(f'Starting ingest: {text.author}\t{text.title}')
        try:
            ingest_text(conn, text)
        except KeyboardInterrupt:
            logger.info('KeyboardInterrupt')
            sys.exit(1)
        except Exception:
            # Narrowed from a bare `except:` so SystemExit/GeneratorExit
            # are no longer swallowed; log and continue with the rest.
            logger.exception(f'Failed to ingest: {text.author}\t{text.title}')