def g2lpop(request, mini_g2l_metadata):
    """Yield a 'g2ltest' database connection populated with the mini
    Greek-to-Latin texts; tear the database down afterwards.

    Generator fixture: everything after ``yield`` runs at cleanup time.
    """
    connection = TessMongoConnection('localhost', 27017, None, None, 'g2ltest')
    for entry in mini_g2l_metadata:
        ingest_text(connection, Text.json_decode(entry))
    yield connection
    # Cleanup: drop everything this fixture created.
    obliterate(connection)
def lucvergpop(request, lucverg_metadata):
    """Yield a 'lucvergtest' database connection populated by manually
    running the tokenize/feature/unitize pipeline for each text, then
    wipe the database at cleanup.

    Unlike the other population fixtures this one does not call
    ``ingest_text``; it performs the ingestion steps inline.
    """
    connection = TessMongoConnection(
        'localhost', 27017, None, None, 'lucvergtest')
    for entry in lucverg_metadata:
        text = Text.json_decode(entry)
        tessfile = TessFile(text.path, metadata=text)
        connection.insert(text)

        tokens, tags, features = LatinTokenizer(connection).tokenize(
            tessfile.read(), text=tessfile.metadata)

        # Index the features already stored for this language so we can
        # distinguish brand-new features from ones needing an update.
        cache = {
            (known.feature, known.token): known
            for known in connection.find(
                Feature.collection, language=text.language)
        }
        to_insert, to_update = [], []
        for feat in features:
            key = (feat.feature, feat.token)
            if key in cache:
                # Already stored: carry over the existing id and update.
                feat.id = cache[key].id
                to_update.append(feat)
            else:
                to_insert.append(feat)
                # Record it so later duplicates in this batch hit the
                # update path instead of being inserted twice.
                cache[key] = feat
        connection.insert(to_insert)
        connection.update(to_update)

        lines, _ = Unitizer().unitize(tokens, tags, tessfile.metadata)
        connection.insert_nocheck(lines)
    yield connection
    # Cleanup: drop everything this fixture created.
    obliterate(connection)
def engpop(request, eng_metadata, v3checker):
    """Yield an 'engtest' database connection populated with the English
    texts; tear the database down afterwards.

    ``v3checker`` is not used directly here — presumably requested only
    to force that fixture's setup to run first.
    """
    connection = TessMongoConnection('localhost', 27017, None, None, 'engtest')
    for entry in eng_metadata:
        ingest_text(connection, Text.json_decode(entry))
    yield connection
    # Cleanup: drop everything this fixture created.
    obliterate(connection)
def minipop(request, mini_greek_metadata, mini_latin_metadata):
    """Yield a 'minitess' database connection populated with both the
    mini Greek and mini Latin texts (multitext enabled), then wipe the
    database at cleanup.

    The only fixture here that calls ``create_indices`` before ingesting.
    """
    connection = TessMongoConnection('localhost', 27017, None, None, 'minitess')
    connection.create_indices()
    # Greek first, then Latin — same ingestion for both batches.
    for batch in (mini_greek_metadata, mini_latin_metadata):
        for entry in batch:
            ingest_text(
                connection, Text.json_decode(entry), enable_multitext=True)
    yield connection
    # Cleanup: drop everything this fixture created.
    obliterate(connection)