Example #1
def main():
    """Ingest a text into Tesserae.

    Takes a .tess file and computes tokens, features, frequencies, and units.
    All computed components are inserted into the database.
    """
    args = parse_args()
    if args.password:
        password = getpass.getpass(prompt='Tesserae MongoDB Password: ')
    else:
        password = None

    connection = TessMongoConnection(args.host,
                                     args.port,
                                     args.user,
                                     password,
                                     db=args.database)

    text = Text(language=args.language,
                title=args.title,
                author=args.author,
                year=args.year,
                path=args.input,
                is_prose=args.prose)

    ingest_text(connection, text, enable_multitext=args.enable_multitext)
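main() above reads a dozen attributes off the parsed arguments. A minimal sketch of what the accompanying parse_args could look like, inferred purely from those attribute accesses (flag names, defaults, and help strings are assumptions, not the project's actual CLI):

import argparse

def parse_args():
    # Hypothetical reconstruction based only on the attributes main() reads.
    p = argparse.ArgumentParser(
        description='Ingest a .tess file into Tesserae.')
    p.add_argument('input', help='path to the .tess file to ingest')
    p.add_argument('--host', default='localhost', help='MongoDB host')
    p.add_argument('--port', type=int, default=27017, help='MongoDB port')
    p.add_argument('--user', default=None, help='MongoDB user')
    p.add_argument('--password', action='store_true',
                   help='prompt for a MongoDB password')
    p.add_argument('--database', default='tesserae', help='database name')
    p.add_argument('--language', required=True, help='language of the text')
    p.add_argument('--title', required=True, help='title of the text')
    p.add_argument('--author', required=True, help='author of the text')
    p.add_argument('--year', type=int, default=None,
                   help='year of authorship')
    p.add_argument('--prose', action='store_true',
                   help='mark the text as prose rather than poetry')
    p.add_argument('--enable-multitext', action='store_true',
                   help='also compute multitext search data')
    return p.parse_args()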
Example #2
def populate_database(search_connection, test_data):
    """Set up the database to conduct searches on the test texts.

    Fixtures
    --------
    search_connection
        TessMongoConnection for search unit tests.
    test_data
        Example data for unit testing.
    """
    for text in test_data['texts']:
        tessfile = TessFile(text['path'], metadata=Text(**text))
        search_connection.insert(tessfile.metadata)
        if text['language'] == 'latin':
            tok = LatinTokenizer(search_connection)
        else:
            tok = GreekTokenizer(search_connection)
        unitizer = Unitizer()
        tokens, tags, features = tok.tokenize(tessfile.read(),
                                              text=tessfile.metadata)
        search_connection.update(features)
        lines, phrases = unitizer.unitize(tokens, tags, tessfile.metadata)
        search_connection.insert(lines + phrases)
        search_connection.insert(tokens)

    yield

    search_connection.connection['texts'].delete_many({})
    search_connection.connection['tokens'].delete_many({})
    search_connection.connection['features'].delete_many({})
    search_connection.connection['units'].delete_many({})
    search_connection.connection['matches'].delete_many({})
    search_connection.connection['searches'].delete_many({})
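populate_database yields, so it is presumably registered as a pytest fixture in the source; the statements after the yield are its teardown. A hedged sketch of how a dependent test might look (the decorator placement, scope, and test body are assumptions):

import pytest

@pytest.fixture(scope='module')
def database(search_connection, test_data):
    # Hypothetical adapter: delegates to the generator above so that the
    # cleanup after its yield runs once the dependent tests finish.
    yield from populate_database(search_connection, test_data)

def test_fixture_populates_texts(database, search_connection):
    # Illustrative assertion only: texts inserted during setup are visible.
    assert search_connection.connection['texts'].count_documents({}) > 0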
Example #3
def test_unitize_elision_file(unit_connection, tessfiles_greek_path):
    tokenizer = GreekTokenizer(unit_connection)
    t = Text(path=str(tessfiles_greek_path.joinpath('test.elision.tess')),
             language='greek')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 1
Example #4
def test_unitize_notag_file(unit_connection, tessfiles_latin_path):
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(tessfiles_latin_path.joinpath('test.notag.tess')),
             language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 1
Example #5
def test_unitize_linebreak_file(unit_connection, tessfiles_latin_path):
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(tessfiles_latin_path.joinpath('test.linebreak.tess')),
             language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 1
    first_tag = phrases[0].tags[0]
    for phrase in phrases[1:]:
        assert phrase.tags[0] == first_tag
Example #6
def test_unitize_diacrit_in_latin(unit_connection, tessfiles_latin_path):
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(
        tessfiles_latin_path.joinpath('test.diacrit_in_latin.tess')),
             language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    forms = {f.index: f.token for f in features if f.feature == 'form'}
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    for phrase in phrases:
        for token in phrase.tokens:
            cur_form = token['features']['form'][0]
            if cur_form != -1:
                normalized = tokenizer.normalize(token['display'])[0][0]
                assert normalized == forms[cur_form], phrase.snippet
Example #7
def test_unitize_linebreak_end(unit_connection, tessfiles_latin_path):
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(
        tessfiles_latin_path.joinpath('test.linebreak_end.tess')),
             language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    print('# lines')
    for line in lines:
        print(line.snippet)
    print('# phrases')
    for phrase in phrases:
        print(phrase.snippet)
    assert len(lines) == 2
Example #8
def test_unitize_nopunctuation_file(unit_connection, tessfiles_latin_path):
    # when there is no ending punctuation despite coming to the end of a poem
    # and another poem starts after a blank line
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(
        tessfiles_latin_path.joinpath('test.nopunctuation.tess')),
             language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 68
    for prev_phrase, cur_phrase in zip(phrases[:-1], phrases[1:]):
        if '2.13' in prev_phrase.tags[0] and '2.14' in cur_phrase.tags[0]:
            assert prev_phrase.snippet == 'quin et Prometheus et Pelopis parens / dulci laborem decipitur sono / nec curat Orion leones / aut timidos agitare lyncas / Eheu fugaces, Postume, Postume, / labuntur anni nec pietas moram / rugis et instanti senectae / adferet indomitaeque morti, / non, si trecenis quotquot eunt dies, / amice, places inlacrimabilem / Plutona tauris, qui ter amplum / Geryonen Tityonque tristi / conpescit unda, scilicet omnibus / quicumque terrae munere vescimur / enaviganda, sive reges / sive inopes erimus coloni. / '
            assert cur_phrase.snippet == 'frustra cruento Marte carebimus / fractisque rauci fluctibus Hadriae, / frustra per autumnos nocentem / corporibus metuemus Austrum: / '
            break
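The comment at the top of this test is easiest to picture against the input: in a .tess file every line carries a locus tag, and here one poem ends without closing punctuation before the next begins. A sketch of the kind of boundary the assertions target, consistent with the Horace passages quoted above (the exact tags and the tab separator are assumptions about the fixture file):

# Illustrative reconstruction of the poem boundary, not the actual fixture:
# Odes 2.13 ends without terminal punctuation, so the phrase that starts
# there runs on into 2.14 until the first period.
sample = (
    '<hor. carm. 2.13.40>\taut timidos agitare lyncas\n'
    '\n'
    '<hor. carm. 2.14.1>\tEheu fugaces, Postume, Postume,\n'
)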
Example #9
def main():
    args = parse_args()
    logger = build_logger(args.lfn, args.log)

    with open(args.db_cred) as ifh:
        db_cred = json.load(ifh)

    conn = TessMongoConnection(db_cred['host'],
                               db_cred['port'],
                               db_cred['user'],
                               db_cred['password'],
                               db=db_cred['database'])

    with open(args.reingest) as ifh:
        texts = []
        for line in ifh:
            line = line.strip()
            if line:
                items = line.split('\t')
                texts.append(Text(author=items[0], title=items[1]))

    texts = conn.aggregate(Text.collection, [{
        '$match': {
            '$or': [{
                'author': t.author,
                'title': t.title
            } for t in texts]
        }
    }])

    for text in tqdm(texts):
        logger.info(f'Starting reingest: {text.author}\t{text.title}')
        try:
            reingest_text(conn, text)
        except KeyboardInterrupt:
            logger.info('KeyboardInterrupt')
            sys.exit(1)
        # we want to catch all other errors and log them
        except:  # noqa: E722
            logger.exception(
                f'Failed to reingest: {text.author}\t{text.title}')
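The reingest list is parsed as one tab-separated author/title pair per line, and the aggregation then matches any stored text whose author and title equal one of those pairs. A small illustration with hypothetical values:

# Hypothetical --reingest file contents, one 'author\ttitle' pair per line:
#
#   vergil\taeneid
#   lucan\tbellum civile
#
# The parsing loop above would turn those lines into this $match stage:
match_stage = {
    '$match': {
        '$or': [
            {'author': 'vergil', 'title': 'aeneid'},
            {'author': 'lucan', 'title': 'bellum civile'},
        ]
    }
}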
Example #10
def units(tessfiles):
    data = []
    for root, dirs, files in os.walk(tessfiles):
        if 'new' not in root and re.search(r'poetry|prose', root):
            fdata = {}
            for fname in files:
                parts = fname.split('.')
                if '.tess' in fname:
                    metadata = Text(
                        title=parts[0],
                        author=parts[1],
                        language='greek' if 'grc' in root else 'latin',
                        path=os.path.join(root, fname))
                    fdata['metadata'] = metadata
                if '.json' in fname:
                    feature = parts[2]
                    with open(os.path.join(root, fname), 'r') as f:
                        fdata[feature] = json.load(f)
            data.append(fdata)
    return data
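This fixture derives everything from names: the language from whether 'grc' appears in the path, the title and author from the first two dot-separated parts of each .tess filename, and the feature key from the third part of each .json filename. A layout like the following (paths and names hypothetical) would satisfy it:

# Hypothetical tree consistent with the walk above:
#
#   tessfiles/grc/poetry/iliad.homer.tess              -> fdata['metadata'] (greek)
#   tessfiles/grc/poetry/iliad.homer.form.json         -> fdata['form']
#   tessfiles/la/prose/deamicitia.cicero.tess          -> fdata['metadata'] (latin)
#   tessfiles/la/prose/deamicitia.cicero.lemmata.json  -> fdata['lemmata']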
Example #11
def main():
    """Ingest a text into Tesserae.

    Takes a .tess file and computes tokens, features, frequencies, and units.
    All computed components are inserted into the database.
    """
    args = parse_args()
    if args.password:
        password = getpass(prompt='Tesserae MongoDB Password: ')
    else:
        password = None

    connection = TessMongoConnection(
        args.host, args.port, args.user, password, db=args.database)

    text_hash = hashlib.md5()
    text_hash.update(TessFile(args.input).read().encode())
    text_hash = text_hash.hexdigest()

    text = Text(language=args.language, title=args.title, author=args.author,
                year=args.year, path=args.input, hash=text_hash,
                is_prose=args.prose)

    ingest_text(connection, text)
Example #12
def insert_text(connection, cts_urn, language, author, title, year, unit_types,
                path):
    """Insert a new text into the database.

    Attempt to insert a new text in the database, sanitized to match the
    fields and data types of existing texts.

    Parameters
    ----------
    cts_urn : str
        Unique collection-level identifier.
    language : str
        Language the text is written in.
    author : str
        Full name of the text author.
    title : str
        Title of the text.
    year : int
        Year of text authorship.
    unit_types : str or list of str
        Valid unit-level delimiters for this text.
    path : str
        Path to the raw text file. May be a remote URL.

    Returns
    -------
    result : `pymongo.InsertOneResult`
        The result of the database insert operation.

    Raises
    ------
    TextExistsError
        Raised when attempting to insert a text that already exists in the
        database.

    Notes
    -----
    This function should not be made available to everyone. To properly secure
    the database, ensure that only MongoDB users NOT connected to a public-
    facing client application are able to write to the database. See the
    `MongoDB documentation on role-based access control`_ for more information.

    .. _MongoDB documentation on role-based access control: https://docs.mongodb.com/manual/core/authorization/
    """
    # Attempt to load the file and any database entry with the same CTS URN
    text_file = TessFile(path)
    db_texts = retrieve_text_list(connection,
                                  cts_urn=cts_urn,
                                  hash=text_file.hash)

    # If no entries with the same CTS URN were found in the database, insert.
    # Otherwise, raise an exception.
    if len(db_texts) == 0:
        text = Text(cts_urn=cts_urn,
                    language=language,
                    author=author,
                    title=title,
                    year=year,
                    unit_types=unit_types,
                    path=path,
                    hash=text_file.hash)
        result = connection.texts.insert_one(text.json_encode(exclude=['_id']))
        return result
    else:
        raise TextExistsError(cts_urn, text_file.hash)
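A hypothetical call for illustration; the URN, year, and path are placeholder values, not records from the real corpus:

# Illustrative invocation of insert_text; argument values are placeholders.
result = insert_text(connection,
                     cts_urn='urn:cts:latinLit:phi0690.phi003',
                     language='latin',
                     author='Vergil',
                     title='Aeneid',
                     year=-19,
                     unit_types=['line', 'phrase'],
                     path='/path/to/vergil.aeneid.tess')
print(result.inserted_id)  # pymongo's InsertOneResult exposes the new _id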