Example #1
 def loadTf(self):
     C = self.C
     backend = C.backend
     org = C.org
     repo = C.repo
     version = C.data["version"]
     A = use(
         f"{org}/{repo}:clone", checkout="clone", backend=backend, version=version
     )
     self.A = A
Example #2
    def __init__(self, vocab_json, tf_app='bhsa',
                 set_data=None, session_data=None,
                 resume_time=False, term_n=0,
                 pause_times=None):

        # set meta data for study loop (for saves)
        self.session_data = session_data
        self.set_data = set_data
        self.term_n = term_n
        # avoid a mutable default argument: fall back to a fresh list
        self.pause_times = pause_times if pause_times is not None else []

        self.tf_app = tf_app
        self.fstem = vocab_json.stem # for save names
        
        # load set data
        if not set_data:
            with open(vocab_json, encoding='utf8') as setfile:
                set_data = json.load(setfile)
                self.set_data = set_data
        
        # retrieve TF app data
        appdata = set_data['app_data']
        app = appdata['app']
        datversion = appdata['version']
        self.glossfeat = appdata['gloss_feature']
        self.freqfeat = appdata['freq_feature']
        self.wordtype = appdata['wordtype']
        self.context = appdata['context']
        
        # load the app
        print('preparing TF...')
        self.TF = use(app, version=datversion, silent=True)
        self.F, self.T, self.L = self.TF.api.F, self.TF.api.T, self.TF.api.L

        # prepare for run, check cycle length
        run = self.check_end_cycle(set_data)
        if not run:
            self.save_file(set_data, vocab_json)
            raise Exception('EXIT PROGRAM INITIATED; FILE SHUFFLED AND SAVED')

        # build the study set, prep data for study session
        if session_data is None:
            self.session_data = Session(set_data)  # build session data
        self.vocab_json = vocab_json

        if resume_time:
            print(f'\nSession is resumed from {resume_time}.\n')
        
        # preliminary session report
        deck_stats = self.session_data.deck_stats
        print(set_data['name'], 'ready for study.')
        print(f"this is session {set_data['cycle_data']['total_sessions']+1}:")
        for score, stat in deck_stats.items():
            print(f'score {score}: {stat} terms')
        print(f'total: {sum(deck_stats.values())}')
Example #3
 def foreground(self, app, refresh=False):
     A = self.apps[app]
     hoist = self.hoist
     appSpec = app if '/' in app else f"{app}:clone"
     if A is None:
         A = use(appSpec, checkout="clone", silent="deep", hoist=hoist)
         self.apps[app] = A
     else:
         if refresh:
             A.reuse(hoist=hoist)
         else:
             A.api.makeAvailableIn(hoist)
     hoist["A"] = A
Example #4
 def executeApp(self):
     app = self.app
     indent(level=1, reset=True)
     info(f'BEGIN testing {app} with {len(self.queryLists[app])} queries')
     indent(level=2, reset=True)
     info(f'loading {app}')
     self.A = use(f'{app}:clone', checkout='clone', silent=True)
     info(f'making sets for {app}')
     self.makeSets()
     info(f'running queries for {app}')
     self.runQueries()
     indent(level=2)
     info(f'all queries run')
     indent(level=1)
     info(f'END testing {app}')
Example #5
from tf.app import use
A = use('banks:clone', checkout='clone')
T = A.api.T
T.headingFromNode(100)
Example #6
#
# From within Text-Fabric, we can ask for this ranking, by means of
#
# * `C.levels.data`: inspecting the precomputed data
# * `F.otype.all`: the resulting node types
# * `N.otypeRank`: the resulting ranking
#
# We load the BHSA and Uruk
# ([here](https://annotation.github.io/text-fabric/tf/about/corpora.html) is more info on these corpora)
# and have a look at their node type ranking.

As = {}

for corpus in ("bhsa", "uruk"):
    print(f"Loading {corpus} ...")
    As[corpus] = use(f"{corpus}:clone", silent="deep")
    As[corpus].info("done")

# We have loaded both datasets.
#
# We want to be able to put them into the foreground, i.e. make it so that the global variables `A N F E L T S C TF` become bound to the
# foreground data set. We write a function for that.


def foreground(corpus, hoist):
    thisA = As[corpus]
    hoist["A"] = thisA
    thisTf = thisA.TF
    thisTf.makeAvailableIn(hoist)
    thisA.showContext("corpus")
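
# With the corpora loaded, the three handles mentioned above can be inspected
# through the `api` member of each app object. A minimal sketch, assuming the
# precomputed levels data consists of (nodeType, averageSlots, first, last)
# records:

for (corpus, thisA) in As.items():
    api = thisA.api
    print(f"--- {corpus} ---")
    print(api.F.otype.all)    # the node types, biggest first
    print(api.N.otypeRank)    # node type => rank number
    for (nodeType, avgSlots, first, last) in api.C.levels.data:
        print(f"{nodeType:<15} {avgSlots:>10.2f} average slots")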
Example #7
#
# So, go off to a terminal and give the command
#
# ```text-fabric peshitta:latest --checkout=latest```
#
# This fetches the latest version of the Peshitta app and data.
#
# After that, you can just say
#
# ```text-fabric peshitta```
#
# until you get word that a new version of the app and/or data has become available.

from tf.app import use

A = use("peshitta", hoist=globals())

# ## The string `JBW L` in the text

# Assuming `JBW` is a single word and `L` is a single word:

query = """
word word_etcbc=JBW
<: word word_etcbc=L
"""

results = A.search(query)

# That does not help: at least one of the assumptions leads nowhere.
# At this point it might help to use the TF browser to conduct some experiments on the side.
#
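# A quick programmatic check (a sketch on the side, not part of the original
# flow): relax the exact match to a regular expression, to see whether JBW
# occurs inside longer transliterated words at all.

query = """
word word_etcbc~JBW
"""

results = A.search(query)
A.table(results, end=5)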
Example #8
from tf.app import use

query = '''
p:phrase
    =: wFirst:word
    wLast:word
    :=

wGap:word
wFirst < wGap
wLast > wGap

p || wGap

v:verse

v [[ wFirst
v [[ wGap
'''

A = use('bhsa:clone', checkout='clone')
results = A.search(query)
Example #9
        print(passage)
        print(passage[0])
        raise IndexError('Try using the right kind of book names bro')
    return book_to_index[passage[0]] * 1000000 + int(passage[1]) * 1000 + int(
        passage[2])


sqlFile = sys.argv[1]
jsonFile = sys.argv[2]

conn = sqlite3.connect(sqlFile)
c = conn.cursor()

# OLD: Remove checkout=local if you haven't updated the data files in a while
# Remove ":latest" to fix the rate limit thing
A = use('bhsa', hoist=globals(), checkout='local')

# def nullifyNaAndEmptyAndUnknown(list_to_reduce):
#     templist = list_to_reduce
#     keys_to_remove = set()
#     for key, value in templist.items():
#         if value == "NA" or value == "" or value == "unknown":
#             keys_to_remove.add(key)
#     for key in keys_to_remove:
#         templist[key] = None
#     return templist


def normify(word):
    return normalize('NFC', word)
Example #10
from IPython.display import Markdown, display

from tf.app import use
from utils import prs_set


def do(task):
    # keep the table lines flush left: indented lines render as a code block in Markdown
    md = f'''commit | release | local | base | subdir
--- | --- | --- | --- | ---
`{task[0]}` | `{task[1]}` | `{task[2]}` | `{task[3]}` | `{task[4]}`
'''
    display(Markdown(md))


A = use('bhsa:latest',
        version='2017',
        mod='cmerwich/bh-reference-system/tf',
        hoist=globals(),
        silent=True)


def compute_text(my_book_name, from_chapter, to_chapter):

    results = []
    highlights = {}

    my_chapters = set(range(from_chapter, to_chapter + 1))

    for book in F.otype.s('book'):
        book_name = T.bookName(book)

        for chn in L.d(book, 'chapter'):
Example #11
import pandas as pd
from operator import attrgetter
from tf.app import use
from tf.fabric import Fabric

VERSION = 'c'

A = use('bhsa', version=VERSION, hoist=globals(), silent=True)
TF.load('g_prs', add=True)


def PrintCoref(Corefs):
    '''
    Visualises the coreference classes that MiMi has detected. 
    `Corefs` is a list of coreference sets and singleton sets. 
    The coreference sets contain mentions, which are stored in
    the class `Mention`.
    '''

    i = 0
    classes = []
    print('verse',
          'id',
          'mention',
          'txttyp',
          '§',
          'p',
          'g',
          'n',
          'func',
          'type',
Example #12
from tf.app import use

A = use('bhsa', lgc=False, check=True)
Example #13
# If the data is there, it will be auto-downloaded and stored on your machine.
#
# Let's do it.

# %load_ext autoreload
# %autoreload 2

# +
import collections
import os

from tf.app import use

# -

A = use("oldbabylonian:clone", checkout="clone", hoist=globals())
# A = use('oldbabylonian', hoist=globals())

# # Making data
#
# We illustrate the data creation part by creating a new feature, `ummama`.
# The idea is that we mark every sign reading that occurs between `um-ma` and `ma` somewhere in the first 3 lines of a face.
# We want to mark every occurrence of such signs elsewhere in the corpus with `ummama=1`.
#
# We only do it if the sign between the `um-ma` and `ma` (which must be on the same line) is not missing, damaged, or questionable.
#
# The easiest way to get started is to run a query:

query = """
line ln<4
  =: sign reading=um missing# damage# question#
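
# Once the relevant sign nodes have been collected from the query results,
# turning them into the new feature is a matter of handing a node=>value
# mapping to `TF.save`. A minimal sketch (`ummamaSigns` is an assumed set of
# sign nodes, to be filled from the results):

ummamaSigns = set()

A.TF.save(
    nodeFeatures={"ummama": {s: 1 for s in ummamaSigns}},
    metaData={
        "ummama": {
            "description": "sign occurs in an um-ma ... ma formula",
            "valueType": "int",
        },
    },
)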
Example #14
from collections import defaultdict
import pandas as pd
from tf.app import use

A = use(
    'bhsa', version='2017',
    hoist=globals(),
    silent=True
)

column_names = ('ann_A', 'ann_B', 'L', 'M', 'R', 'D', 'd')
data_types = {'ann_A': str, 'ann_B': str, 'L': int, 'M': int, 'R': int, 'D': int, 'd': float}

def MakeTable(iaa_file):
    iaa_table = pd.read_table(iaa_file, 
                           delim_whitespace=True, 
                           names=column_names,
                           dtype=data_types
                          )
    return iaa_table


def ExportToLatex(output_loc, file_name, data_frame, indx=True):
    with open(f'{output_loc}{file_name}.tex','w') as texf:
        texf.write(data_frame.to_latex(index=indx))
        

def CountVersesWords():
    
    text_list = ['Psalms 138', 'Psalms 88', 'Psalms 11', 'Psalms 129', 'Psalms 70', 
             'Psalms 32', 'Psalms 20', 'Psalms 17', 'Psalms 101', 'Psalms 67',
Example #15
# We load the BHSA and display the example in question.
# -

# %load_ext autoreload
# %autoreload 2

# +
# pip3 install beautifulsoup4

from bs4 import BeautifulSoup as bs

from tf.app import use
from tf.advanced.helpers import dh
# -

A = use("bhsa", hoist=globals())

from ipywidgets import Text, Layout, Box, HBox, VBox, Label, HTML, Button

v1 = T.nodeFromSection(("Genesis", 1, 1))
A.pretty(v1, standardFeatures=True, fmt="text-phono-full")

# What we want is a display like this, but with the glosses (`in` `beginning` `create` etc) editable.
# Also all values after `pdp=` should be editable. And the information in the labels with clause and phrase as well
# (`xQtX`, `PP`, `Time`) etc. If you hover over them, you see they are values of features `typ`, `rela` and `function`.
#
# The task is to rebuild this from the
# [layout widgets of ipywidgets](https://ipywidgets.readthedocs.io/en/7.6.3/examples/Widget%20Styling.html),
# such as Box, HBox, VBox, HTML.
#
# We start with something simpler, the first phrase (`in beginning`), without the passage reference.
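
# A first, minimal sketch (assuming the standard BHSA setup loaded above):
# give each word of the first phrase an editable Text widget for its gloss.
# In BHSA the gloss lives on the lex node above the word.

firstPhrase = L.d(v1, "phrase")[0]

glossBoxes = [
    Text(
        value=F.gloss.v(L.u(w, "lex")[0]),
        layout=Layout(width="100px"),
    )
    for w in L.d(firstPhrase, "word")
]
HBox(glossBoxes)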
Example #16
        self.report('processing conjunction pairs...')
        self.conj = Conjunction(tf, **base_sets)
        self.report('\tdone')

        self.report('processing construct pairs...')
        self.cons = Construct(tf, **base_sets)
        self.report('\tdone')

    def report(self, mssg):
        if not self.silent:
            print(mssg)


# set up TF
print('Setting up Text-Fabric...')
A = use('bhsa', hoist=globals(), silent=True)
print('\tdone...')

print('\n-- RUNNING WORDSETS --\n')
wsets = WordSets(A, silent=False)
print('\n-- WSETS COMPLETE --')

print('\npickling word sets...')
export = {
    'noms': wsets.noms,
    'preps': wsets.preps,
    'quants': wsets.quants,
    'accent_type': wsets.accents.accenttype,
    'mwords': wsets.accents.mwords,
    'conj_pairs': wsets.conj.pairs,
    'cons_pairs': wsets.cons.pairs,
Example #17
 def getDataFromDir():
     TF = Fabric(locations=dataDir, modules=[""])
     api = TF.loadAll()
     A = use(appFolder, api=api)
     return A
Example #18
import os
from sys import exit, stderr
from collections import defaultdict, Counter
from glob import glob
from pprint import pprint
from operator import itemgetter, attrgetter

import pandas as pd

from tf.app import use
from tf.fabric import Fabric
from utils import *

A = use('bhsa',
        version='2017',
        mod='cmerwich/participant-analysis/coreference/tf:clone',
        hoist=globals(),
        silent=True)


class ValueData:
    def __init__(self, quintuple):
        self.ct = quintuple[0]
        self.seqNum = int(quintuple[1])
        self.isSuffix = quintuple[3] == 's'
        self.wordPart = quintuple[4]
        if quintuple[2] == '':
            self.size = 1
        else:
            self.size = int(quintuple[2])
Example #19
# ### In notebooks
#
# This notebook is an example of how you can work with the new data.
#
# ## Using sets in queries
#
# You can use the names of sets in all places where you currently use `word`, `sign`, `face`, etc.
# More info in the [docs](https://annotation.github.io/text-fabric/tf/about/searchusage.html).

from tf.app import use
from tf.lib import readSets

A = use(
    "oldbabylonian:clone",
    version="1.0.4",
    checkout="clone",
    hoist=globals(),
    mod="annotation/tutorials/oldbabylonian/cookbook/pos/tf:clone",
)
# A = use('oldbabylonian', hoist=globals(), mod='annotation/tutorials/oldbabylonian/cookbook/pos/tf')
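
# Sets made by hand (e.g. with `writeSets`) can be read back and passed to
# `A.search`. A sketch: the file name `sets.tfx` and the set name `noun` are
# assumptions for illustration.

sets = readSets("sets.tfx")

query = """
noun
"""

results = A.search(query, sets=sets)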

# Note that the features `pos` and `subpos` and friends are loaded now.
#
# Let's print the frequency lists of their values.
# First a convenience function to print the frequency list of an arbitrary feature.


def freqList(feat):
    for (p, n) in Fs(feat).freqList():
        print(f"{p:<12}: {n:>5} x")
Example #20
# # Cluster display in Old Babylonian
#
# We show some details of the display logic by following an example: cluster nodes in the Old Babylonian corpus.
#
# Clusters are difficult, because
#
# * they do not necessarily respect proper embedding
# * material can be part of several clusters
#
# We show how we deal with the second complication and prevent displaying members of multiple clusters more than once.
# As an illustration, we'll show the effect of an earlier bug and indicate the fix.
#
# We start with loading the corpus.

A = use("oldbabylonian:clone", checkout="clone", hoist=globals())

A.reuse()

# # An example line
#
# Here is a line with some nested clusters.
# In fact, it is the first line of the corpus.
#
# The node number is stored in the variable `ln`.
#
# We show the raw ATF source of the line, and the text according to several text formats.

ln = F.otype.s("line")[0]
ln
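
# A sketch of the two displays announced above, assuming the Old Babylonian
# app exposes `getSource` as in its tutorials; the format names are whatever
# `T.formats` lists.

for srcLine in A.getSource(ln):
    print(srcLine)

for fmt in T.formats:
    print(f"{fmt:<25} {T.text(ln, fmt=fmt)}")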
Example #21
def main():
    """Writes features to corpora folder."""
    use('bhsa', hoist=globals())

    data = {
        'book_idx': [],
        'book': [],
        'chapter': [],
        'verse': [],
        'clause': [],
        'word': [],
        'lexeme': [],
        'word_pos': [],
        'verbal_stem': [],
        'word_number': [],
        'verbal_tense': [],
        'clause_type': [],
        'phrase_function': [],
        'language': [],
        'type': []
    }

    all_books = {
        T.bookName(b).lower(): i
        for i, b in enumerate(F.otype.s('book'))
    }

    for book_name in all_books:
        book_idx = all_books[book_name]

        logger.info(f"Extracting {book_name}...")

        b = F.otype.s('book')[book_idx]

        for i, c in enumerate(L.d(b, 'chapter')):

            for j, v in enumerate(L.d(c, 'verse')):

                for k, cl in enumerate(L.d(v, 'clause')):

                    for l, p in enumerate(L.d(cl, 'phrase')):

                        for w in L.d(p, 'word'):

                            row_dict = {
                                'book_idx': book_idx,
                                'book': book_name,
                                'chapter': i,
                                'verse': j,
                                'clause': k,
                                'word': T.text(w).strip(),
                                'lexeme': F.lex_utf8.v(w),
                                'word_pos': F.sp.v(w),
                                'verbal_stem': F.vs.v(w),
                                'word_number': F.nu.v(w),
                                'verbal_tense': F.vt.v(w),
                                'clause_type': F.typ.v(cl),
                                'phrase_function': F.function.v(p),
                                'language': F.language.v(w),
                                'type': F.txt.v(cl),
                            }

                            data = _append_to_main_dict(data, row_dict)

    data_df = pd.DataFrame(data)

    data_df['domain'] = [typ[-1] for typ in data_df['type']]

    data_df.to_csv(os.path.join(MAIN_DIR, 'corpora', 'main_corpus.csv'),
                   index=False)
Example #22
import re, collections, csv
from collections import defaultdict
import pandas as pd
from anytree import Node, RenderTree, findall, findall_by_attr, find_by_attr
from tf.app import use
A = use('bhsa', hoist=globals(), mod='ch-jensen/participants/actor/tf', silent=True)

class GenerateNodes:

    def __init__(self, book, chapter):
        self.book = book
        self.chapter = chapter
    
    def nodeList(self):
        '''
        Generates a node list consisting of all phrase atom, subphrase, and word nodes of the given book and chapter.
        '''
        chapter_node = T.nodeFromSection((self.book, self.chapter))
        phrase_atom_list = L.d(chapter_node, 'phrase_atom')

        node_list = []
        for n in phrase_atom_list:
            node_list.append(n)
            for subph in L.d(n, 'subphrase'):
                node_list.append(subph)
            for w in L.d(n, 'word'):
                node_list.append(w)

        return node_list
    
    def actorLabel(self, n, t='string'):
Example #23
File: app.py Project: annotation/banks
from tf.app import use

# We do not only load the main corpus data, but also the additional *sim* (similarity) feature that is in a
# module.
#
# For the very last version, use `hot`.
#
# For the latest release, use `latest`.
#
# If you have cloned the repos (TF app and data), use `clone`.
#
# If you do not want/need to upgrade, leave out the checkout specifiers.

A = use(
    "annotation/banks:hot",
    mod="annotation/banks/sim/tf",
    hoist=globals(),
)
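
# For comparison (sketches, pick at most one): the other checkout flavours
# mentioned above, with the same suffixes on the `mod` specifier.
#
# A = use("annotation/banks:latest", mod="annotation/banks/sim/tf:latest", hoist=globals())
# A = use("annotation/banks:clone", checkout="clone", mod="annotation/banks/sim/tf:clone", hoist=globals())
# A = use("annotation/banks", mod="annotation/banks/sim/tf", hoist=globals())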

# # Use the similarity edge feature
#
# We print all similar pairs of words that are at least 50% similar but not 100%.

query = """
word
<sim>50> word
"""

results = A.search(query)

A.table(results, end=10, withPassage="1 2")
Example #24
    'JOB': 'Iob',
    'PRO': 'Proverbia',
    'RUT': 'Ruth',
    'SNG': 'Canticum',
    'ECC': 'Ecclesiastes',
    'LAM': 'Threni',
    'EST': 'Esther',
    'DAN': 'Daniel',
    'EZR': 'Esra',
    'NEH': 'Nehemia',
    '1CH': 'Chronica I',
    '2CH': 'Chronica II'
}
#num_book_dict = {'01': 'GEN', '02': 'EXO', '03': 'LEV', '04': 'NUM', '05': 'DEU', '06': 'JOS', '07': 'JDG', '08': 'RUT', '09': '1SA', '10': '2SA', '11': '1KI', '12': '2KI', '13': '1CH', '14': '2CH', '15': 'EZR', '16': 'NEH', '17': 'EST', '18': 'JOB', '19': 'PSA', '20': 'PRO', '21': 'ECC', '22': 'SNG', '23': 'ISA', '24': 'JER', '25': 'LAM', '26': 'EZK', '27': 'DAN', '28': 'HOS', '29': 'JOL', '30': 'AMO', '31': 'OBA', '32': 'JON', '33': 'MIC', '34': 'NAM', '35': 'HAB', '36': 'ZEP', '37': 'HAG', '38': 'ZEC', '39': 'MAL'}

A = use('bhsa', hoist=globals())

for i, b in enumerate(F.otype.s('book')):
    book = T.bookName(b)
    if "_" in book:
        book = book.replace("_", " ").replace("1", "I").replace("2", "II")
    book_ptx_abrev = bhsa_book_list_with_ptx_ids[i]
    book_num = book_num_dict[book_ptx_abrev]
    book_heb = ptx_ids_bhs_names[book_ptx_abrev]
    book_latin = ptx_ids_latin_names[book_ptx_abrev]
    #if "Chron" in book:
    # print(book)
    # Write file
    filename = "SFMs\\" + book_num + book_ptx_abrev + "BHSA" + ".sfm"
    full_filename = path + filename
    with codecs.open(full_filename, "w", "utf-8") as file:
Example #25
import pprint as pp
from tf.app import use
import numpy as np
import random
A = use('bhsa:hot', hoist=globals())

# NAIVE BAYES FUNCTIONS START


def gen_verse(book, chapter, verse, verbose=0, lex_format="Hebrew"):
    """
    returns a list of lexemes of a selected verse
    :param lex_format: format of lexemes. "Hebrew" for Hebrew
    :param book: Book name as string
    :param chapter: number of the chapter
    :param verse:  number of the verse
    :param verbose: 1 for printing 0 for not printing the verse
    :return: a tuple of (list of lexemes from the selected verse, book name)
    """
    indices = L.d(T.nodeFromSection((book, chapter, verse)), 'word')
    if lex_format == "Hebrew":
        verse_by_lexemes = [F.lex_utf8.v(word_idx) for word_idx in indices]
    else:
        verse_by_lexemes = [F.lex.v(word_idx) for word_idx in indices]
    if verbose == 1:
        print(verse_by_lexemes)
    return verse_by_lexemes, book


def gen_book_vocab(books, chapters=None, verses=None):
    """
Example #26
import os
from sys import exit, stderr
from collections import defaultdict, Counter
from glob import glob
from pprint import pprint
from operator import itemgetter, attrgetter

import pandas as pd

from tf.app import use
from tf.fabric import Fabric
from utils import converse_pgn, suffix_dict

A = use('bhsa',
        version='2017',
        mod=('cmerwich/participant-analysis/coreference/tf,'
             'cmerwich/bh-reference-system/tf'),
        hoist=globals(),
        silent=True)


class ValueData:
    def __init__(self, quintuple):
        self.ct = quintuple[0]
        self.seqNum = int(quintuple[1])
        self.isSuffix = quintuple[3] == 's'
        self.wordPart = quintuple[4]
        if quintuple[2] == '':
            self.size = 1
        else:
            self.size = int(quintuple[2])
Example #27
def loadCorpus():
    A = use("bhsa", silent="deep")
    return A