Example #1
File: tfFromSyrnt.py Project: ETCBC/syrnt
def loadTf():
    print('Load TF dataset for the first time')
    TF = Fabric(locations=TF_PATH, modules=[''])
    api = TF.load('')
    allFeatures = TF.explore(silent=False, show=True)
    loadableFeatures = allFeatures['nodes'] + allFeatures['edges']
    TF.load(loadableFeatures, add=True)
    print('All done')
    return api
Example #2
File: tfFromMorph.py Project: q-ran/quran
def loadTf():
    TF = Fabric(locations=[OUT_DIR])
    allFeatures = TF.explore(silent=True, show=True)
    loadableFeatures = allFeatures['nodes'] + allFeatures['edges']
    api = TF.load(loadableFeatures, silent=False)
    if api:
        print(f'max node = {api.F.otype.maxNode}')
        print(api.F.root.freqList()[0:20])
Example #3
def loadTf():
    TF = Fabric(locations=[f'{TF_PATH}/{VERSION}'])
    allFeatures = TF.explore(silent=True, show=True)
    loadableFeatures = allFeatures['nodes'] + allFeatures['edges']
    api = TF.load(loadableFeatures)
    if api:
        print(f'max node = {api.F.otype.maxNode}')
        print(api.F.root.freqList()[0:20])
Example #4
def loadTf(outDir):
    TF = Fabric(locations=[outDir])
    allFeatures = TF.explore(silent=True, show=True)
    loadableFeatures = allFeatures["nodes"] + allFeatures["edges"]
    api = TF.load(loadableFeatures, silent=False)
    if api:
        print(f"max node = {api.F.otype.maxNode}")
        print("Frequencies of words")
        for (word, n) in api.F.letters.freqList()[0:20]:
            print(f"{n:>6} x {word}")
Example #5
def gather(locations, modules):
    TF = Fabric(locations=locations, modules=modules, silent=True)
    api = TF.load(FEATURES, silent=True)

    for node in api.F.otype.s('book'):
        book = api.T.sectionFromNode(node)[0]
        print(book)
        dump_book(api, book)

    with open(os.path.join(DATADIR, 'verse_nodes.pkl'), 'wb') as f:
        pickle.dump(VERSE_NODES, f)
Example #6
    def load_tf(self):
        '''
        Loads an instance of TF if necessary.
        '''

        # load BHSA Hebrew data
        TF = Fabric(bhsa_data_paths, silent=True)
        tf_api = TF.load('''
                        function lex vs language
                        pdp freq_lex gloss domain ls
                        heads prep_obj mother rela
                        typ sp sem_domain sem_domain_code
                      ''',
                         silent=True)

        self.tf_api = tf_api
Example #7
    def load_tf_bhsa(self):
        '''
        Loads a TF instance of the BHSA dataset.
        '''
        TF = Fabric(
            locations='~/github',
            modules=['etcbc/bhsa/tf/c', 'semantics/phase1/tf/c'
                     ],  # modify paths here for your system
            silent=True)
        api = TF.load('''
                        book chapter verse
                        function lex vs language
                        pdp freq_lex gloss domain ls
                        heads
                      ''',
                      silent=True)

        B = Bhsa(api, '4. Semantic Space Construction', version='c')

        return api, B
Example #8
def main():
    TF = Fabric(modules=['hebrew/etcbc4c'],
                locations='~/VersionControl/etcbc-data',
                silent=True)
    api = TF.load('language g_word_utf8 lex_utf8 vs vt gn nu ps', silent=True)
    api.makeAvailableIn(globals())

    data = Databank()

    for n in N():
        try:
            handle(n, data)
        except (KeyError, ValueError):
            pass

    print(len(data.verbs), len(data.roots))

    with open('etcbc-verbs.csv', 'w') as csvverbs:
        verbwr = csv.writer(csvverbs, quoting=csv.QUOTE_MINIMAL)
        #verbwr.writerow(['id', 'verb','root','stem','tense','person','gender','number','active'])
        i = VERB_STARTID
        for verb in data.verbs:
            verbwr.writerow([
                i, verb.verb, verb.root, verb.stem, verb.tense,
                verb.person if verb.person is not None else 'NULL',
                verb.gender if verb.gender is not None else 'NULL',
                verb.number if verb.number is not None else 'NULL', 1
            ])
            i += 1

    with open('etcbc-roots.csv', 'w') as csvroots:
        rootwr = csv.writer(csvroots, quoting=csv.QUOTE_MINIMAL)
        #rootwr.writerow(['id', 'root', 'root_kind_id'])
        i = ROOT_STARTID
        for root in data.roots:
            rootwr.writerow([i, root.lex, 1])
            i += 1
Example #9
from tf.fabric import Fabric
import collections
import sys
# https://etcbc.github.io/bhsa/features/hebrew/4b/features/comments/g_lex_utf8.html

TF = Fabric(locations='/home/chaim/github/text-fabric-data',
            modules=['hebrew/etcbc4c'])
#TF = Fabric(locations='c:/josh/text-fabric-data/text-fabric-data', modules=['hebrew/etcbc4c'])
api = TF.load(
    'sp lex g_word g_word_utf8 trailer_utf8 ls typ rela function qere_utf8 qere'
)
api.makeAvailableIn(globals())

F = api.F
T = api.T
C = api.C
L = api.L

#print(sorted(T.formats))


def print_original_words():

    for i in range(1, 12):
        print(api.T.text([i], 'text-orig-full'))


# for w in F.otype.s('word'):
#     word, part_of_speech = F.g_word.v(w), F.sp.v(w)
#     print(word, part_of_speech)
#     if w == 14:
Example #10
File: coreData.py Project: cmerwich/bhsa
#
# Just to see whether everything loads and the precomputing of extra information works out.
# Moreover, if you want to work with these features, then the precomputing has already been done, and everything is quicker in subsequent runs.
#
# We issue a load statement to trigger the precomputing of extra data.
# Note that all features used in the text formats specified in the `otext` config feature
# will be loaded, as well as the features needed for sections.
#
# At that point we have access to the full list of features.
# We grab them and are going to load them all!

# In[5]:

utils.caption(4, 'Load and compile standard TF features')
TF = Fabric(locations=thisTf, modules=[''])
api = TF.load('')

utils.caption(4, 'Load and compile all other TF features')
allFeatures = TF.explore(silent=False, show=True)
loadableFeatures = allFeatures['nodes'] + allFeatures['edges']
api = TF.load(loadableFeatures)
api.makeAvailableIn(globals())

# # Examples

# In[12]:

utils.caption(4, 'Basic test')
utils.caption(4, 'First verse in all formats')
for fmt in T.formats:
    utils.caption(0, '{}'.format(fmt), continuation=True)
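# The cell above is cut off after printing the format names; a hedged sketch (not the
# notebook's own code) of rendering the first verse in every format, assuming the
# English book name 'Genesis' and the standard T.nodeFromSection / T.text API:

firstVerse = T.nodeFromSection(('Genesis', 1, 1))
for fmt in T.formats:
    utils.caption(0, '{}:'.format(fmt), continuation=True)
    utils.caption(0, T.text([firstVerse], fmt), continuation=True)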
Example #11
# In[14]:

utils.caption(4, "Load the existing TF dataset")
TF = Fabric(locations=[coreTf, thisTf], modules=[""])

# We instruct the API to load data.

# In[8]:

# In[15]:

api = TF.load("""
    function rela typ
    g_word_utf8 trailer_utf8
    lex prs uvf sp pdp ls vs vt nametype gloss
    book chapter verse label number
    s_manual f_correction
    valence predication grammatical original lexical semantic
    mother
""")
api.makeAvailableIn(globals())

# # Indicators
#
# Here we specify by what features we recognize key constituents.
# We use predominantly features that come from the correction/enrichment workflow.

# In[9]:

# pf ... : predication feature
# gf_... : grammatical feature
Example #12
File: lexicon.py Project: OsvaldoJ/bhsa
    otextInfo = dict(line[1:].split('=', 1)
                     for line in LEX_FORMATS.strip('\n').split('\n'))
    for x in sorted(otextInfo.items()):
        utils.caption(0, '{:<30} = "{}"'.format(*x))

# # Lexicon preparation
# We add lexical data.
# The lexical data will not be added as features of words, but as features of lexemes.
# The lexemes will be added as fresh nodes, of a new type `lex`.

# In[8]:

utils.caption(4, 'Load the existing TF dataset')
TF = Fabric(locations=thisTf, modules=[''])
vocLex = ' g_voc_lex g_voc_lex_utf8 ' if DO_VOCALIZED_LEXEME else ''
api = TF.load('lex lex_utf8 language sp ls gn ps nu st oslots {} {}'.format(
    vocLex, EXTRA_OVERLAP))
api.makeAvailableIn(globals())

# # Text pass
# We map the values in the language feature to standardized ISO values: `arc` and `hbo`.
# We run over all word occurrences, grab the language and lexeme identifier, and create for each
# unique pair a new lexeme node.
#
# We remember the mapping between nodes and lexemes.
#
# This stage does not yet involve the lexical files.

# In[9]:

utils.caption(4, 'Collect lexemes from the text')
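# A hedged sketch (not the notebook's own code) of the pass described above, assuming
# `language` yields 'Hebrew'/'Aramaic' and `lex` the lexeme identifier of a word:

langFromText = {'Hebrew': 'hbo', 'Aramaic': 'arc'}
lexFromWord = {}   # word node -> (ISO language, lexeme identifier)
wordsFromLex = {}  # (ISO language, lexeme identifier) -> word nodes
for w in F.otype.s('word'):
    lang = langFromText.get(F.language.v(w), F.language.v(w))
    lexId = (lang, F.lex.v(w))
    lexFromWord[w] = lexId
    wordsFromLex.setdefault(lexId, []).append(w)
utils.caption(0, '{} distinct lexemes collected'.format(len(wordsFromLex)))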
Example #13
if thisOtext == '':
    utils.caption(0, 'No additional text formats provided')
    otextInfo = {}
else:
    utils.caption(0, 'New text formats')
    otextInfo = dict(line[1:].split('=', 1) for line in thisOtext.strip('\n').split('\n'))
    for x in sorted(otextInfo.items()):
        utils.caption(0, '{:<30} = "{}"'.format(*x))


# In[7]:


utils.caption(4, 'Load the existing TF dataset')
TF = Fabric(locations=thisTf, modules=[''])
api = TF.load('label g_word g_cons trailer_utf8')
api.makeAvailableIn(globals())


# # Verse labels
# The ketiv-qere files deal with different verse labels.
# We make a mapping between verse labels and nodes.

# In[8]:


utils.caption(0, 'Mapping between verse labels and verse nodes')
nodeFromLabel = {}
for vs in F.otype.s('verse'):
    lab = F.label.v(vs)
    nodeFromLabel[lab] = vs
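
# Hypothetical usage sketch (not in the original): round-trip one existing verse label
# through the mapping to check that it resolves back to the same node.
firstVerse = next(iter(F.otype.s('verse')))
assert nodeFromLabel[F.label.v(firstVerse)] == firstVerse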
Example #14
import json
import codecs
import csv
# from flask import request

from sblgnt_back.controller import translate as tr
from sblgnt_back.lib import vcodeparser as vp

SBLGNT = 'sblgnt'
TG = Fabric(modules=SBLGNT, silent=False)

gnt = TG.load('''
    book chapter verse
    g_word trailer
    otext otype psp
    Case Gender Mood Number
    Person Tense Voice
    UnicodeLemma gloss strong
    transliteration ClType function
''')


# Load the translation text
# Quote the translation from our own json file
def json_to_verse(ver, book, chp, bib):
    path = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))

    if bib == "old":
        book_code = {
            "Genesis": 0,
            "Exodus": 1,
Example #15
sys.path.append('scripts')
from build_tables import build_sample_tables

# fire up Text-Fabric with BHSA data
TF = Fabric(snakemake.input['tf_mods'], silent='deep')
features = """
sp pdp vs vt ps gn nu
lex language gloss voc_lex voc_lex_utf8
function number label 
typ code rela mother domain txt 
genre
sense
nhead
funct_assoc
"""
bhsa = TF.load(features, silent='deep')
F, E, T, L, Fs = bhsa.F, bhsa.E, bhsa.T, bhsa.L, bhsa.Fs

# load GBI Hebrew data
with open(snakemake.input.bhsa2gbi, 'rb') as infile:
    bhsa2gbi = pickle.load(infile)

# preprocess data
bookmap = get_book_maps(bhsa)
loca_lexs = get_loca_assocs(bhsa)


def join_on(nodes, jchar='_', default=''):
    """Join words on a char and ensure they are pre/appended with that char.
    
    The pre/appending provides easy-to-match word boundaries.
예제 #16
0
if SCRIPT:
    (good,
     work) = utils.mustRun(None,
                           '{}/.tf/{}.tfx'.format(thisTf, newFeatures[0]),
                           force=FORCE)
    if not good: stop(good=False)
    if not work: stop(good=True)

# # Load existing data

# In[17]:

utils.caption(4, 'Loading relevant features')

TF = Fabric(locations=thisTf, modules=[''])
api = TF.load('book')
api.makeAvailableIn(globals())

nodeFeatures = {}
nodeFeatures['book@la'] = {}

bookNodes = []
for b in F.otype.s('book'):
    bookNodes.append(b)
    nodeFeatures['book@la'][b] = F.book.v(b)

for (langCode, langBookNames) in bookNames.items():
    nodeFeatures['book@{}'.format(langCode)] = dict(
        zip(bookNodes, langBookNames))
utils.caption(0, '{} book name features created'.format(len(nodeFeatures)))
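
# A hedged follow-up sketch (not part of the original snippet): features assembled like
# this are normally written back to disk with TF.save; minimal metadata is assumed here.
metaData = {ft: dict(valueType='str') for ft in nodeFeatures}
TF.save(nodeFeatures=nodeFeatures, edgeFeatures={}, metaData=metaData)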
Example #17
def ingest_french(paths):
    """Match the French data to our dataset."""

    # load the French dataset
    with open(paths['source'], 'r') as infile:
        reader = csv.reader(infile, delimiter='\t')
        french_data = list(reader)

    # load the BHSA Hebrew data for matching the Hebrew text
    TF = Fabric(locations=paths['bhsa'])
    API = TF.load('g_word_utf8')
    F, T, L = API.F, API.T, API.L

    # match the Hebrew verbs in the French data with the
    # Hebrew verbs in BHSA
    # we treat the ref strings as unique ID's
    # we use 2 dicts; one to hold ID 2 BHSA node mappings
    # another to hold the IDs 2 french data
    french2bhsa = {}
    french2data = {}
    frenchverses = {}

    for row in french_data:

        # parse French data
        wid = row[0]
        hb_txt, hb_lex, hb_tag, hb_prev = row[1:5]
        fr_words, fr_verse = row[5:7]
        bk, ch, vs, sg, wnum = parse_refstring(wid)
        french2data[wid] = {
            'wid': wid,
            'hebrew': hb_txt,
            'hebrew_parse': hb_tag,
            'french': fr_words,
        }

        # look up BHSA data and get the verse node
        tf_book = int2book[bk]
        vrs_node = T.nodeFromSection((tf_book, ch, vs))
        if vrs_node is None:
            raise Exception((tf_book, ch, vs), wid, hb_txt)

        # save the French verse text
        ref_string = str((tf_book, ch, vs))
        frenchverses[ref_string] = fr_verse
        french2data[wid]['ref'] = ref_string

        # get the closest matching word from the verse;
        # NB we iterate over the verse words in reversed order
        # so that if there are 2+ words with equivalent distances,
        # we always end on the one that is first in the verse;
        # the match is then added to a set so that it is not
        # available for subsequent matches
        french2bhsa[wid] = BhsaWord(0, float('inf'))  # initialize with dummy
        matched = set()
        for word_node in reversed(L.d(vrs_node, 'word')):
            if word_node in matched:
                continue
            bhsa_txt = T.text(word_node)
            dist = levdist(bhsa_txt, hb_txt)
            if french2bhsa[wid].dist > dist:
                french2bhsa[wid] = BhsaWord(word_node, dist)
        matched.add(french2bhsa[wid].node)

    # iterate over both french dicts and assemble
    # into one BHSA dict
    bhsa2french = {}
    for wid, bhsa_word in french2bhsa.items():
        bhsa_node = bhsa_word.node
        if bhsa_node != 0:
            bhsa2french[bhsa_node] = french2data[wid]

    # the linking is complete
    with open(paths['out'], 'w') as outfile:
        json.dump(bhsa2french, outfile, indent=2, ensure_ascii=False)

    with open(paths['out_verses'], 'w') as outfile:
        json.dump(frenchverses, outfile, indent=2, ensure_ascii=False)
Example #18
File: paragraphs.py Project: OsvaldoJ/bhsa
provenanceMetadata = dict(
    dataset='BHSA',
    datasetName='Biblia Hebraica Stuttgartensia Amstelodamensis',
    version=VERSION,
    author='Eep Talstra Centre for Bible and Computer',
    encoders='Constantijn Sikkel (QDF), and Dirk Roorda (TF)',
    website='https://shebanq.ancient-data.org',
    email='*****@*****.**',
)

# In[7]:

utils.caption(4, 'Load the existing TF dataset')
TF = Fabric(locations=thisTf, modules=[''])
api = TF.load('label number')
api.makeAvailableIn(globals())

# # Clause atom identifiers in .px
# We must map the way the clause_atoms are identified in the `.px` files
# to nodes in TF.

# In[8]:

utils.caption(0, '\tLabeling clause_atoms')

labelNumberFromNode = {}
nodeFromLabelNumber = {}
for n in N():
    otype = F.otype.v(n)
    if otype == 'book':
Example #19
from tf.fabric import Fabric
import collections
import sys
# https://etcbc.github.io/bhsa/features/hebrew/4b/features/comments/g_lex_utf8.html

#TF = Fabric(locations='/home/chaim/github/text-fabric-data', modules=['hebrew/etcbc4c'])
TF = Fabric(locations='c:/josh/text-fabric-data/text-fabric-data',
            modules=['hebrew/etcbc4c'])
api = TF.load('qere_utf8 qere lex0 g_word_utf8 g_word')
api.makeAvailableIn(globals())

F = api.F
T = api.T
C = api.C
L = api.L


def print_original_words():

    for i in range(1, 12):
        print(api.T.text([i], 'text-orig-full'))


# for w in F.otype.s('word'):
#     word, part_of_speech = F.g_word.v(w), F.sp.v(w)
#     print(word, part_of_speech)
#     if w == 14:
#         break

import sys
Example #20
     work) = utils.mustRun(None,
                           '{}/.tf/{}.tfx'.format(thisTf, newFeatures[0]),
                           force=FORCE)
    if not good: stop(good=False)
    if not work: stop(good=True)

# # Collect
#
# We collect the statistics.

# In[6]:

utils.caption(4, 'Loading relevant features')

TF = Fabric(locations=thisTf, modules=[''])
api = TF.load('{} {} {}'.format(LANG_FEATURE, LEX_FEATURE, OCC_FEATURE))
api.makeAvailableIn(globals())

hasLex = 'lex' in set(F.otype.all)

# In[7]:

utils.caption(0, 'Counting occurrences')
wstats = {
    'freqs': {
        'lex': collections.defaultdict(lambda: collections.Counter()),
        'occ': collections.defaultdict(lambda: collections.Counter()),
    },
    'ranks': {
        'lex': collections.defaultdict(lambda: {}),
        'occ': collections.defaultdict(lambda: {}),
Example #21
File: index.py Project: jcuenod/lafwebpy
from tf.fabric import Fabric

### set up app - we're going to use it for gzip middleware ###

app = Bottle()

### load up TF ###

TF = Fabric(locations='../text-fabric-data', modules='hebrew/etcbc4c')
api = TF.load('''
	book chapter verse
	sp nu gn ps vt vs st
	otype
	det
	g_word_utf8 trailer_utf8
	lex_utf8 lex voc_utf8
	g_prs_utf8 g_uvf_utf8
	prs_gn prs_nu prs_ps g_cons_utf8
	gloss sdbh lxxlexeme
	accent accent_quality
	tab typ
''')
api.makeAvailableIn(globals())

### WORD API ###


def remove_na_and_empty_and_unknown(list_to_reduce):
    templist = list_to_reduce
    keys_to_remove = set()
    for key, value in templist.items():
Example #22
import json
import pickle
import collections
from tf.fabric import Fabric
from pathlib import Path
from verb_form import get_verbform

# load basic BHSA data with Text-Fabric
TF = Fabric(snakemake.input, silent='deep')
bhsa = TF.load('pdp lex vt language', silent='deep')
F, L = bhsa.F, bhsa.L

# load GBI data for verb_form creation
with open(snakemake.input.bhsa2gbi, 'rb') as infile:
    bhsa2gbi = pickle.load(infile)

# loop through all verbs stored in the BHSA
# and select those forms specified by the wildcard
samples = []
for node in F.pdp.s('verb'):

    # skip non-hebrew words
    if F.language.v(node) != 'Hebrew':
        continue 

    verb_form = get_verbform(node, bhsa, bhsa2gbi)
    get_form = snakemake.wildcards.verb
    
    # handle cohortatives / jussives
    if get_form == 'yqtl' and verb_form in {'jussM', 'cohoM'}:
        samples.append(node)
Example #23
# @app.after_request
# def set_response_headers(r):
#     r.headers['Cache-Control'] = 'public, max-age=3600'
#     return r

### Load up TF ###
ETCBC = 'hebrew/etcbc4c'
TF = Fabric(locations='text-fabric-data', modules=ETCBC)
#api = TF.load('book')

api = TF.load('''
    book chapter verse
    nu gn ps vt vs st
    otype typ function
    det pdp qere_utf8 qere_trailer_utf8
    g_word_utf8 trailer_utf8
    lex_utf8 lex voc_utf8
    g_prs_utf8 g_uvf_utf8
    prs_gn prs_nu prs_ps g_cons_utf8
    gloss phono 
''')

api.makeAvailableIn(globals())

# Related to the kml files
book_abb = {
    "Genesis": "gen",
    "Exodus": "exod",
    "Leviticus": "lev",
    "Numbers": "num",
    "Deuteronomy": "deut",
Example #24
import sys
import unittest

from tf.fabric import Fabric

# LOAD THE TEST CORPUS

TF = Fabric('tf')
api = TF.load('sign name')
F = api.F
S = api.S

# MAKE CUSTOM SETS OF NODES

Sign = set(range(1, F.otype.maxSlot + 1))
Node = set(range(1, F.otype.maxNode + 1))

sets = dict(Sign=Sign, Node=Node)

# RUN A QUERY, OPTIONALLY WITH CUSTOM SETS


def query(template, sets=None):

    return (tuple(S.search(template)) if sets is None else tuple(
        S.search(template, sets=sets)))
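
# Hypothetical usage sketch (not one of the original tests), assuming the slot type of
# this test corpus is `sign`: with sets=sets the template may use the custom set names.
allSigns = query('''
sign
''')
allSignsViaSet = query('''
Sign
''', sets=sets)
assert len(allSigns) == len(allSignsViaSet)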


# DEFINE THE TESTS

relationKey = {
Example #25
else:
    locations = {}
if not locations:
    raise Exception('Please add your data paths in bhsa.py line 30.')
for path in locations:
    if not os.path.exists(path):
        raise Exception(
            f'You need an extra datamodule in {os.path.dirname(path)}. Do "git pull {locations[path]}" to this location.'
        )

# load TF and BHSA data
TF = Fabric(locations=locations.keys(), modules='2017', silent=True)
api = TF.load('''
              otype language
              book chapter verse
              function domain
              typ pdp kind tree
              crossref
              ''',
              silent=True)

api.makeAvailableIn(globals())  # globalize TF methods

# define book groups & names

lbh_books = ('Song_of_songs', 'Ecclesiastes', 'Esther', 'Daniel', 'Ezra',
             'Nehemiah', '1_Chronicles', '2_Chronicles')
sbh_books = ('Genesis', 'Exodus', 'Leviticus', 'Deuteronomy', 'Joshua',
             'Judges', '1_Samuel', '2_Samuel', '1_Kings', '2_Kings')
test_books = ('Jonah', 'Ruth')

all_books = tuple(T.sectionFromNode(b)[0]
Example #26
    ENTRY_HEB = "voc_lex_utf8"
    PHONO_TRAILER = "phono_trailer"
    LANGUAGE = "languageISO"

# In[ ]:

TF = Fabric(locations=[thisRepo, phonoRepo], modules=[tfDir])
api = TF.load(f"""
        g_cons g_cons_utf8 g_word g_word_utf8 trailer_utf8
        {QERE} {QERE_TRAILER}
        {LANGUAGE} lex g_lex lex_utf8 sp pdp ls
        {ENTRY} {ENTRY_HEB}
        vt vs gn nu ps st
        nme pfm prs uvf vbe vbs
        gloss nametype root ls
        pargr
        phono {PHONO_TRAILER}
        function typ rela txt det
        code tab
        number
        freq_lex freq_occ
        rank_lex rank_occ
        book chapter verse
""")
api.makeAvailableIn(globals())

# In[6]:

hasLex = "lex" in set(F.otype.all)

# # Data model
Example #27
import os, sys, collections
from tf.fabric import Fabric

# locations = '~/github/etcbc'
locations = '/home/oem/text-fabric-data/etcbc'
coreModule = 'bhsa'
sources = [coreModule, 'phono']
# version = '2017'
version = 'c'
tempDir = os.path.expanduser(f'{locations}/{coreModule}/_temp/{version}/r')
tableFile = f'{tempDir}/{coreModule}{version}.txt'

modules = [f'{s}/tf/{version}' for s in sources]
TF = Fabric(locations=locations, modules=modules)

api = TF.load('')
api = TF.load(
    ('suffix_person', 'tab', 'trailer', 'trailer_utf8', 'txt', 'typ', 'uvf',
     'vbe', 'vbs', 'verse', 'voc_lex', 'voc_lex_utf8', 'vs', 'vt',
     'distributional_parent', 'functional_parent', 'mother', 'oslots'))
allFeatures = TF.explore(silent=False, show=True)
loadableFeatures = allFeatures['nodes'] + allFeatures['edges']
del (api)
api = TF.load(loadableFeatures)
api.makeAvailableIn(globals())

print('done')
Example #28
VERSION = "0.2"

TF_PATH = f"{TF_DIR}/{VERSION}"
TF = Fabric(locations=TF_PATH)
# -

# We ask for a list of all features:

allFeatures = TF.explore(silent=True, show=True)
loadableFeatures = allFeatures["nodes"] + allFeatures["edges"]
loadableFeatures

# We load all features:

api = TF.load(loadableFeatures, silent=False)

# You see that all files are marked with a `T`.
#
# That means that Text-Fabric loads the features by reading the plain text `.tf` files.
# But after reading, it makes a binary equivalent and stores it as a `.tfx`
# file in the hidden `.tf` directory next to it.
#
# Furthermore, you see some lines marked with `C`. Here Text-Fabric is computing derived data,
# mostly about sections, the order of nodes, and the relative positions of nodes with respect to the slots they
# are linked to.
#
# The results of this pre-computation are also stored in that hidden `.tf` directory.
#
# The next time, Text-Fabric loads the data from their binary `.tfx` files, which is much faster.
# And the pre-computation step will be skipped.
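
# A rough, hypothetical illustration (not from the original notebook) of the effect of that
# cache: a fresh Fabric instance now reads the binary `.tfx` files and skips the
# pre-computation, so the second load should be noticeably faster.

import time

t0 = time.time()
TF2 = Fabric(locations=TF_PATH)
api2 = TF2.load(loadableFeatures, silent=True)
print(f"second load took {time.time() - t0:.2f}s (served from the .tfx cache)")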
Example #29
def genTrees(version):
    C = setVersion(version)
    bhsa = C.bhsa
    sp = C.sp
    rela = C.rela
    ptyp = C.ptyp
    ctyp = C.ctyp
    g_word_utf8 = C.g_word_utf8
    tfDir = C.tfDir

    TF = Fabric(locations=f"{GH}/{ORG}", modules=bhsa)
    api = TF.load(f"{sp} {rela} {ptyp} {ctyp} {g_word_utf8} mother")

    E = api.E
    F = api.F
    Fs = api.Fs

    def getTag(node):
        otype = F.otype.v(node)
        tag = TYPE_TABLE[otype]
        if tag == "P":
            tag = Fs(ptyp).v(node)
        elif tag == "C":
            tag = ccrTable[Fs(rela).v(node)]
        isWord = tag == ""
        pos = POS_TABLE[Fs(sp).v(node)] if isWord else None
        slot = node if isWord else None
        text = f'"{Fs(g_word_utf8).v(node)}"' if isWord else None
        return (tag, pos, slot, text, isWord)

    def getTagN(node):
        otype = F.otype.v(node)
        tag = TYPE_TABLE[otype]
        if tag == "P":
            tag = Fs(ptyp).v(node)
        elif tag == "C":
            tag = ccrTable[Fs(rela).v(node)]
        isWord = tag == ""
        if not isWord:
            tag += "{" + str(node) + "}"
        pos = POS_TABLE[Fs(sp).v(node)] if isWord else None
        slot = node if isWord else None
        text = f'"{Fs(g_word_utf8).v(node)}"' if isWord else None
        return (tag, pos, slot, text, isWord)

    treeTypes = ("sentence", "clause", "phrase", "subphrase", "word")
    (rootType, leafType, clauseType, phraseType) = (
        treeTypes[0],
        treeTypes[-1],
        treeTypes[1],
        treeTypes[2],
    )
    ccrTable = dict((c[0], c[1][1]) for c in CCR_INFO.items())
    ccrClass = dict((c[0], c[1][0]) for c in CCR_INFO.items())

    tree = Tree(
        TF,
        otypes=treeTypes,
        phraseType=phraseType,
        clauseType=clauseType,
        ccrFeature=rela,
        ptFeature=ptyp,
        posFeature=sp,
        motherFeature="mother",
    )

    tree.restructureClauses(ccrClass)
    results = tree.relations()
    TF.info("Ready for processing")

    skip = set()
    TF.info("Verifying whether all slots are preserved under restructuring")
    TF.info(f"Expected mismatches: {EXPECTED_MISMATCHES.get(version, '??')}")

    errors = []
    # i = 10
    for snode in F.otype.s(rootType):
        declaredSlots = set(E.oslots.s(snode))
        results = {}
        thisgood = {}
        for kind in ("e", "r"):
            results[kind] = set(lt for lt in tree.getLeaves(snode, kind)
                                if F.otype.v(lt) == leafType)
            thisgood[kind] = declaredSlots == results[kind]
            # if not thisgood[kind]:
            #    print(f"{kind} D={declaredSlots}\n  L={results[kind]}")
            #    i -= 1
        # if i == 0: break
        if False in thisgood.values():
            errors.append((snode, thisgood["e"], thisgood["r"]))
    nErrors = len(errors)
    if nErrors:
        TF.error(f"{len(errors)} mismatches:")
        mine = min(20, len(errors))
        skip |= {e[0] for e in errors}
        for (s, e, r) in errors[0:mine]:
            TF.error(
                (f"{s} embedding: {'OK' if e else 'XX'};"
                 f" restructd: {'OK' if r else 'XX'}"),
                tm=False,
            )
    else:
        TF.info(f"{len(errors)} mismatches")

    TF.info(f"Exporting {rootType} trees to TF")
    s = 0
    chunk = 10000
    sc = 0
    treeData = {}
    treeDataN = {}
    for node in F.otype.s(rootType):
        if node in skip:
            continue
        (treeRep, wordsRep, bSlot) = tree.writeTree(node,
                                                    "r",
                                                    getTag,
                                                    rev=False,
                                                    leafNumbers=True)
        (treeNRep, wordsNRep, bSlotN) = tree.writeTree(node,
                                                       "r",
                                                       getTagN,
                                                       rev=False,
                                                       leafNumbers=True)
        treeData[node] = treeRep
        treeDataN[node] = treeNRep
        s += 1
        sc += 1
        if sc == chunk:
            TF.info(f"{s} trees composed")
            sc = 0
    TF.info(f"{s} trees composed")

    nodeFeatures = dict(tree=treeData, treen=treeDataN)
    metaData = dict(
        tree=dict(
            valueType="str",
            description="penn treebank represententation for sentences",
            converter="Dirk Roorda",
            convertor="trees.ipynb",
            url="https://github.com/etcbc/trees/trees.ipynb",
            coreData="BHSA",
            coreVersion=version,
        ),
        treen=dict(
            valueType="str",
            description="penn treebank representation for sentences with node numbers included",
            converter="Dirk Roorda",
            convertor="trees.ipynb",
            url="https://github.com/etcbc/trees/trees.ipynb",
            coreData="BHSA",
            coreVersion=version,
        ),
    )
    TF.info("Writing tree feature to TF")
    TFw = Fabric(locations=tfDir, silent=True)
    TFw.save(nodeFeatures=nodeFeatures, edgeFeatures={}, metaData=metaData)
Example #30
sp = "part_of_speech" if VERSION == "3" else "sp"
rela = "clause_constituent_relation" if VERSION == "3" else "rela"
ptyp = "phrase_type" if VERSION == "3" else "typ"
ctyp = "clause_atom_type" if VERSION == "3" else "typ"
g_word_utf8 = "text" if VERSION == "3" else "g_word_utf8"
# -


# In[7]:


api = TF.load(
    f"""
    {sp} {rela} {ptyp} {ctyp}
    {g_word_utf8}
    mother
"""
)
api.makeAvailableIn(globals())


# We are going to make convenient labels for constituents, words and clauses, based on
# the types of textual objects and the features
# `sp` and `rela`.

# ## Node types

# In[8]:
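
# A hedged sketch (not the notebook's own code) of such a labeling helper, using only the
# node type and the version-dependent feature names defined above:
def constituentLabel(node):
    otype = F.otype.v(node)
    if otype == "word":
        return Fs(sp).v(node)    # label words by their part of speech
    if otype in ("clause", "clause_atom"):
        return Fs(rela).v(node)  # label clauses by their constituent relation
    return otype                 # fall back to the object type itself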