Exemplo n.º 1
0
def write_boolean(f, key, DOrds):
    """
    Boolean [Mirrored etc.]

    Technically three-valued: every codepoint visited by `iter_ranges`
    is stored as a single character in `LValues`:

      '1' -> the codepoint has an entry and its value is truthy
      '0' -> the codepoint has an entry and its value is falsy
      'U' -> the codepoint has no entry (undefined; the default)

    `key` is not used by this writer.

    Returns a dict holding the results of `write_array` / `write_json`
    under the keys 'LValues', 'LIgnoreRanges' and 'LRanges'.

    TODO: Should this be a range?
    """
    chars = get_uni_array()  # one char per visited codepoint

    LRanges, DOrds = compress_ord_ranges(DOrds)

    # NOTE(review): the gaps are computed *before* the remaining ranges
    # are expanded to single ordinals; `write_encoding` does these two
    # steps the other way round -- confirm which order is intended.
    LIgnoreRanges = get_char_gaps(DOrds)
    DOrds = ranges_to_single_ords(DOrds)

    if not DOrds:
        # No single codepoints left, so the data must live in the ranges
        assert LRanges
    else:
        # Fill the data gaps
        for codepoint in iter_ranges(LIgnoreRanges, max(DOrds)):
            if codepoint not in DOrds:
                chars.append('U')
            elif DOrds[codepoint]:
                chars.append('1')
            else:
                chars.append('0')

    # Write to disk
    return {
        'LValues': write_array(f, chars),
        'LIgnoreRanges': write_json(f, LIgnoreRanges),
        'LRanges': write_json(f, LRanges),
    }
Exemplo n.º 2
0
def write_encoding(f, key, DOrds):
    """
    Encoding [BigFive etc]

    Variables: LSeek[+1] -> LValues[+1]

    For every codepoint visited by `iter_ranges`, `LSeek` holds either 0
    (no data) or the 1-based position in `LValues` at which that
    codepoint's record starts. A record is a count followed by that many
    encoded values. `LFlags` is appended to in lockstep with `LValues`.

    `key` is not used by this writer.

    Returns a dict holding the results of `write_array` / `write_json`
    under 'LSeek', 'LValues', 'LIgnoreRanges' and 'LRanges', plus
    'DFlags' and 'LFlags' when `coorce_to_encodings` returned any flags.
    """
    DFlags, DOrds = coorce_to_encodings(DOrds)
    LRanges, DOrds = compress_ord_ranges(DOrds)
    DOrds = ranges_to_single_ords(DOrds)
    LIgnoreRanges = get_char_gaps(DOrds)

    LSeek = get_int_array()  # [+1]

    # LValues should be [+1], but can't be used as "FFFF"
    # will be chopped off. Instead I've used the *first*
    # value to specify the number of values at that seekpoint,
    # similar to pascal strings which don't use \0's
    LValues = get_int_array()  # [should be +1, but can't]

    # Store flags assigned to variants if there's a source
    # associated to it, e.g. in Unihan's kSemanticVariant
    # "U+7A69<kFenn,kMatthews" might store kFenn as `1`
    # and kMatthews as `2`, making the flag for U+7A69 `3`
    LFlags = get_int_array()

    if DOrds:
        for ord_ in iter_ranges(LIgnoreRanges, max(DOrds)):
            if ord_ in DOrds:
                LEnc = DOrds[ord_]

                # Seek is 1-based so that 0 can mean "no data"
                LSeek.append(len(LValues) + 1)

                # Length prefix; the matching LFlags slot is a
                # placeholder so both arrays keep the same indices
                LValues.append(len(LEnc))
                LFlags.append(0)

                for enc, flags in LEnc:
                    LValues.append(enc)
                    LFlags.append(flags)
            else:
                LSeek.append(0)
    else:
        # No single codepoints left, so the data must live in the ranges
        assert LRanges

    # Write to disk
    DRtn = {}
    DRtn['LSeek'] = write_array(f, LSeek)
    DRtn['LValues'] = write_array(f, LValues)
    DRtn['LIgnoreRanges'] = write_json(f, LIgnoreRanges)
    DRtn['LRanges'] = write_json(f, LRanges)

    if DFlags:
        # Write flags to disk if there are any for this key.
        # The mapping is inverted (stringified value -> original key).
        DFlagsReversed = {
            str(value): i_key for i_key, value in DFlags.items()
        }
        DRtn['DFlags'] = write_json(f, DFlagsReversed)
        DRtn['LFlags'] = write_array(f, LFlags)

    return DRtn
Exemplo n.º 3
0
 def write(self, f):
     """
     Write this object's arrays to `f`.

     `self.SSpell` is converted to a list and written with
     `write_json`; `self.LHash` and `self.LOrds` are written with
     `write_array`.

     Returns a dict with the three write results under 'LSpell',
     'LHash' and 'LOrds', plus `self.iso` under 'deinflect_iso'.
     """
     # Dict displays evaluate their values top to bottom, so the
     # writes happen in the order listed here.
     return {
         'LSpell': write_json(f, list(self.SSpell)),
         'LHash': write_array(f, self.LHash),
         'LOrds': write_array(f, self.LOrds),
         'deinflect_iso': self.iso,
     }
Exemplo n.º 4
0
def write_integer_list(f, key, DOrds):
    """
    IntegerList [Grades/Frequencies (storing only numbers)]

    Variables: LShort[+1], DMultiVals

    Every codepoint visited by `iter_ranges` gets exactly one entry in
    `LShort`: the first integer of its value list plus one, or 0 when
    the codepoint has no data. Any further integers for the same
    codepoint are stored in `DMultiVals`, keyed by `str(ord_)`.

    `key` is not used by this writer.

    Returns a dict holding the results of `write_array` / `write_json`
    under 'LShort', 'LIgnoreRanges', 'LRanges' and 'DMultiVals'.
    """
    DOrds = coerce_to_int(DOrds)
    LRanges, DOrds = compress_ord_ranges(DOrds)
    LIgnoreRanges = get_char_gaps(DOrds)

    LShort = get_int_array()  # [+1]
    DMultiVals = {}

    # `max()` raises ValueError on an empty mapping, which happens when
    # range compression leaves no single codepoints behind. In that case
    # there are no gaps to fill and only the ranges get written.
    if DOrds:
        # Fill the data gaps
        for ord_ in iter_ranges(LIgnoreRanges, max(DOrds)):
            if ord_ in DOrds:
                L = DOrds[ord_]
                LShort.append(int(L[0]) + 1)

                if len(L) > 1:
                    # Append to `DMultiVals` if there is a
                    # character with multiple integers mappings!
                    assert str(ord_) not in DMultiVals
                    DMultiVals[str(ord_)] = L[1:]
            else:
                LShort.append(0)

    # Write to disk
    print('WRITE LShort!')
    DRtn = {}
    DRtn['LShort'] = write_array(f, LShort)
    print('WRITE LIgnoreRanges!')
    DRtn['LIgnoreRanges'] = write_json(f, LIgnoreRanges)
    print('WRITE LRanges!')
    DRtn['LRanges'] = write_json(f, LRanges)
    print('WRITE DMultiVals!')
    DRtn['DMultiVals'] = write_json(f, DMultiVals)
    print(('OK:', DRtn))
    return DRtn
Exemplo n.º 5
0
def write_string_keys_index(f, key, DData):
    """
    Build and write a value -> codepoints index.

    Will be stored by dicts, e.g.
    {'Arabic': [array.array('I')], ...}

    Keys of `DData` are either single ordinals or `(from, to)` pairs
    given as a list/tuple. Single ordinals are appended to the integer
    array of each of their values; pairs are written separately as
    `[from, to, value]` triples. A value that only occurs on ranges
    still gets an (empty) array.

    `key` is not used by this writer.

    Returns a dict holding the results of `write_arrays` / `write_json`
    under 'DStringKeys' and 'LRanges'.
    """
    ords_by_value = {}
    range_pairs = []

    # First go through the e.g. Unicode-specified ranges
    for ord_ in DData:
        # Exact type test (not isinstance), evaluated once per key
        is_range = type(ord_) in (list, tuple)

        for value in iter_values(DData[ord_]):
            if value not in ords_by_value:
                # One integer array per value -> ordinals mapping
                ords_by_value[value] = get_int_array()

            if is_range:
                # A range, so it is reprocessed below
                range_pairs.append((value, ord_))
            else:
                # A single value, so add just the codepoint
                ords_by_value[value].append(ord_)

    # Normal data: flatten every (value, (from, to)) pair
    flat_ranges = [
        [from_, to, value] for value, (from_, to) in range_pairs
    ]

    # Write to disk
    # TODO: Divide larger keys into smaller categories?
    return {
        'DStringKeys': write_arrays(f, ords_by_value),
        'LRanges': write_json(f, flat_ranges),
    }
Exemplo n.º 6
0
def write_sentence_data(f, key, DOrds):
    r"""
    Names [UnicodeData (single), NamesList (multiple)]

    Every name is split on whitespace and each distinct word is stored
    only once in `LWords`. `LWordLinks` holds, per name list, the seek
    position of each word (+2), a 1 between two names and a 0 at the
    end of the sequence; `LAmount` is appended to in lockstep with
    `LWordLinks`.

    Variables: LSeek[+1; 0 means "no link/value for this codepoint"] ->
               LWordLinks[+2] ->
               LWords[\v terminated]

    `key` is not used by this writer.

    Returns a dict holding the results of `write_array` / `write_json`
    under 'LSeek', 'LWordLinks', 'LAmount', 'LWords', 'LIgnoreRanges'
    and 'LRanges'.
    """
    LSeek = get_int_array()  # [+1]
    LAmount = get_int_array()
    LWordLinks = get_int_array()  # [+2]
    LWords = get_uni_array()

    # MASSIVE SPACE WASTAGE WARNING!
    LRanges, DOrds = compress_ord_ranges(DOrds)
    LIgnoreRanges = get_char_gaps(DOrds)

    word_cache = {}   # word -> (seek, amount)
    links_cache = {}  # tuple of names -> seek into LWordLinks

    def get_word_seek(word):
        """Return (seek, amount) for `word`, storing it on first use."""
        entry = word_cache.get(word)
        if entry is None:
            start = len(LWords)
            # NOTE(review): relies on `LWords.extend` returning the
            # amount written; confirm against `get_uni_array`.
            entry = word_cache[word] = (start, LWords.extend(word))
        return entry

    def get_wordlinks_seek(value):
        """Return the LWordLinks seek for `value`, adding it if new."""
        # A list type is required to allow multiple definitions,
        # so wrap anything that isn't exactly a list or tuple
        if type(value) not in (list, tuple):
            value = [value]
        names = tuple(value)

        if names in links_cache:
            return links_cache[names]

        # Split every definition up front, then link them one by one
        sentences = [name.split() for name in names]
        start = len(LWordLinks)
        total = len(sentences)

        for position, sentence in enumerate(sentences, 1):
            for word in sentence:
                # Append to LWordLinks[+2]
                word_seek, word_amount = get_word_seek(word)
                LWordLinks.append(word_seek + 2)
                LAmount.append(word_amount)

            if position != total:
                # 1 signifies multiple names;
                # only emitted when this is not the last item
                LWordLinks.append(1)
                LAmount.append(0)

        # 0 signifies the end of sequence
        LWordLinks.append(0)
        LAmount.append(0)

        # Store for next time
        links_cache[names] = start
        return start

    if DOrds:
        for ord_ in iter_ranges(LIgnoreRanges, max(DOrds)):
            if ord_ not in DOrds:
                LSeek.append(0)
                continue
            LSeek.append(get_wordlinks_seek(DOrds[ord_]) + 1)

    # Make the values in the LRanges point to
    # seek positions as well to save space
    linked_ranges = []
    for from_, to, value in LRanges:
        linked_ranges.append((from_, to, get_wordlinks_seek(value)))

    # Write to disk
    return {
        'LSeek': write_array(f, LSeek),
        'LWordLinks': write_array(f, LWordLinks),
        'LAmount': write_array(f, LAmount),
        'LWords': write_array(f, LWords),
        'LIgnoreRanges': write_json(f, LIgnoreRanges),
        'LRanges': write_json(f, linked_ranges),
    }
Exemplo n.º 7
0
def write_string_data(f, key, DOrds):
    r"""
    StringData (storing string lists by number to save space)

    Every distinct value is converted with `str()` and stored once in
    `LWords`; codepoints and ranges then refer to it by position.

    Variables:
      + LRanges
      + LSeek[+1][Ranges Subtracted] -> LWords[\v terminated]

    `key` is not used by this writer.

    Returns a dict holding the results of `write_array` / `write_json`
    under 'LRanges', 'LSeek', 'LAmount', 'LWords' and 'LIgnoreRanges'.
    """
    LRanges, DOrds = compress_ord_ranges(DOrds)

    LSeek = get_int_array()  # [+1]
    LAmount = get_int_array()
    LWords = get_uni_array()

    seek_cache = {}  # value -> (seek, amount)

    def get_L_seek(L):
        """Return the (seek, amount) pairs for one value or a list."""
        items = L if isinstance(L, (tuple, list)) else [L]

        pairs = []
        for item in items:
            if item not in seek_cache:
                # First occurrence: store the string and remember it
                start = len(LWords)
                seek_cache[item] = (start, LWords.extend(str(item)))
            pairs.append(seek_cache[item])
        return pairs

    LIgnoreRanges = []
    if DOrds:
        # Only process if not Blocks etc
        LIgnoreRanges = get_char_gaps(DOrds)

        for ord_ in iter_ranges(LIgnoreRanges, max(DOrds)):
            if ord_ not in DOrds:
                LSeek.append(0)
                # NOTE: It may pay to make this -1 and add assertions (!)
                LAmount.append(0)
                continue

            # Add the seek link(s) to LSeek [+1].
            # NOTE(review): a codepoint with several values appends
            # several entries here -- confirm the reader expects that.
            for seek, amount in get_L_seek(DOrds[ord_]):
                LSeek.append(seek + 1)
                LAmount.append(amount)

    # Make the values in the LRanges point to
    # seek positions as well to save space
    n_LRanges = []
    for from_, to, value in LRanges:
        pairs = get_L_seek(value)
        # A single value is stored bare, several as a list
        n_LRanges.append((from_, to, pairs[0] if len(pairs) == 1 else pairs))

    # Write to disk
    return {
        'LRanges': write_json(f, n_LRanges),
        'LSeek': write_array(f, LSeek),
        'LAmount': write_array(f, LAmount),
        'LWords': write_array(f, LWords),
        'LIgnoreRanges': write_json(f, LIgnoreRanges),
    }
Exemplo n.º 8
0
def write_indices(f, key, DOrds):
    """
    Indices [Storing Page Positions as Either Numeric or Char Data]

    Variables: DArrays, DMultiVals

    `indice_tools.parse_indices` supplies the (name, type) pairs; one
    array is created per name. For every codepoint visited by
    `iter_ranges` the first value dict is spread over those arrays;
    any further value dicts for the same codepoint are stored in
    `DMultiVals`, keyed by `str(ord_)`.

    Blank markers: '\\v' in char arrays, 0 in integer arrays (integer
    values are stored +1 so that 0 can mean "none").

    Raises `Exception` for an index type other than 'char' or 'integer'.

    Returns a dict holding the results of `write_arrays` / `write_json`
    under 'DArrays', 'LIgnoreRanges', 'LRanges' and 'DMultiVals'.
    """
    LArrays, DOrds = indice_tools.parse_indices(key, DOrds)

    LRanges, DOrds = compress_ord_ranges(DOrds)
    LIgnoreRanges = get_char_gaps(DOrds)

    # Create the various arrays
    DArrays = {}
    for name, typ in LArrays:
        if typ == 'char':
            # [\v] indicates None
            DArrays[name] = get_uni_array()
        elif typ == 'integer':
            # [+1] to allow 0 for None
            DArrays[name] = get_int_array()
        else:
            raise Exception("Unknown indice type %s" % typ)

    def append_blank(LArray):
        """Append the "no value" marker matching the array's type."""
        if LArray.typecode in ('u', 'c'):
            LArray.append('\v')
        else:
            LArray.append(0)

    DMultiVals = {}

    # `max()` raises ValueError on an empty mapping, which happens when
    # range compression leaves no single codepoints behind. In that case
    # there are no gaps to fill and the arrays are written empty.
    if DOrds:
        # Fill the data gaps
        for ord_ in iter_ranges(LIgnoreRanges, max(DOrds)):
            if ord_ in DOrds and DOrds[ord_]:
                LValues = DOrds[ord_]
                D = LValues[0]

                # Add to DMultiVals if more than one value
                if len(LValues) > 1:
                    assert str(ord_) not in DMultiVals
                    DMultiVals[str(ord_)] = LValues[1:]

                # NOTE(review): only the keys present in `D` get an
                # entry; presumably `parse_indices` gives every dict the
                # full key set, otherwise the arrays drift out of step.
                for k in D:
                    value = D[k]
                    LArray = DArrays[k]

                    if value in (None, ''):
                        append_blank(LArray)
                        continue

                    try:
                        if LArray.typecode in ('u', 'c'):
                            assert len(value) == 1
                            # WARNING: StrArray's only allow indexing of
                            # single ASCII chars, as they're encoded
                            # using utf-8 (!)

                            # Hopefully that's all it will need, though
                            LArray.append(str(value))
                        else:
                            LArray.append(int(value) + 1)
                    except Exception:
                        # Report the offending value, then re-raise
                        print(('Error on value: %s key: %s typecode: %s' % (value, k, LArray.typecode)))
                        raise
            else:
                # Append blank values
                for k in DArrays:
                    append_blank(DArrays[k])

    # Write to disk
    DRtn = {}
    DRtn['DArrays'] = write_arrays(f, [(name, DArrays[name]) for name, typ in LArrays])
    DRtn['LIgnoreRanges'] = write_json(f, LIgnoreRanges)
    DRtn['LRanges'] = write_json(f, LRanges)
    DRtn['DMultiVals'] = write_json(f, DMultiVals)
    return DRtn