def write_radical_strokes_index(f, key, DData):
    """
    Write an index of Kangxi radical/additional-stroke values to `f`.

    The index is a dict of integer arrays, e.g.
    {"150'.5": [array.array('I'), ...],
     "150'": [array.array('I')]}

    Each codepoint is stored under both:

    * the (radical).(additional strokes) form, as well as
    * the (radical) alone

    so that you can search even if you don't know the number of
    additional strokes. Simplified radicals are indicated by a
    final "'".

    Parameters:
        f: output handle, passed straight through to `write_arrays`
        key: the name of the key being indexed (not used here)
        DData: mapping of {codepoint: iterable of radical/stroke strings}

    Returns:
        whatever `write_arrays(f, DArrays)` returns
    """
    DArrays = {}

    def add_to_index(index_key, ord_):
        # Create the array for this index key on first use
        if index_key not in DArrays:
            DArrays[index_key] = get_int_array()
        DArrays[index_key].append(ord_)

    for ord_ in DData:
        for value in DData[ord_]:
            no_strokes_value = value.split('.')[0]  # no added strokes

            add_to_index(value, ord_)
            if no_strokes_value != value:
                # A value without a "." *is* its own radical-only key,
                # so adding it again would store the codepoint twice
                # in the same array.
                add_to_index(no_strokes_value, ord_)

    # Write to disk
    return write_arrays(f, DArrays)
def write_encoding(f, key, DOrds):
    """
    Encoding [BigFive etc]

    Variables: LSeek[+1] -> LValues[+1]

    Parameters:
        f: output handle, passed to `write_array`/`write_json`
        key: the name of the key being written (not used here)
        DOrds: the raw data, normalised by `coorce_to_encodings`,
               `compress_ord_ranges` and `ranges_to_single_ords`
               before being written

    Returns:
        a dict holding the results of the individual writes under
        'LSeek', 'LValues', 'LIgnoreRanges' and 'LRanges', plus
        'DFlags' and 'LFlags' when there are flags for this key
    """
    DFlags, DOrds = coorce_to_encodings(DOrds)
    LRanges, DOrds = compress_ord_ranges(DOrds)
    DOrds = ranges_to_single_ords(DOrds)
    LIgnoreRanges = get_char_gaps(DOrds)

    # Seek positions into `values` [+1, so that 0 can mean "no value"]
    seeks = get_int_array()

    # The values should be [+1] too, but can't be as "FFFF" would be
    # chopped off. Instead the *first* value at each seek point gives
    # the number of values which follow, similar to pascal strings
    # which don't use null terminators.
    values = get_int_array()

    # Flags assigned to variants if there's a source associated with
    # them, e.g. in Unihan's kSemanticVariant "U+7A69<kFenn,kMatthews"
    # might store kFenn as `1` and kMatthews as `2`, making the flag
    # for U+7A69 `3`
    flag_values = get_int_array()

    if not DOrds:
        # With no single codepoints, everything must be in the ranges
        assert LRanges
    else:
        for codepoint in iter_ranges(LIgnoreRanges, max(DOrds)):
            if codepoint not in DOrds:
                seeks.append(0)
                continue

            encodings = DOrds[codepoint]
            seeks.append(len(values) + 1)
            values.append(len(encodings))
            # Placeholder, keeping the flags in step
            # with the count stored in `values`
            flag_values.append(0)

            for encoding, flags in encodings:
                values.append(encoding)
                flag_values.append(flags)

    # Write to disk (the entries are evaluated, and
    # hence written, in the order they're listed)
    DRtn = {
        'LSeek': write_array(f, seeks),
        'LValues': write_array(f, values),
        'LIgnoreRanges': write_json(f, LIgnoreRanges),
        'LRanges': write_json(f, LRanges),
    }

    if DFlags:
        # Write flags to disk if there are any for this key,
        # with the mapping reversed to be {str(flag): name}
        reversed_flags = {
            str(flag): name for name, flag in DFlags.items()
        }
        DRtn['DFlags'] = write_json(f, reversed_flags)
        DRtn['LFlags'] = write_array(f, flag_values)

    return DRtn
def __init__(self, key, DData, LISO=None):
    """
    For readings in other languages etc where the data isn't in English

    Collects (hash, codepoint) pairs for every value in `DData`,
    sorts them, and stores them as the parallel arrays
    `self.LHash` and `self.LOrds`.

    FIXME: ADD RANGE SUPPORT!
    TODO: REMOVE DUPE WORDS?

    Parameters:
        key: the name of the key; also used to look up the
             ISO code by way of `key_to_iso`
        DData: mapping of {codepoint: value or list/tuple of values}
        LISO: currently unused
    """
    self.key = key
    self.DData = DData
    self.iso = key_to_iso(key)
    print('FulltextWriter ISO:', key, self.iso)

    self.SSpell = set()

    pairs = []
    # Iterate over a snapshot of the keys, looking each value up again
    for codepoint in list(self.DData.keys()):
        raw = self.DData.get(codepoint, [])
        if isinstance(raw, (list, tuple)):
            items = raw
        else:
            items = [raw]

        for item in items:
            if not self.iso:
                found = self.get_L_general(codepoint, item)
            elif self.iso == 'ltc':
                # Tang dynasty Chinese
                found = self.get_L_tang(codepoint, item)
            else:
                found = self.get_L_inflected(codepoint, item)
            pairs.extend(found)

    pairs.sort()

    # Convert to c array types
    self.LHash = get_int_array(signed=False)
    self.LOrds = get_int_array()
    for hash_value, codepoint in pairs:
        self.LHash.append(hash_value)
        self.LOrds.append(codepoint)
def write_string_keys_index(f, key, DData):
    """
    Write an index from string values to the codepoints which have them.

    Will be stored by dicts, e.g.
    {'Arabic': [array.array('I')], ...}

    Keys of `DData` which are a list/tuple are treated as (from, to)
    ranges, and are written separately as [from, to, value] entries.

    Parameters:
        f: output handle, passed to `write_arrays`/`write_json`
        key: the name of the key being indexed (not used here)
        DData: mapping of {codepoint or (from, to) range: value(s)}

    Returns:
        a dict holding the results of the writes under
        'DStringKeys' and 'LRanges'
    """
    arrays_by_value = {}
    pending_ranges = []

    # First go through the e.g. Unicode-specified ranges
    for ord_ in DData:
        is_range = type(ord_) in (list, tuple)

        for value in iter_values(DData[ord_]):
            if value not in arrays_by_value:
                # Every value gets an array for its value->ordinals
                # mapping, even if it only ever has ranges
                arrays_by_value[value] = get_int_array()

            if is_range:
                # A range, so reprocess below
                pending_ranges.append((value, ord_))
            else:
                # A single value, so add just the codepoint
                arrays_by_value[value].append(ord_)

    # Normal Data
    range_entries = [
        [from_, to, value] for value, (from_, to) in pending_ranges
    ]

    # Write to disk
    # TODO: Divide larger keys into smaller categories?
    return {
        'DStringKeys': write_arrays(f, arrays_by_value),
        'LRanges': write_json(f, range_entries),
    }
def write_integer_list(f, key, DOrds):
    """
    IntegerList [Grades/Frequencies (storing only numbers)]

    Stores the first integer of each codepoint in `LShort` [+1, so
    that 0 can mean "no value"], with any further integers for that
    codepoint stored in `DMultiVals` under str(codepoint).

    Parameters:
        f: output handle, passed to `write_array`/`write_json`
        key: the name of the key being written (not used here)
        DOrds: the raw data, normalised by `coerce_to_int` and
               `compress_ord_ranges` before being written

    Returns:
        a dict holding the results of the writes under 'LShort',
        'LIgnoreRanges', 'LRanges' and 'DMultiVals'
    """
    DOrds = coerce_to_int(DOrds)
    LRanges, DOrds = compress_ord_ranges(DOrds)
    LIgnoreRanges = get_char_gaps(DOrds)

    LShort = get_int_array()  # [+1]

    # Fill the data gaps
    DMultiVals = {}
    if DOrds:
        # Only process if there are single codepoints left after the
        # ranges were compressed: `max()` of an empty mapping raises
        # ValueError, and the other writers guard this the same way.
        for ord_ in iter_ranges(LIgnoreRanges, max(DOrds)):
            if ord_ not in DOrds:
                LShort.append(0)
                continue

            L = DOrds[ord_]
            LShort.append(int(L[0]) + 1)

            if len(L) > 1:
                # Append to `DMultiVals` if there is a
                # character with multiple integers mappings!
                assert str(ord_) not in DMultiVals
                DMultiVals[str(ord_)] = L[1:]

    # Write to disk
    print('WRITE LShort!')
    DRtn = {}
    DRtn['LShort'] = write_array(f, LShort)
    print('WRITE LIgnoreRanges!')
    DRtn['LIgnoreRanges'] = write_json(f, LIgnoreRanges)
    print('WRITE LRanges!')
    DRtn['LRanges'] = write_json(f, LRanges)
    print('WRITE DMultiVals!')
    DRtn['DMultiVals'] = write_json(f, DMultiVals)
    print(('OK:', DRtn))
    return DRtn
def write_sentence_data(f, key, DOrds):
    """
    Names [UnicodeData (single), NamesList (multiple)]

    Each name is split into words, every distinct word is stored
    once in LWords, and a name becomes a run of links to its words.

    In LWordLinks:
    * a 1 separates the names when a codepoint has multiple names
    * a 0 ends the sequence
    * links to words are stored [+2] to compensate

    Variables:
    LSeek[+1; 0 means "no link/value for this codepoint"] ->
    LWordLinks[+2] -> LWords

    LAmount runs parallel to LWordLinks, holding the amount
    returned by `LWords.extend()` for each word (0 for the
    separator/terminator entries).

    Parameters:
        f: output handle, passed to `write_array`/`write_json`
        key: the name of the key being written (not used here)
        DOrds: the raw data; compressed by `compress_ord_ranges`
               before being written

    Returns:
        a dict holding the results of the writes under 'LSeek',
        'LWordLinks', 'LAmount', 'LWords', 'LIgnoreRanges'
        and 'LRanges'
    """
    LSeek = get_int_array()  # [+1]
    LAmount = get_int_array()
    LWordLinks = get_int_array()  # [+2]
    LWords = get_uni_array()

    # MASSIVE SPACE WASTAGE WARNING!
    LRanges, DOrds = compress_ord_ranges(DOrds)
    LIgnoreRanges = get_char_gaps(DOrds)

    word_cache = {}

    def locate_word(word):
        # Return the (seek, amount) of `word` in LWords,
        # adding the word first if it hasn't been seen before
        try:
            return word_cache[word]
        except KeyError:
            position = len(LWords)
            amount = LWords.extend(word)
            word_cache[word] = (position, amount)
            return word_cache[word]

    links_cache = {}

    def locate_links(value):
        # This requires a list type to allow multiple
        # definitions, so if it isn't convert it to one
        if type(value) not in (list, tuple):
            value = [value]
        value = tuple(value)

        if value in links_cache:
            return links_cache[value]

        # Process each definition separately
        sentences = [definition.split() for definition in value]
        start = len(LWordLinks)
        total = len(sentences)

        for number, sentence in enumerate(sentences, 1):
            for word in sentence:
                # Append to LWordLinks[+2]
                position, amount = locate_word(word)
                LWordLinks.append(position + 2)
                LAmount.append(amount)

            if number != total:
                # 1 signifies multiple names
                # Only happens if not the last item
                LWordLinks.append(1)
                LAmount.append(0)

        # 0 signifies the end of sequence
        LWordLinks.append(0)
        LAmount.append(0)

        # Store for next time
        links_cache[value] = start
        return start

    if DOrds:
        for ord_ in iter_ranges(LIgnoreRanges, max(DOrds)):
            if ord_ not in DOrds:
                LSeek.append(0)
                continue
            LSeek.append(locate_links(DOrds[ord_]) + 1)

    # Make the values in the LRanges to point
    # to seek positions as well to save space
    # (note these are *not* [+1], unlike LSeek)
    LRanges = [
        (from_, to, locate_links(value))
        for from_, to, value in LRanges
    ]

    # Write to disk (the entries are evaluated, and
    # hence written, in the order they're listed)
    return {
        'LSeek': write_array(f, LSeek),
        'LWordLinks': write_array(f, LWordLinks),
        'LAmount': write_array(f, LAmount),
        'LWords': write_array(f, LWords),
        'LIgnoreRanges': write_json(f, LIgnoreRanges),
        'LRanges': write_json(f, LRanges),
    }
def write_string_data(f, key, DOrds):
    """
    StringData (storing string lists by number to save space)

    Every distinct string is stored once in LWords, and is
    referenced by the position it starts at.

    Variables:
    + LRanges
    + LSeek[+1][Ranges Subtracted] -> LWords
    + LAmount, parallel to LSeek, holding the amount
      returned by `LWords.extend()` for the string

    Parameters:
        f: output handle, passed to `write_array`/`write_json`
        key: the name of the key being written (not used here)
        DOrds: the raw data; compressed by `compress_ord_ranges`
               before being written

    Returns:
        a dict holding the results of the writes under 'LRanges',
        'LSeek', 'LAmount', 'LWords' and 'LIgnoreRanges'
    """
    LRanges, DOrds = compress_ord_ranges(DOrds)

    LSeek = get_int_array()  # [+1]
    LAmount = get_int_array()
    LWords = get_uni_array()

    string_cache = {}

    def locate_strings(items):
        # Return a list of (seek, amount), one for each string
        # in `items`, adding any unseen strings to LWords
        if not isinstance(items, (tuple, list)):
            items = [items]

        located = []
        for item in items:
            if item not in string_cache:
                position = len(LWords)
                amount = LWords.extend(str(item))
                string_cache[item] = (position, amount)
            located.append(string_cache[item])
        return located

    LIgnoreRanges = []
    if DOrds:
        # Only process if not Blocks etc
        LIgnoreRanges = get_char_gaps(DOrds)

        for ord_ in iter_ranges(LIgnoreRanges, max(DOrds)):
            if ord_ not in DOrds:
                # NOTE ME: It may pay to make this -1
                # and add assertions (!)
                LSeek.append(0)
                LAmount.append(0)
                continue

            # And add the link to LSeek [+1]
            # NOTE(review): a codepoint with several strings appends
            # several entries here - confirm the reader expects that
            for position, amount in locate_strings(DOrds[ord_]):
                LSeek.append(position + 1)
                LAmount.append(amount)

    # Make the values in the LRanges to point
    # to seek positions as well to save space
    seek_ranges = []
    for from_, to, value in LRanges:
        located = locate_strings(value)
        if len(located) == 1:
            # Unwrap when there's only a single string
            located = located[0]
        seek_ranges.append((from_, to, located))

    # Write to disk (the entries are evaluated, and
    # hence written, in the order they're listed)
    return {
        'LRanges': write_json(f, seek_ranges),
        'LSeek': write_array(f, LSeek),
        'LAmount': write_array(f, LAmount),
        'LWords': write_array(f, LWords),
        'LIgnoreRanges': write_json(f, LIgnoreRanges),
    }
def write_integer_keys_index(f, key, DData, prefix=None):
    """
    Write a browsable index of codepoints, grouped by an integer
    value such as a grade or a frequency.

    How the codepoints are grouped depends on how many distinct
    values there are:

    * More than 50 distinct values (usually frequencies, with
      thousands of possible values): filter them down into bands
      of about max/50 to allow rough frequency browsing, labelled
      "(from) - (to)" (after `prefix`, if one was given).
      For example:
        * Frequencies 0-49
        * Frequencies 50-99
        * Frequencies 100-149
        ...etc

    * Otherwise (usually grades, which have few possibilities):
      each value is divided into pages of 100 codepoints, labelled
      "(value) ((first) - (last))" counting from 1.
      For example, with 177 codepoints in grade 1:
        * 1 (1 - 100)
        * 1 (101 - 177)
        * 2 (1 - 100)
        ...etc

    Parameters:
        f: output handle, passed straight through to `write_arrays`
        key: the name of the key being indexed (not used here)
        DData: mapping of {codepoint: value or list/tuple of values},
               where each value can be converted using `int()`.
               `DData` itself isn't modified.
        prefix: optional text to put before the band labels

    Returns:
        whatever `write_arrays(f, DRtn)` returns for the
        {label: integer array of codepoints} mapping

    Raises:
        ValueError: if `DData` is empty, or a value isn't an integer
    """
    PAGE_SIZE = 100

    # {codepoint: value}, in the order the codepoints are in DData
    DValues = {}
    # {value: [codepoint, ...]}, again in DData order
    DByValue = {}

    for ord_ in DData:
        value = DData[ord_]
        if isinstance(value, (list, tuple)):
            # HACK: Use only the first value if there are multiple grades etc
            # FIXME: Support multiple values!
            # (A local copy is used rather than overwriting DData[ord_],
            # so the caller's extra values aren't thrown away.)
            value = value[0]
        value = int(value)

        DValues[ord_] = value
        DByValue.setdefault(value, []).append(ord_)

    # The maximum value might be e.g. 10 for grade 10
    max_ = max(DByValue)
    min_ = min(DByValue)

    DRtn = {}

    if len(DByValue) > 50:
        # TODO: Make it a bit less "linear" than dividing by 50?
        # The step is kept to at least 1, so that the bands always
        # advance (even if the maximum is small or negative)
        step = max(1, int(max_ / 50.0))

        for from_ in range(min_, max_ + 1, step):
            to = from_ + step - 1

            label = '%s - %s' % (from_, to)
            if prefix:
                label = '%s %s' % (prefix, label)

            L = get_int_array()
            for ord_, value in DValues.items():
                if from_ <= value <= to:
                    L.append(ord_)

            if len(L):
                DRtn[label] = L
    else:
        for value in sorted(DByValue):
            LOrds = DByValue[value]

            # Codepoints are grouped by their integer value, the same
            # way they were counted, so that e.g. "05" and 5 end up
            # together rather than "05" being left out of every page.
            for from_ in range(0, len(LOrds), PAGE_SIZE):
                LPage = LOrds[from_:from_ + PAGE_SIZE]

                # Write the index to DRtn, with the label giving the
                # 1-based positions of the first/last item in the page
                # TODO: Add "Page" etc for indices?
                label = '%s (%s - %s)' % (
                    value, from_ + 1, from_ + len(LPage)
                )

                L = get_int_array()
                for ord_ in LPage:
                    L.append(ord_)
                DRtn[label] = L

    # Write out to disk using unsigned integer arrays
    return write_arrays(f, DRtn)
def write_indices(f, key, DOrds):
    """
    Indices [Storing Page Positions as Either Numeric or Char Data]

    One array is created for each (name, type) pair which
    `indice_tools.parse_indices` returns, and every codepoint
    position gets an entry in the arrays:

    * 'char' arrays store a single character,
      with '\\v' indicating None
    * 'integer' arrays store the value [+1], to allow 0 for None

    Only the first set of values for a codepoint is stored in the
    arrays; any others are stored in DMultiVals under str(codepoint).

    Variables: DArrays

    Parameters:
        f: output handle, passed to `write_arrays`/`write_json`
        key: the name of the key, passed to `parse_indices`
        DOrds: the raw data, processed by `parse_indices` and
               `compress_ord_ranges` before being written

    Returns:
        a dict holding the results of the writes under 'DArrays',
        'LIgnoreRanges', 'LRanges' and 'DMultiVals'

    Raises:
        Exception: if `parse_indices` gives an unknown array type
    """
    LArrays, DOrds = indice_tools.parse_indices(key, DOrds)
    DArrays = {}

    LRanges, DOrds = compress_ord_ranges(DOrds)
    LIgnoreRanges = get_char_gaps(DOrds)

    # Create the various arrays
    for name, typ in LArrays:
        if typ == 'char':
            # [\v] indicates None
            DArrays[name] = get_uni_array()
        elif typ == 'integer':
            # [+1] to allow 0 for None
            DArrays[name] = get_int_array()
        else:
            raise Exception("Unknown indice type %s" % typ)

    def is_char_array(LArray):
        return LArray.typecode in ('u', 'c')

    def append_blank(LArray):
        # Append the "no value" marker for the array's type
        if is_char_array(LArray):
            LArray.append('\v')
        else:
            LArray.append(0)

    # Fill the data gaps
    DMultiVals = {}
    if DOrds:
        # Only process if there are single codepoints left after the
        # ranges were compressed: `max()` of an empty mapping raises
        # ValueError, and the other writers guard this the same way.
        for ord_ in iter_ranges(LIgnoreRanges, max(DOrds)):
            if not (ord_ in DOrds and DOrds[ord_]):
                # Append blank values
                for k in DArrays:
                    append_blank(DArrays[k])
                continue

            LValues = DOrds[ord_]
            D = LValues[0]

            # Add to DMultiVals if more than one value
            if len(LValues) > 1:
                assert str(ord_) not in DMultiVals
                DMultiVals[str(ord_)] = LValues[1:]

            for k in D:
                value = D[k]
                LArray = DArrays[k]

                if value in (None, ''):
                    append_blank(LArray)
                    continue

                try:
                    if is_char_array(LArray):
                        assert len(value) == 1
                        # WARNING: StrArray's only allow indexing of
                        # single ASCII chars, as they're encoded using
                        # utf-8 (!) Hopefully that's all it will need,
                        # though
                        LArray.append(str(value))
                    else:
                        LArray.append(int(value)+1)
                except Exception:
                    # Say which value caused the problem,
                    # then let the error carry on up
                    print(('Error on value: %s key: %s typecode: %s' %
                           (value, k, LArray.typecode)))
                    raise

    # Write to disk
    DRtn = {}
    DRtn['DArrays'] = write_arrays(
        f, [(name, DArrays[name]) for name, typ in LArrays]
    )
    DRtn['LIgnoreRanges'] = write_json(f, LIgnoreRanges)
    DRtn['LRanges'] = write_json(f, LRanges)
    DRtn['DMultiVals'] = write_json(f, DMultiVals)
    return DRtn