def remap(multisegs):
    """Merge several segmentations of one sentence into a single one.

    Every item of ``multisegs`` is decoded/processed as a UTF-8 byte string.
    Each one is converted by ``construct_bv`` into a bit vector in which a 0
    bit marks a word boundary.  The vectors are AND-ed together, so a
    boundary present in any segmentation is kept in the merged result.

    The characters of the output are taken from ``multisegs[0]``.
    NOTE(review): this presumably assumes all segmentations cover the same
    character sequence -- confirm against callers.

    Returns the re-segmented sentence as a UTF-8 byte string, words joined
    by single spaces.
    """
    first_text = unicode(multisegs[0], 'utf-8')
    chars = chinese_utils.segment_into_chars_rs(first_text)

    # One boundary vector per segmentation; a 0 bit means "boundary here".
    vectors = [construct_bv(one_seg) for one_seg in multisegs]

    # AND leaves a 1 only where no segmentation placed a boundary.
    merged = vectors[0]
    for other in vectors[1:]:
        merged = merged & other

    # Cut the character list at every boundary of the merged vector.
    words = []
    word_start = 0
    for pos in range(len(merged)):
        if merged[pos] != 0:
            continue
        if pos > word_start:
            words.append(''.join(chars[word_start:pos]))
        word_start = pos

    return u' '.join(words).encode('utf-8')
def remap(multisegs):
    """Merge several segmentations of one sentence into a single one.

    Every item of ``multisegs`` is decoded/processed as a UTF-8 byte string.
    Each one is converted by ``construct_bv`` into a bit vector in which a 0
    bit marks a word boundary.  The vectors are AND-ed together, so a
    boundary present in any segmentation is kept in the merged result.

    The characters of the output are taken from ``multisegs[0]``.
    NOTE(review): this presumably assumes all segmentations cover the same
    character sequence -- confirm against callers.

    Returns the re-segmented sentence as a UTF-8 byte string, words joined
    by single spaces.
    """
    first_text = unicode(multisegs[0], 'utf-8')
    chars = chinese_utils.segment_into_chars_rs(first_text)

    # One boundary vector per segmentation; a 0 bit means "boundary here".
    vectors = [construct_bv(one_seg) for one_seg in multisegs]

    # AND leaves a 1 only where no segmentation placed a boundary.
    merged = vectors[0]
    for other in vectors[1:]:
        merged = merged & other

    # Cut the character list at every boundary of the merged vector.
    words = []
    word_start = 0
    for pos in range(len(merged)):
        if merged[pos] != 0:
            continue
        if pos > word_start:
            words.append(''.join(chars[word_start:pos]))
        word_start = pos

    return u' '.join(words).encode('utf-8')
def seg(line, dict):
    """Segment a UTF-8 encoded line by delegating to ``word_segment``.

    ``line`` is decoded from UTF-8 and split into characters with
    ``chinese_utils.segment_into_chars_rs``.  The character list is reversed
    in place and passed to ``word_segment`` together with ``dict``.
    NOTE(review): presumably ``word_segment`` expects the characters in
    reverse order -- confirm against its definition.

    ``dict`` shadows the builtin of the same name; the parameter name is
    kept because it is part of the existing interface.

    Returns a byte string in which every item produced by ``word_segment``
    is UTF-8 encoded and followed by one space, so a non-empty result ends
    with a trailing space.
    """
    chars = chinese_utils.segment_into_chars_rs(unicode(line, "utf-8"))
    chars.reverse()
    words = word_segment(chars, dict)
    return "".join(["%s " % word.encode("utf-8") for word in words])
def construct_bv(seg):
    """Build the word-boundary bit vector for one segmentation.

    ``seg`` is a UTF-8 byte string.  The vector has one more bit than the
    number of characters reported by ``chinese_utils.segment_into_chars_rs``.
    All bits start at 1; the start and end offset of every word returned by
    ``compile_word_boundaries`` is then cleared to 0.

    Returns the ``BitVector`` instance.
    """
    decoded = unicode(seg, 'utf-8')
    char_count = len(chinese_utils.segment_into_chars_rs(decoded))

    bits = BitVector.BitVector(size=char_count + 1)
    # Start from "no boundary anywhere".
    for pos in range(len(bits)):
        bits[pos] = 1

    # Clear the bit at both edges of every word.
    for word_start, word_end in compile_word_boundaries(seg):
        bits[word_start] = 0
        bits[word_end] = 0

    return bits
def construct_bv(seg):
    """Build the word-boundary bit vector for one segmentation.

    ``seg`` is a UTF-8 byte string.  The vector has one more bit than the
    number of characters reported by ``chinese_utils.segment_into_chars_rs``.
    All bits start at 1; the start and end offset of every word returned by
    ``compile_word_boundaries`` is then cleared to 0.

    Returns the ``BitVector`` instance.
    """
    decoded = unicode(seg, 'utf-8')
    char_count = len(chinese_utils.segment_into_chars_rs(decoded))

    bits = BitVector.BitVector(size=char_count + 1)
    # Start from "no boundary anywhere".
    for pos in range(len(bits)):
        bits[pos] = 1

    # Clear the bit at both edges of every word.
    for word_start, word_end in compile_word_boundaries(seg):
        bits[word_start] = 0
        bits[word_end] = 0

    return bits
def compute_char_belong_to(f):
    """Map each character of a segmented sentence to its word index.

    ``f`` is a UTF-8 byte string; after decoding it is split on whitespace
    into words, and each word is split into characters with
    ``chinese_utils.segment_into_chars_rs``.

    Returns a list with one entry per character, holding the 0-based index
    of the word that character came from.
    """
    words = unicode(f, "utf-8").split()
    owners = []
    for word_idx, word in enumerate(words):
        # Only the number of characters matters, not their values.
        for _ in chinese_utils.segment_into_chars_rs(word):
            owners.append(word_idx)
    return owners
def compute_char_belong_to(f):
    """Map each character of a segmented sentence to its word index.

    ``f`` is a UTF-8 byte string; after decoding it is split on whitespace
    into words, and each word is split into characters with
    ``chinese_utils.segment_into_chars_rs``.

    Returns a list with one entry per character, holding the 0-based index
    of the word that character came from.
    """
    words = unicode(f, "utf-8").split()
    owners = []
    for word_idx, word in enumerate(words):
        # Only the number of characters matters, not their values.
        for _ in chinese_utils.segment_into_chars_rs(word):
            owners.append(word_idx)
    return owners
def compile_word_boundaries(f):
    """Compute the character span of every word in a segmented sentence.

    ``f`` is a UTF-8 byte string; after decoding it is split on whitespace
    into words.  Character counts come from
    ``chinese_utils.segment_into_chars_rs`` applied to each word.

    Returns a list of ``[start, end]`` pairs, one per word, where ``start``
    is the running character offset at which the word begins and ``end`` is
    ``start`` plus the word's character count.
    """
    boundaries = []
    cursor = 0
    for word in unicode(f, "utf-8").split():
        begin = cursor
        # Advance the running offset by this word's character count.
        cursor += sum(1 for _ in chinese_utils.segment_into_chars_rs(word))
        boundaries.append([begin, cursor])
    return boundaries
def compile_word_boundaries(f):
    """Compute the character span of every word in a segmented sentence.

    ``f`` is a UTF-8 byte string; after decoding it is split on whitespace
    into words.  Character counts come from
    ``chinese_utils.segment_into_chars_rs`` applied to each word.

    Returns a list of ``[start, end]`` pairs, one per word, where ``start``
    is the running character offset at which the word begins and ``end`` is
    ``start`` plus the word's character count.
    """
    boundaries = []
    cursor = 0
    for word in unicode(f, "utf-8").split():
        begin = cursor
        # Advance the running offset by this word's character count.
        cursor += sum(1 for _ in chinese_utils.segment_into_chars_rs(word))
        boundaries.append([begin, cursor])
    return boundaries