def _split_and_tokenize(s):
    """
    Helper, sentence-splits and tokenizes, returns array comparable
    to what you would get from re.split(r'(\s+)', s).
    """
    from ssplit import en_sentence_boundary_gen
    from tokenise import en_token_boundary_gen

    tokens = []

    sprev = 0
    for sstart, send in en_sentence_boundary_gen(s):
        if sprev != sstart:
            # between-sentence space
            tokens.append(s[sprev:sstart])
        stext = s[sstart:send]
        # token offsets from en_token_boundary_gen are relative to the
        # sentence text, so track the previous token end relative to
        # sstart (starting from 0, not sstart, to avoid appending
        # spurious empty slices for sentences after the first)
        tprev = 0
        for tstart, tend in en_token_boundary_gen(stext):
            if tprev != tstart:
                # between-token space
                tokens.append(s[sstart+tprev:sstart+tstart])
            tokens.append(s[sstart+tstart:sstart+tend])
            tprev = tend
        sprev = send

    if sprev != len(s):
        # document-final space
        tokens.append(s[sprev:])

    assert "".join(tokens) == s, "INTERNAL ERROR\n'%s'\n'%s'" % ("".join(tokens), s)

    return tokens
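# Usage sketch for _split_and_tokenize (hedged: the exact token pieces
# depend on what en_token_boundary_gen yields, but the function itself
# asserts the whitespace-preserving invariant shown here):
#
#     pieces = _split_and_tokenize(u"First sentence.  Second one.")
#     assert u"".join(pieces) == u"First sentence.  Second one."
#     # pieces interleaves tokens with the original whitespace runs,
#     # comparable to re.split(r'(\s+)', ...) output.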
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None):
    if raw_text is not None:
        # looks like somebody read this already; nice
        text = raw_text
    else:
        # need to read raw text
        try:
            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
        except IOError:
            raise UnableToReadTextFile(txt_file_path)
        except UnicodeDecodeError:
            Messager.error(
                'Error reading text file: nonstandard encoding or binary?', -1)
            raise UnableToReadTextFile(txt_file_path)

    # TODO XXX huge hack, sorry, the client is currently crashing on
    # Chrome for two or more consecutive spaces, so replace every
    # second one with a literal non-breaking space. Note that this is
    # just for the client display -- server-side storage is not
    # affected.
    # NOTE: it might be possible to fix this in a principled way by
    # having xml:space="preserve" on the relevant elements.
    text = text.replace("  ", ' ' + unichr(0x00A0))

    j_dic['text'] = text

    from logging import info as log_info

    # First, generate tokenisation
    if JAPANESE:
        from tokenise import jp_token_boundary_gen
        token_offsets = [o for o in jp_token_boundary_gen(text)]
    else:
        from tokenise import en_token_boundary_gen
        token_offsets = [o for o in en_token_boundary_gen(text)]
    j_dic['token_offsets'] = token_offsets

    if NEWLINE_SS:
        from ssplit import newline_sentence_boundary_gen
        sentence_offsets = [o for o in newline_sentence_boundary_gen(text)]
    elif JAPANESE:
        from ssplit import jp_sentence_boundary_gen
        sentence_offsets = [o for o in jp_sentence_boundary_gen(text)]
        #log_info('offsets: ' + str(sentence_offsets))
    else:
        from ssplit import en_sentence_boundary_gen
        sentence_offsets = [o for o in en_sentence_boundary_gen(text)]
        #log_info('offsets: ' + str(sentence_offsets))
    j_dic['sentence_offsets'] = sentence_offsets

    return True
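# Usage sketch for _enrich_json_with_text (hedged: JAPANESE and
# NEWLINE_SS are assumed to be module-level config flags defined
# elsewhere in this module, and the .txt path below is hypothetical):
#
#     j_dic = {}
#     _enrich_json_with_text(j_dic, '/path/to/doc.txt')
#     # j_dic now carries 'text' plus 'token_offsets' and
#     # 'sentence_offsets', each a list of (start, end) pairs
#     # indexing into j_dic['text'].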