Example #1
File: search.py  Project: dmcc/brat
def _split_and_tokenize(s):
    """
    Helper, sentence-splits and tokenizes, returns array comparable to
    what you would get from re.split(r'(\s+)', s).
    """
    from ssplit import en_sentence_boundary_gen
    from tokenise import en_token_boundary_gen

    tokens = []

    sprev = 0
    for sstart, send in en_sentence_boundary_gen(s):
        if sprev != sstart:
            # between-sentence space
            tokens.append(s[sprev:sstart])
        stext = s[sstart:send]
        tprev = sstart
        for tstart, tend in en_token_boundary_gen(stext):
            if tprev != tstart:
                # between-token space
                tokens.append(s[sstart+tprev:sstart+tstart])
            tokens.append(s[sstart+tstart:sstart+tend])
            tprev = tend
        sprev = send

    if sprev != len(s):
        # document-final space
        tokens.append(s[sprev:])

    assert "".join(tokens) == s, "INTERNAL ERROR\n'%s'\n'%s'" % ("".join(tokens),s)

    return tokens
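As the docstring says, the return value is comparable to re.split(r'(\s+)', s): token and whitespace pieces alternate, and joining them reproduces the input exactly (this is what the final assert checks). A minimal usage sketch, assuming brat's ssplit and tokenise modules are on the import path; the sample string is made up for illustration:

# Hypothetical caller, not part of brat.
import re

s = "Hello world.  This is brat."
tokens = _split_and_tokenize(s)

# Both splits concatenate back to the original string.
assert "".join(tokens) == s
assert "".join(re.split(r'(\s+)', s)) == s

# The exact boundaries depend on en_token_boundary_gen; unlike the plain
# whitespace split, punctuation may come back as separate tokens.
print(tokens)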
Example #2
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None):
    if raw_text is not None:
        # looks like somebody read this already; nice
        text = raw_text
    else:
        # need to read raw text
        try:
            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
        except IOError:
            raise UnableToReadTextFile(txt_file_path)
        except UnicodeDecodeError:
            Messager.error(
                'Error reading text file: nonstandard encoding or binary?', -1)
            raise UnableToReadTextFile(txt_file_path)

    # TODO XXX huge hack, sorry, the client currently crashing on
    # chrome for two or more consecutive space, so replace every
    # second with literal non-breaking space. Note that this is just
    # for the client display -- server-side storage is not affected.
    # NOTE: it might be possible to fix this in a principled way by
    # having xml:space="preserve" on the relevant elements.
    text = text.replace("  ", ' ' + unichr(0x00A0))

    j_dic['text'] = text

    from logging import info as log_info

    # First, generate tokenisation
    if JAPANESE:
        from tokenise import jp_token_boundary_gen
        token_offsets = [o for o in jp_token_boundary_gen(text)]
    else:
        from tokenise import en_token_boundary_gen
        token_offsets = [o for o in en_token_boundary_gen(text)]
    j_dic['token_offsets'] = token_offsets

    if NEWLINE_SS:
        from ssplit import newline_sentence_boundary_gen
        sentence_offsets = [o for o in newline_sentence_boundary_gen(text)]
    elif JAPANESE:
        from ssplit import jp_sentence_boundary_gen
        sentence_offsets = [o for o in jp_sentence_boundary_gen(text)]
        #log_info('offsets: ' + str(offsets))
    else:
        from ssplit import en_sentence_boundary_gen
        sentence_offsets = [o for o in en_sentence_boundary_gen(text)]
        #log_info('offsets: ' + str(sentence_offsets))
    j_dic['sentence_offsets'] = sentence_offsets

    return True
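The function fills in the dictionary it is given and returns True; when raw_text is passed, the file read is skipped entirely. A minimal sketch of a caller, assuming it runs inside brat's server code where open_textfile, Messager, UnableToReadTextFile and the JAPANESE / NEWLINE_SS configuration flags are available; the sample text is made up:

# Hypothetical caller, not part of brat.
j_dic = {}
_enrich_json_with_text(j_dic, None, raw_text=u"First sentence.  Second sentence.")

# After the call the dictionary holds the display text and the offset lists:
#   j_dic['text']             - the text, with double spaces rewritten as space + no-break space
#   j_dic['token_offsets']    - (start, end) pairs from the token boundary generator
#   j_dic['sentence_offsets'] - (start, end) pairs from the sentence boundary generator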
Example #3
File: document.py  Project: edycop/brat
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None):
    if raw_text is not None:
        # looks like somebody read this already; nice
        text = raw_text
    else:
        # need to read raw text
        try:
            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
        except IOError:
            raise UnableToReadTextFile(txt_file_path)
        except UnicodeDecodeError:
            Messager.error('Error reading text file: nonstandard encoding or binary?', -1)
            raise UnableToReadTextFile(txt_file_path)

    # TODO XXX huge hack, sorry, the client currently crashing on
    # chrome for two or more consecutive space, so replace every
    # second with literal non-breaking space. Note that this is just
    # for the client display -- server-side storage is not affected.
    # NOTE: it might be possible to fix this in a principled way by
    # having xml:space="preserve" on the relevant elements.
    text = text.replace("  ", ' '+unichr(0x00A0))

    j_dic['text'] = text
    
    from logging import info as log_info

    # First, generate tokenisation
    if JAPANESE:
        from tokenise import jp_token_boundary_gen
        token_offsets = [o for o in jp_token_boundary_gen(text)]
    else:
        from tokenise import en_token_boundary_gen
        token_offsets = [o for o in en_token_boundary_gen(text)]
    j_dic['token_offsets'] = token_offsets

    if NEWLINE_SS:
        from ssplit import newline_sentence_boundary_gen
        sentence_offsets = [o for o in newline_sentence_boundary_gen(text)]
    elif JAPANESE:
        from ssplit import jp_sentence_boundary_gen
        sentence_offsets = [o for o in jp_sentence_boundary_gen(text)]
        #log_info('offsets: ' + str(offsets))
    else:
        from ssplit import en_sentence_boundary_gen
        sentence_offsets = [o for o in en_sentence_boundary_gen(text)]
        #log_info('offsets: ' + str(sentence_offsets))
    j_dic['sentence_offsets'] = sentence_offsets

    return True
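Both versions of _enrich_json_with_text are Python 2 code: unichr does not exist in Python 3. If the same non-breaking-space workaround were needed under Python 3, a hedged equivalent of that single line would be:

# Python 3 sketch of the double-space workaround above; not taken from brat.
text = text.replace("  ", " \u00A0")  # "\u00A0" == chr(0x00A0), the no-break space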