Example #1
def convert(data, src):
    """
    Conversion utility

    :param str data: raw data from third party tool
    :param str src: source type
    :returns dict: a complete message with text and annotations
    """
    # Fail early if we don't have a converter
    try:
        conv_text, conv_ann = CONV_BY_SRC[src]
    except KeyError:
        raise InvalidSrcFormat(src)

    # Note: Due to a lack of refactoring we need to write to disk to read
    #   annotations; once this is fixed, the below code needs some clean-up
    tmp_dir = None
    try:
        tmp_dir = mkdtemp()
        doc_base = path_join(tmp_dir, 'tmp')
        with open_textfile(doc_base + '.txt', 'w') as txt_file:
            txt_file.write(conv_text(data))
        with open(doc_base + '.ann', 'w'):
            pass

        with Annotations(doc_base) as ann_obj:
            for ann in conv_ann(data):
                ann_obj.add_annotation(ann)

        json_dic = _document_json_dict(doc_base)
        # Note: Blank the comments; they rarely do anything useful beyond
        #   whining about configuration when we use the tool solely for
        #   visualisation purposes
        json_dic['comments'] = []

        # Note: This is an ugly hack... we want to ride along with the
        #   Stanford tokenisation and sentence splits when returning their
        #   output rather than relying on the ones generated by arat.
        if src.startswith('stanford-'):
            json_dic['token_offsets'] = stanford_token_offsets(data)
            json_dic['sentence_offsets'] = stanford_sentence_offsets(data)

        return json_dic
    finally:
        if tmp_dir is not None:
            rmtree(tmp_dir)
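
A minimal usage sketch follows; the 'stanford-pos' source key and the stanford_xml_output payload are hypothetical stand-ins, since the actual CONV_BY_SRC keys are defined elsewhere in arat:

# Hypothetical call: turn raw Stanford tool output into an arat document dict.
# Any src not registered in CONV_BY_SRC raises InvalidSrcFormat.
stanford_xml_output = '...'  # raw output captured from the third-party tool
json_dic = convert(stanford_xml_output, 'stanford-pos')
print(json_dic['text'])
print(json_dic['token_offsets'])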
Example #2
def download_file(document, collection, extension):
    """
    Send file content
    TODO: don't rely on NoPrintJSONError to encapsulate the data
          a better mechanism must be designed
    TODO: potential security hole, extensive tests must be written
    """
    directory = collection
    real_dir = real_directory(directory)
    fname = '%s.%s' % (document, extension)
    fpath = join(real_dir, fname)

    hdrs = [('Content-Type', 'text/plain; charset=utf-8'),
            ('Content-Disposition', 'inline; filename=%s' % fname)]
    with open_textfile(fpath, 'r') as txt_file:
        data = txt_file.read().encode('utf-8')

    return hdrs, data
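
A usage sketch; the document, collection, and extension values are hypothetical, and the caller is assumed to turn the returned header list and bytes into an HTTP response:

# Hypothetical call: serve 'doc42.ann' from the 'tutorial' collection.
hdrs, data = download_file('doc42', 'tutorial', 'ann')
for name, value in hdrs:
    print('%s: %s' % (name, value))
print(data.decode('utf-8'))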
Example #3
def save_import(text, docid, collection=None):
    '''
    TODO: DOC:
    '''

    directory = collection

    if directory is None:
        dir_path = DATA_DIR
    else:
        # XXX: These "security" measures can surely be fooled
        if (directory.count('../') or directory == '..'):
            raise InvalidDirError(directory)

        dir_path = real_directory(directory)

    # Is the directory a directory and are we allowed to write?
    if not isdir(dir_path):
        raise InvalidDirError(dir_path)
    if not access(dir_path, W_OK):
        raise NoWritePermissionError(dir_path)

    base_path = join_path(dir_path, docid)
    txt_path = base_path + '.' + TEXT_FILE_SUFFIX
    ann_path = base_path + '.' + JOINED_ANN_FILE_SUFF

    # Before we proceed, verify that we are not overwriting
    for path in (txt_path, ann_path):
        if isfile(path):
            raise FileExistsError(path)

    # Make sure we have a valid POSIX text file, i.e. that the
    # file ends in a newline.
    if text != "" and text[-1] != '\n':
        text = text + '\n'

    with open_textfile(txt_path, 'w') as txt_file:
        txt_file.write(text)

    # Touch the ann file so that we can edit the file later
    with open(ann_path, 'w') as _:
        pass

    return {'document': docid}
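
A usage sketch, assuming DATA_DIR is writable; the document id and text are hypothetical, and calling it twice with the same id raises FileExistsError since both target files then exist:

# Hypothetical call: create 'new-doc.txt' plus an empty 'new-doc.ann'
# in the default DATA_DIR (collection=None).
result = save_import('A first sentence.\nA second one.', 'new-doc')
assert result == {'document': 'new-doc'}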
Example #4
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None):
    if raw_text is not None:
        # looks like somebody read this already; nice
        text = raw_text
    else:
        # need to read raw text
        try:
            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
        except IOError:
            raise UnableToReadTextFile(txt_file_path)
        except UnicodeDecodeError:
            Messager.error(
                'Error reading text file: nonstandard encoding or binary?', -1)
            raise UnableToReadTextFile(txt_file_path)

    j_dic['text'] = text

    tokeniser = options_get_tokenization(dirname(txt_file_path))

    # First, generate tokenisation
    tok_offset_gen = tokeniser_by_name(tokeniser)
    j_dic['token_offsets'] = [o for o in tok_offset_gen(text)]

    ssplitter = options_get_ssplitter(dirname(txt_file_path))
    if ssplitter == 'newline':
        from arat.server.ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    elif ssplitter == 'regex':
        from arat.server.ssplit import regex_sentence_boundary_gen
        ss_offset_gen = regex_sentence_boundary_gen
    else:
        Messager.warning('Unrecognized sentence splitting option, '
                         'reverting to newline sentence splitting.')
        from arat.server.ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    j_dic['sentence_offsets'] = [o for o in ss_offset_gen(text)]

    return True
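
A usage sketch; the path and in-memory text are hypothetical. Passing raw_text skips the file read, although the tokeniser and sentence-splitter configuration is still looked up from the file's directory:

# Hypothetical call: enrich a fresh response dict with text already in memory.
j_dic = {}
_enrich_json_with_text(j_dic, '/data/corpus/doc1.txt',
                       raw_text='One sentence.\nAnother sentence.\n')
print(j_dic['token_offsets'])
print(j_dic['sentence_offsets'])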
Example #5
def __read_or_default(filename, default):
    """
    >>> config_filename = "data/example-data/corpora/CoNLL-ST_2006/tools.conf"
    >>> config = __read_or_default(config_filename, six.u("missed"))
    >>> six.u("Tokens") in config
    True


    >>> __read_or_default("does/not/exists/tools.conf", six.u("missed")) == six.u("missed")
    True

    """

    try:
        with open_textfile(filename, 'r') as f:
            return f.read()
    except IOError:
        # TODO: specific exception handling and reporting
        # (FileNotFoundError is a subclass of IOError, so it is caught here)
        return default
Example #6
if __name__ == '__main__':
    from sys import argv

    from arat.server.annotation import open_textfile

    def _text_by_offsets_gen(text, offsets):
        for start, end in offsets:
            yield text[start:end]

    if len(argv) > 1:
        try:
            for txt_file_path in argv[1:]:
                print()
                print('### Splitting:', txt_file_path)
                with open_textfile(txt_file_path, 'r') as txt_file:
                    text = txt_file.read()
                print('# Original text:')
                print(text.replace('\n', '\\n'))
                offsets = [o for o in newline_sentence_boundary_gen(text)]
                print('# Offsets:')
                print(offsets)
                print('# Sentences:')
                for sentence in _text_by_offsets_gen(text, offsets):
                    # These should only be allowed when coming from original
                    #   explicit newlines.
                    #assert sentence, 'blank sentences disallowed'
                    # assert not sentence[0].isspace(), (
                    #        'sentence may not start with white-space "%s"' % sentence)
                    print('"%s"' % sentence.replace('\n', '\\n'))
        except IOError:
            pass  # silently ignore I/O errors such as a broken pipe
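
The demo leans on newline_sentence_boundary_gen from the enclosing module; judging from how the offsets are consumed above, it yields (start, end) character pairs such that text[start:end] is one sentence. A minimal sketch of a generator with that contract (an assumption for illustration, not arat's actual implementation):

def _newline_sentence_boundary_sketch(text):
    # Yield one (start, end) offset pair per newline-delimited line so that
    # text[start:end] reproduces the sentence without its trailing newline.
    start = 0
    for line in text.splitlines(True):  # keepends=True preserves offsets
        yield (start, start + len(line.rstrip('\n')))
        start += len(line)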
Example #7
def get_directory_information(collection, user):
    directory = collection

    real_dir = real_directory(directory)

    assert_allowed_to_read(real_dir, user)

    # Get the document names
    base_names = [
        fn[0:-4] for fn in _listdir(real_dir, user) if fn.endswith('txt')
    ]

    doclist = base_names[:]
    doclist_header = [("Document", "string")]

    # Then get the modification times
    doclist_with_time = []
    for file_name in doclist:
        file_path = path_join(DATA_DIR, real_dir,
                              file_name + "." + JOINED_ANN_FILE_SUFF)
        doclist_with_time.append([file_name, _getmtime(file_path)])
    doclist = doclist_with_time
    doclist_header.append(("Modified", "time"))

    try:
        stats_types, doc_stats = get_statistics(real_dir, base_names)
    except OSError:
        # something like missing access permissions?
        raise CollectionNotAccessibleError

    doclist = [doclist[i] + doc_stats[i] for i in range(len(doclist))]
    doclist_header += stats_types

    dirlist = [
        i for i in _listdir(real_dir, user) if isdir(path_join(real_dir, i))
    ]
    # just in case, and for generality
    dirlist = [[i] for i in dirlist]

    # check whether at root, ignoring e.g. possible trailing slashes
    if normpath(real_dir) != normpath(DATA_DIR):
        parent = abspath(path_join(real_dir, '..'))[len(DATA_DIR) + 1:]
        # to get consistent processing client-side, add explicitly to list
        dirlist.append([".."])
    else:
        parent = None

    # combine document and directory lists, adding a column
    # differentiating files from directories and an unused column (can
    # point to a specific annotation) required by the protocol.  The
    # values filled here for the first are "c" for "collection"
    # (i.e. directory) and "d" for "document".
    combolist = []
    for i in dirlist:
        combolist.append(["c", None] + i)
    for i in doclist:
        combolist.append(["d", None] + i)

    # plug in the search config too
    search_config = get_search_config(real_dir)

    # ... and the disambiguator config ... this is getting a bit much
    disambiguator_config = get_disambiguator_config(real_dir)

    # ... and the normalization config (TODO: rethink)
    normalization_config = get_normalization_config(real_dir)

    # read in README (if any) to send as a description of the
    # collection
    try:
        with open_textfile(path_join(real_dir, "README")) as txt_file:
            readme_text = txt_file.read()
    except IOError:
        readme_text = None

    # fill in a flag for whether annotator logging is active so that
    # the client knows whether to invoke timing actions
    ann_logging = annotation_logging_active(real_dir)

    # fill in NER services, if any
    ner_taggers = get_annotator_config(real_dir)

    return _inject_annotation_type_conf(real_dir,
                                        json_dic={
                                            'items': combolist,
                                            'header': doclist_header,
                                            'parent': parent,
                                            'messages': [],
                                            'description': readme_text,
                                            'search_config': search_config,
                                            'disambiguator_config':
                                            disambiguator_config,
                                            'normalization_config':
                                            normalization_config,
                                            'annotation_logging': ann_logging,
                                            'ner_taggers': ner_taggers,
                                        })
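
A usage sketch; the collection and user values are hypothetical, and assert_allowed_to_read raises unless the user may read the directory:

# Hypothetical call: list the documents and sub-collections of one directory.
info = get_directory_information('tutorial', 'guest')
print(info['header'])  # e.g. [("Document", "string"), ("Modified", "time"), ...]
for row in info['items']:
    kind, name = row[0], row[2]  # "c" = collection (directory), "d" = document
    print(kind, name)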