Exemplo n.º 1
0
    def construct_all_from_folder(
            dictionaries_folder,
            string_tokenizer=(lambda x: x.split()),
            case_sensitive=False,
            hdfs_url=None,
            hdfs_user=None,
            stop_words=None,
            accepted_extensions=[".dic", "dict", ".txt", ".tsv", ".csv"]):
        """Read all dictionary files found under ``dictionaries_folder``.

        Candidate paths are collected either from HDFS (when
        ``maybe_get_hdfs_client`` returns a truthy client) or from the local
        file system, keeping only those whose name ends with one of
        ``accepted_extensions``. The selected paths are then handed to
        ``__read_dictionaries`` together with the matching read function.

        :param dictionaries_folder: folder to search for dictionary files
        :param string_tokenizer: callable forwarded to ``__read_dictionaries``;
            the default splits a string on whitespace
        :param case_sensitive: forwarded to ``__read_dictionaries``
        :param hdfs_url: forwarded to ``maybe_get_hdfs_client``
        :param hdfs_user: forwarded to ``maybe_get_hdfs_client``
        :param stop_words: forwarded to ``__read_dictionaries``
        :param accepted_extensions: iterable of filename suffixes to accept.
            It is only read, never mutated, so the shared default list is safe.
            NOTE(review): the default entry "dict" has no leading dot, so it
            matches any name ending in "dict" -- confirm this is intended.
        :return: whatever ``__read_dictionaries`` returns
        """
        # str.endswith accepts a tuple of suffixes, so snapshot the extensions
        # once instead of looping over them for every filename.
        suffixes = tuple(accepted_extensions)

        def accept_filename_fun(filename):
            return filename.endswith(suffixes)

        hdfs_client = maybe_get_hdfs_client(hdfs_url, hdfs_user)

        if hdfs_client:
            # hdfs
            dic_paths = walk_hdfs_directory(hdfs_client, dictionaries_folder,
                                            accept_filename_fun)
            read_function = DictionaryFeatureGenerator.__hdfs_read_function(
                hdfs_client)

        else:
            # local file system
            # glob only descends into sub-directories when the pattern contains
            # "**"; with a plain "*" the recursive=True flag was a no-op and
            # only the top level of the folder was scanned.
            pattern = os.path.join(dictionaries_folder, "**", "*")
            dic_paths = (path
                         for path in glob.glob(pattern, recursive=True)
                         if accept_filename_fun(path))
            read_function = DictionaryFeatureGenerator.__localfs_read_function

        return DictionaryFeatureGenerator.__read_dictionaries(
            dic_paths, read_function, string_tokenizer, case_sensitive,
            stop_words)
Exemplo n.º 2
0
    def construct_all_from_paths(
            dictionaries_paths,
            string_tokenizer=(lambda x: x.split()),
            case_sensitive=False,
            hdfs_url=None,
            hdfs_user=None,
            stop_words=None,
            accepted_extensions=[".dic", "dict", ".txt", ".tsv", ".csv"]):
        """Read the dictionaries located at the explicitly given paths.

        :param dictionaries_paths: an iterable of paths, or a single string
            holding whitespace-separated paths (it is split with
            ``str.split()``)
        :param string_tokenizer: callable forwarded to ``__read_dictionaries``;
            the default splits a string on whitespace
        :param case_sensitive: forwarded to ``__read_dictionaries``
        :param hdfs_url: forwarded to ``maybe_get_hdfs_client``
        :param hdfs_user: forwarded to ``maybe_get_hdfs_client``
        :param stop_words: forwarded to ``__read_dictionaries``
        :param accepted_extensions: not used by this method; the paths are
            taken as given, without any extension filtering
        :return: whatever ``__read_dictionaries`` returns
        """
        # isinstance (rather than an exact type() comparison) so that str
        # subclasses are split into paths too, instead of being passed on
        # unsplit and iterated character by character.
        if isinstance(dictionaries_paths, str):
            dictionaries_paths = dictionaries_paths.split()

        hdfs_client = maybe_get_hdfs_client(hdfs_url, hdfs_user)

        # Read through HDFS when a client is available, else from local disk.
        if hdfs_client:
            read_function = DictionaryFeatureGenerator.__hdfs_read_function(
                hdfs_client)
        else:
            read_function = DictionaryFeatureGenerator.__localfs_read_function

        return DictionaryFeatureGenerator.__read_dictionaries(
            dictionaries_paths, read_function, string_tokenizer,
            case_sensitive, stop_words)
Exemplo n.º 3
0
    def __init__(self,
                 directory,
                 read_only_class_id=None,
                 delete_incomplete_docs=True,
                 is_predicted=False,
                 read_relations=False,
                 whole_basename_as_docid=False,
                 raise_exception_on_incosistencies=True,
                 hdfs_url=None,
                 hdfs_user=None):
        """Store the reader settings and obtain an optional HDFS client.

        :param directory: directory containing the ``*.ann.json`` files
        :param read_only_class_id: a single class id or a list of class ids
            restricting which entities are read; ``None`` reads all entities.
            A non-list value is wrapped into a one-element list.
        :param delete_incomplete_docs: drop documents not marked as
            'anncomplete', provided the docs are not predicted
        :param is_predicted: whether the annotation is predicted or real
        :param read_relations: whether relations should be read as well
        :param whole_basename_as_docid: stored unchanged on the instance
        :param raise_exception_on_incosistencies: stored unchanged on the
            instance
        :param hdfs_url: forwarded to ``maybe_get_hdfs_client``
        :param hdfs_user: forwarded to ``maybe_get_hdfs_client``
        """
        self.directory = directory
        """the directory containing *.ann.json files"""

        # Keep None and lists untouched; wrap any other value in a list so the
        # attribute is always either None or a list.
        if read_only_class_id is None or isinstance(read_only_class_id, list):
            class_ids = read_only_class_id
        else:
            class_ids = [read_only_class_id]
        self.read_only_class_id = class_ids
        """whether to read in only entities with given class_id's (single id or list of). Otherwise if None, read all entities"""

        self.delete_incomplete_docs = delete_incomplete_docs
        """delete documents from the dataset that are not marked as 'anncomplete' provided the docs are not predicted"""
        self.is_predicted = is_predicted
        """whether the annotation is predicted or real, which determines where it will be saved"""
        self.read_relations = read_relations
        """whether relations should be read as well"""
        self.whole_basename_as_docid = whole_basename_as_docid
        self.raise_exception_on_incosistencies = raise_exception_on_incosistencies

        # Resolved last, after all plain settings have been stored.
        client = maybe_get_hdfs_client(hdfs_url, hdfs_user)
        self.hdfs_client = client
Exemplo n.º 4
0
    def __init__(self,
                 path,
                 whole_basename_as_docid=False,
                 hdfs_url=None,
                 hdfs_user=None):
        """Store the reader settings and obtain an optional HDFS client.

        :param path: an html file or a directory containing .html files
        :param whole_basename_as_docid: stored unchanged on the instance
        :param hdfs_url: forwarded to ``maybe_get_hdfs_client``
        :param hdfs_user: forwarded to ``maybe_get_hdfs_client``
        """
        self.path = path
        """an html file or a directory containing .html files"""
        self.whole_basename_as_docid = whole_basename_as_docid

        # Resolved last, after the plain settings have been stored.
        client = maybe_get_hdfs_client(hdfs_url, hdfs_user)
        self.hdfs_client = client