Example #1
    def __init__(self,
                 np2vec_model_file,
                 binary=False,
                 word_ngrams=False,
                 grouping=False):
        """
        Load the np2vec model for set expansion.

        Args:
            np2vec_model_file (str): the file containing the np2vec model to load
            binary (bool): boolean indicating whether the np2vec model to load is in binary format
            word_ngrams (int {1,0}): If 1, the np2vec model to load uses word vectors with
                subword (ngrams) information.
            grouping (bool): boolean indicating whether to load the grouping maps
                (np2id, id2rep, id2group) used for NP grouping.
        """
        self.grouping = grouping
        if grouping:
            # load grouping info
            logger.info('loading grouping data')
            self.id2rep = load_json_file(path.join(cur_dir, 'id2rep'))
            self.np2id = load_json_file(path.join(cur_dir, 'np2id'))
            self.id2group = load_json_file(path.join(cur_dir, 'id2group'))
        logger.info('loading model...')
        self.np2vec_model = NP2vec.load(np2vec_model_file,
                                        binary=binary,
                                        word_ngrams=word_ngrams)
        # extract the first term of the model in order to get the marking character
        logger.info('compute L2 norm')
        first_term = next(iter(self.np2vec_model.vocab.keys()))
        self.mark_char = first_term[-1]
        # Precompute L2-normalized vectors.
        self.np2vec_model.init_sims()
        logger.info('done init')
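Every example on this page revolves around load_json_file. The helper itself is never shown here; below is a minimal sketch of what it presumably looks like, assuming it is a thin wrapper around json.load (this implementation is an assumption, not the library's actual code):

import json

def load_json_file(file_path):
    # Assumed behavior: parse a JSON file from disk and return the resulting object.
    with open(file_path, encoding='utf-8') as f:
        return json.load(f)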
Example #2
    @staticmethod
    def is_stop(token: str) -> bool:
        if not StringUtils.stop_words:
            StringUtils.stop_words = load_json_file(STOP_WORDS_FILE)
            StringUtils.stop_words.extend(DISAMBIGUATION_CATEGORY)
        return token in StringUtils.stop_words
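Hypothetical usage of the lazy-loading pattern above, assuming is_stop is a static method on StringUtils: the word list is read from disk only once, on the first call, and served from the class-level cache on every call after that.

StringUtils.is_stop('the')     # first call: triggers load_json_file(STOP_WORDS_FILE)
StringUtils.is_stop('galaxy')  # later calls reuse the cached StringUtils.stop_words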
Example #3
    @staticmethod
    def is_pronoun(in_str: str) -> bool:
        if not StringUtils.pronouns:
            StringUtils.pronouns = load_json_file(PRONOUN_FILE)

        tokens = in_str.split()
        return len(tokens) == 1 and tokens[0] in StringUtils.pronouns
Example #4
    @staticmethod
    def is_determiner(in_str: str) -> bool:
        if not StringUtils.determiners:
            StringUtils.determiners = load_json_file(DETERMINERS_FILE)

        tokens = in_str.split()
        return len(tokens) == 1 and tokens[0] in StringUtils.determiners
Example #5
    def load_mentions_from_file(self, mentions_file_path: str) -> List[Topic]:
        start_data_load = time.time()
        logger.info('Loading mentions from %s', mentions_file_path)
        mentions = load_json_file(mentions_file_path)
        topics = self.order_mentions_by_topics(mentions)
        end_data_load = time.time()
        took_load = end_data_load - start_data_load
        logger.info('Mentions file %s took %.4f sec to load', mentions_file_path, took_load)
        return topics
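A hypothetical call, assuming reader is an instance of the class defining this method and that the file follows the mentions JSON layout expected by order_mentions_by_topics:

topics = reader.load_mentions_from_file('/data/mentions.json')
print(len(topics), 'topics loaded')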
Example #6
    @staticmethod
    def is_preposition(in_str: str) -> bool:
        if not StringUtils.preposition:
            StringUtils.preposition = load_json_file(PREPOSITION_FILE)

        tokens = in_str.split()
        return len(tokens) == 1 and tokens[0] in StringUtils.preposition
Example #7
    def __init__(self, wd_file: str):
        """
        Extract Relation between two mentions according to Within document co-reference

        Args:
            wd_file (required): str Location of within doc co-reference mentions file
        """
        wd_mentions_json = load_json_file(wd_file)
        self.within_doc_coref_chain = self.arrange_resource(wd_mentions_json)
        super(WithinDocCoref, self).__init__()
Example #8
    def __init__(
        self,
        np2vec_model_file,
        binary=False,
        word_ngrams=False,
        grouping=False,
        light_grouping=False,
        grouping_map_dir=None,
    ):
        """
        Load the np2vec model for set expansion.

        Args:
            np2vec_model_file (str): the file containing the np2vec model to load
            binary (bool): boolean indicating whether the np2vec model to load is in binary format
            word_ngrams (int {1,0}): If 1, the np2vec model to load uses word vectors with
                subword (ngrams) information.
            grouping (bool): boolean indicating whether to load the grouping maps for NP grouping.
            light_grouping (bool): boolean indicating whether to load only the np2id map
                instead of all the grouping maps.
            grouping_map_dir (str): path to the directory containing the maps for grouping;
                defaults to the directory of np2vec_model_file.
        """
        self.grouping = grouping
        if grouping:
            # load grouping info
            logger.info("loading grouping data")
            if not grouping_map_dir:
                grouping_map_dir = path.dirname(np2vec_model_file)
            self.np2id = load_json_file(path.join(grouping_map_dir, "np2id"))
            if not light_grouping:
                self.id2rep = load_json_file(path.join(grouping_map_dir, "id2rep"))
                self.id2group = load_json_file(path.join(grouping_map_dir, "id2group"))
        logger.info("loadind model...")
        self.np2vec_model = NP2vec.load(np2vec_model_file, binary=binary, word_ngrams=word_ngrams)
        # extract the first term of the model in order to get the marking character
        logger.info("compute L2 norm")
        first_term = next(iter(self.np2vec_model.vocab.keys()))
        self.mark_char = first_term[-1]
        # Precompute L2-normalized vectors.
        self.np2vec_model.init_sims()
        logger.info("done init")
Example #9
    def __init__(self, method: OnlineOROfflineMethod, ref_dict: str = None):
        """
        Extract Relation between two mentions according to Referent Dictionary knowledge

        Args:
            method (required): OnlineOROfflineMethod.{ONLINE/OFFLINE} run against the full referent
                dictionary or a sub-set of it
            ref_dict (required): str Location of referent dictionary file to work with
        """
        logger.info('Loading ReferentDict module')
        if method == OnlineOROfflineMethod.OFFLINE:
            self.ref_dict = load_json_file(ref_dict)
        elif method == OnlineOROfflineMethod.ONLINE:
            self.ref_dict = self.load_reference_dict(ref_dict)
        logger.info('ReferentDict module loaded successfully')
        super(ReferentDictRelationExtraction, self).__init__()
Example #10
    def load_dump(self, wn_dump):
        onlyfiles = []
        for _file in listdir(wn_dump):
            file_path = join(wn_dump, _file)
            if isfile(file_path):
                onlyfiles.append(file_path)

        json_dump_list = {}
        for _file in onlyfiles:
            json_dump_list.update(load_json_file(_file))

        dump_final = {}
        for key, value in json_dump_list.items():
            dump_final[key] = self.extract_json_values(value)

        return dump_final
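Hypothetical usage: wn_dump is a directory whose JSON files are merged into a single dictionary, with every value post-processed by extract_json_values (the path below and the loader instance are illustrative).

dump = loader.load_dump('/data/wordnet_dump')  # loader: instance of the defining class
print(len(dump), 'entries merged from the dump directory')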
Example #11
    def __init__(self, method: OnlineOROfflineMethod, vo_file: str):
        """
        Extract Relation between two mentions according to VerbOcean knowledge

        Args:
            method (required): OnlineOROfflineMethod.{ONLINE/OFFLINE} run against full VerbOcean or
                a sub-set of it
            vo_file (required): str Location of VerbOcean file to work with
        """
        logger.info('Loading Verb Ocean module')
        if method == OnlineOROfflineMethod.OFFLINE:
            self.vo = load_json_file(vo_file)
        elif method == OnlineOROfflineMethod.ONLINE:
            self.vo = self.load_verbocean_file(vo_file)
        logger.info('Verb Ocean module loaded successfully')
        super(VerboceanRelationExtraction, self).__init__()
Example #12
    def __init__(self, wd_file: str):
        """
        Extract Relation between two mentions according to Within document co-reference

        Args:
            wd_file (required): str Location of within doc co-reference mentions file
        """
        logger.info("Loading Within doc resource")
        if wd_file is not None and os.path.isfile(wd_file):
            wd_mentions_json = load_json_file(wd_file)
            self.within_doc_coref_chain = self.arrange_resource(
                wd_mentions_json)
        else:
            raise FileNotFoundError(
                "Within-doc resource file not found or not in path")
        super(WithinDocCoref, self).__init__()
Example #13
    @classmethod
    def from_config(cls, word_vocab_size: int, num_labels: int, config: str):
        """
        Load a model from a configuration file
        A valid configuration file is a JSON file with fields as in class `__init__`

        Args:
            word_vocab_size (int): word vocabulary size
            num_labels (int): number of labels (classifier)
            config (str): path to configuration file

        Returns:
            IDCNN: IDCNNEmbedder module pre-configured
        """
        if not os.path.exists(config):
            raise FileNotFoundError
        cfg = load_json_file(config)
        return cls(word_vocab_size=word_vocab_size, num_labels=num_labels, **cfg)
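Hypothetical usage, assuming 'idcnn.json' holds the keyword arguments accepted by IDCNN.__init__ (the exact field names are model-specific and not shown in this snippet):

model = IDCNN.from_config(word_vocab_size=10000, num_labels=9, config='idcnn.json')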
Example #14
    @staticmethod
    def read_mentions_json_to_mentions_data_list(mentions_json_file: str):
        """
        Read a mentions JSON file into a list of MentionData objects.

        Args:
            mentions_json_file: the path of the mentions json file to read

        Returns:
            List[MentionData]
        """
        all_mentions_only = load_json_file(mentions_json_file)

        mentions = []
        for mention_line in all_mentions_only:
            mention_data = MentionData.read_json_mention_data_line(mention_line)
            mentions.append(mention_data)

        return mentions
Example #15
    def __init__(self,
                 method: OnlineOROfflineMethod = OnlineOROfflineMethod.ONLINE,
                 vo_file: str = None):
        """
        Extract Relation between two mentions according to VerbOcean knowledge

        Args:
            method (optional): OnlineOROfflineMethod.{ONLINE/OFFLINE} run against full VerbOcean or
                a sub-set of it (default = ONLINE)
            vo_file (required): str Location of VerbOcean file to work with
        """
        logger.info('Loading Verb Ocean module')
        if vo_file is not None and os.path.isfile(vo_file):
            if method == OnlineOROfflineMethod.OFFLINE:
                self.vo = load_json_file(vo_file)
            elif method == OnlineOROfflineMethod.ONLINE:
                self.vo = self.load_verbocean_file(vo_file)
            logger.info('Verb Ocean module loaded successfully')
        else:
            raise FileNotFoundError(
                'VerbOcean file not found or not in path..')
        super(VerboceanRelationExtraction, self).__init__()
Example #16
    def __init__(self,
                 method: OnlineOROfflineMethod = OnlineOROfflineMethod.ONLINE,
                 ref_dict: str = None):
        """
        Extract Relation between two mentions according to Referent Dictionary knowledge

        Args:
            method (optional): OnlineOROfflineMethod.{ONLINE/OFFLINE} run against the full referent
                dictionary or a sub-set of it (default = ONLINE)
            ref_dict (required): str Location of referent dictionary file to work with
        """
        logger.info('Loading ReferentDict module')
        if ref_dict is not None and os.path.isfile(ref_dict):
            if method == OnlineOROfflineMethod.OFFLINE:
                self.ref_dict = load_json_file(ref_dict)
            elif method == OnlineOROfflineMethod.ONLINE:
                self.ref_dict = self.load_reference_dict(ref_dict)
            logger.info('ReferentDict module loaded successfully')
        else:
            raise FileNotFoundError(
                'Referent Dict file not found or not in path:' + ref_dict)

        super(ReferentDictRelationExtraction, self).__init__()
Example #17
def get_from_cache(url: str, cache_dir: str = None) -> str:
    """
    Given a URL, look for the corresponding dataset in the local cache.
    If it's not there, download it. Then return the path to the cached file.
    """
    if cache_dir is None:
        cache_dir = MODEL_CACHE

    os.makedirs(cache_dir, exist_ok=True)

    response = requests.head(url, allow_redirects=True)
    if response.status_code != 200:
        raise IOError("HEAD request failed for url {} with status code {}"
                      .format(url, response.status_code))
    etag = response.headers.get("ETag")

    filename = url_to_filename(url, etag)

    # get cache path to put the file
    cache_path = os.path.join(cache_dir, filename)

    need_downloading = True

    if os.path.exists(cache_path):
        # check if the etag has changed by comparing it with the stored metadata
        if url.split('/')[-1].endswith('zip'):
            meta_path = cache_path + '.json'
        else:
            meta_path = cache_path + '_meta_' + '.json'
        # guard against a cache entry whose metadata file is missing
        # (e.g. an earlier run was interrupted between the two writes)
        if os.path.exists(meta_path):
            meta = load_json_file(meta_path)
            if meta['etag'] == etag:
                logger.info('file already present')
                need_downloading = False

    if need_downloading:
        print("File not present or etag changed")
        # Download to temporary file, then copy to cache dir once finished.
        # Otherwise you get corrupt cache entries if the download gets interrupted.
        with tempfile.NamedTemporaryFile() as temp_file:
            logger.info("%s not found in cache, downloading to %s", url, temp_file.name)

            # GET file object
            http_get(url, temp_file)

            # we are copying the file before closing it, so flush to avoid truncation
            temp_file.flush()
            # shutil.copyfileobj() starts at the current position, so go to the start
            temp_file.seek(0)

            logger.info("copying %s to cache at %s", temp_file.name, cache_path)
            with open(cache_path, 'wb') as cache_file:
                shutil.copyfileobj(temp_file, cache_file)

            logger.info("creating metadata file for %s", cache_path)
            meta = {'url': url, 'etag': etag}
            if url.split('/')[-1].endswith('zip'):
                meta_path = cache_path + '.json'
            else:
                meta_path = cache_path + '_meta_' + '.json'
            with open(meta_path, 'w') as meta_file:
                json.dump(meta, meta_file)

            logger.info("removing temp file %s", temp_file.name)

    return cache_path, need_downloading
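Hypothetical usage (the URL is illustrative): the first call downloads the file and writes both the cached copy and its ETag metadata; a later call with an unchanged ETag is served from the cache.

path, downloaded = get_from_cache('https://example.com/models/model.zip')
print(path, '(downloaded)' if downloaded else '(cache hit)')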