Пример #1
0
    def __init__(self, nlp=None, greedyness=0.5, max_dist=50, max_dist_match=500, conll=None, use_no_coref_list=True, debug=False):
        self.greedyness = greedyness
        self.max_dist = max_dist
        self.max_dist_match = max_dist_match
        self.debug = debug
        
        if nlp is None:
            print("Loading spacy model")
            try:
                spacy.info('en_core_web_sm')
                model = 'en_core_web_sm'
            except IOError:
                print("No spacy 2 model detected, using spacy1 'en' model")
                model = 'en'
            nlp = spacy.load(model)
        
        model_path = os.path.join(PACKAGE_DIRECTORY, "weights/conll/" if conll is not None else "weights/")
        embed_model_path = os.path.join(PACKAGE_DIRECTORY, "weights/")
        print("loading model from", model_path)
        self.data = Data(nlp, model_path=embed_model_path, conll=conll, use_no_coref_list=use_no_coref_list, consider_speakers=conll)
        self.coref_model = Model(model_path)

        self.clusters = {}
        self.mention_to_cluster = []
        self.mentions_single_scores = {}
        self.mentions_single_features = {}
        self.mentions_pairs_scores = {}
        self.mentions_pairs_features = {}
Пример #2
0
    def __init__(self, nlp=None, greedyness=0.5, max_dist=50, max_dist_match=500, conll=None, use_no_coref_list=True, debug=False):
        self.greedyness = greedyness
        self.max_dist = max_dist
        self.max_dist_match = max_dist_match
        self.debug = debug
        
        if nlp is None:
            print("Loading spacy model")
            try:
                spacy.info('en_core_web_sm')
                model = 'en_core_web_sm'
            except IOError:
                print("No spacy 2 model detected, using spacy1 'en' model")
                model = 'en'
            nlp = spacy.load(model)
        
        model_path = os.path.join(PACKAGE_DIRECTORY, "weights/conll/" if conll is not None else "weights/")
        embed_model_path = os.path.join(PACKAGE_DIRECTORY, "weights/")
        print("loading model from", model_path)
        self.data = Data(nlp, model_path=embed_model_path, conll=conll, use_no_coref_list=use_no_coref_list, consider_speakers=conll)
        self.coref_model = Model(model_path)

        self.clusters = {}
        self.mention_to_cluster = []
        self.mentions_single_scores = {}
        self.mentions_single_features = {}
        self.mentions_pairs_scores = {}
        self.mentions_pairs_features = {}
Пример #3
0
    def __init__(self,
                 nlp=None,
                 spacy_model=SPACY_MODEL,
                 greedyness=GREEDYNESS,
                 max_dist=MAX_DIST,
                 max_dist_match=MAX_DIST_MATCH,
                 max_follow_up=MAX_FOLLOW_UP,
                 conll=None,
                 use_no_coref_list=True,
                 debug=False):
        self.greedyness = greedyness
        self.max_dist = max_dist
        self.max_dist_match = max_dist_match
        self.max_follow_up = max_follow_up
        self.debug = debug

        print(
            f'PARAMS: greedyness: {greedyness}, max_dist: {max_dist}, '
            f'max_dist_match: {max_dist_match}, max_follow_up {max_follow_up}')

        if nlp is None:
            print("Loading spacy model")
            nlp = spacy.load(spacy_model)

        model_path = os.path.join(
            PACKAGE_DIRECTORY,
            "weights/conll/" if conll is not None else "weights/")
        embed_model_path = os.path.join(PACKAGE_DIRECTORY, "weights/")
        print("loading model from", model_path)
        self.data = Data(nlp,
                         model_path=embed_model_path,
                         conll=conll,
                         use_no_coref_list=use_no_coref_list,
                         consider_speakers=conll)
        self.coref_model = Model(model_path)

        self.clusters = {}
        self.mention_to_cluster = []
        self.mentions_single_scores = {}
        self.mentions_single_features = {}
        self.mentions_pairs_scores = {}
        self.mentions_pairs_features = {}
Пример #4
0
class Coref:
    '''
    Main coreference resolution algorithm
    '''
    def __init__(self, nlp=None, greedyness=0.5, max_dist=50, max_dist_match=500, conll=None, use_no_coref_list=True, debug=False):
        self.greedyness = greedyness
        self.max_dist = max_dist
        self.max_dist_match = max_dist_match
        self.debug = debug
        
        if nlp is None:
            print("Loading spacy model")
            try:
                spacy.info('en_core_web_sm')
                model = 'en_core_web_sm'
            except IOError:
                print("No spacy 2 model detected, using spacy1 'en' model")
                model = 'en'
            nlp = spacy.load(model)
        
        model_path = os.path.join(PACKAGE_DIRECTORY, "weights/conll/" if conll is not None else "weights/")
        embed_model_path = os.path.join(PACKAGE_DIRECTORY, "weights/")
        print("loading model from", model_path)
        self.data = Data(nlp, model_path=embed_model_path, conll=conll, use_no_coref_list=use_no_coref_list, consider_speakers=conll)
        self.coref_model = Model(model_path)

        self.clusters = {}
        self.mention_to_cluster = []
        self.mentions_single_scores = {}
        self.mentions_single_features = {}
        self.mentions_pairs_scores = {}
        self.mentions_pairs_features = {}

    ###################################
    #### ENTITY CLUSTERS FUNCTIONS ####
    ###################################

    def _prepare_clusters(self):
        '''
        Clean up and prepare one cluster for each mention
        '''
        self.mention_to_cluster = list(range(len(self.data.mentions)))
        self.clusters = dict((i, [i]) for i in self.mention_to_cluster)
        self.mentions_single_scores = {}
        self.mentions_single_features = {}
        self.mentions_pairs_scores = {}
        self.mentions_pairs_features = {}
        for mention in self.mention_to_cluster:
            self.mentions_single_scores[mention] = None
            self.mentions_single_features[mention] = None
            self.mentions_pairs_scores[mention] = {}
            self.mentions_pairs_features[mention] = {}

    def _merge_coreference_clusters(self, ant_idx, mention_idx):
        '''
        Merge two clusters together
        '''
        if self.mention_to_cluster[ant_idx] == self.mention_to_cluster[mention_idx]:
            return

        remove_id = self.mention_to_cluster[ant_idx]
        keep_id = self.mention_to_cluster[mention_idx]
        for idx in self.clusters[remove_id]:
            self.mention_to_cluster[idx] = keep_id
            self.clusters[keep_id].append(idx)

        del self.clusters[remove_id]

    def display_clusters(self):
        '''
        Print clusters informations
        '''
        print(self.clusters)
        for key, mentions in self.clusters.items():
            print("cluster", key, "(", ", ".join(str(self.data[m]) for m in mentions), ")")

    ###################################
    ####### MAIN COREF FUNCTIONS ######
    ###################################

    def run_coref_on_mentions(self, mentions):
        '''
        Run the coreference model on a mentions iterator or list
        '''
        best_ant = {}
        n_ant = 0
        for mention_idx, ant_list in self.data.get_candidate_pairs(mentions, self.max_dist, self.max_dist_match):
            mention = self.data[mention_idx]
            feats_, ana_feats = self.data.get_single_mention_features(mention)
            anaphoricity_score = self.coref_model.get_single_mention_score(mention.embedding, ana_feats)
            self.mentions_single_scores[mention_idx] = anaphoricity_score
            self.mentions_single_features[mention_idx] = {"spansEmbeddings": mention.spans_embeddings_, "wordsEmbeddings": mention.words_embeddings_, "features": feats_}

            best_score = anaphoricity_score - 50 * (self.greedyness - 0.5)
            for ant_idx in ant_list:
                antecedent = self.data[ant_idx]
                feats_, pwf = self.data.get_pair_mentions_features(antecedent, mention)
                score = self.coref_model.get_pair_mentions_score(antecedent, mention, pwf)
                self.mentions_pairs_scores[mention_idx][ant_idx] = score
                self.mentions_pairs_features[mention_idx][ant_idx] = {"pairFeatures": feats_, "antecedentSpansEmbeddings": antecedent.spans_embeddings_,
                                                                      "antecedentWordsEmbeddings": antecedent.words_embeddings_,
                                                                      "mentionSpansEmbeddings": mention.spans_embeddings_,
                                                                      "mentionWordsEmbeddings": mention.words_embeddings_ }

                if score > best_score:
                    best_score = score
                    best_ant[mention_idx] = ant_idx
            if mention_idx in best_ant:
                n_ant += 1
                self._merge_coreference_clusters(best_ant[mention_idx], mention_idx)
        return (n_ant, best_ant)

    def run_coref_on_utterances(self, last_utterances_added=False, follow_chains=True):
        ''' Run the coreference model on some utterances

        Arg:
            last_utterances_added: run the coreference model over the last utterances added to the data
            follow_chains: follow coreference chains over previous utterances
        '''
        self._prepare_clusters()
        mentions = list(self.data.get_candidate_mentions(last_utterances_added=last_utterances_added))
        n_ant, antecedents = self.run_coref_on_mentions(mentions)
        mentions = antecedents.values()
        if follow_chains and n_ant > 0:
            i = 0
            while i < MAX_FOLLOW_UP:
                i += 1
                n_ant, antecedents = self.run_coref_on_mentions(mentions)
                mentions = antecedents.values()
                if n_ant == 0:
                    break

    def one_shot_coref(self, utterances, utterances_speakers_id=None, context=None,
                       context_speakers_id=None, speakers_names=None):
        ''' Clear history, load a list of utterances and run the coreference model on them

        Arg:
            utterances : iterator or list of string corresponding to successive utterances
            utterances_speaker : iterator or list of speaker id for each utterance.
                If not provided, assume two speakers speaking alternatively.
                if utterances and utterances_speaker are not of the same length padded with None
            context : same as utterances but coreferences are not computed for this,
                      only used as possible antecedent to utterances mentions
            context_speaker : same as utterances_speaker
            speakers_names : dictionnary of list of acceptable speaker names for each speaker id
        Return:
            clusters of entities with coreference resolved
        '''
        self.data.set_utterances(context, context_speakers_id, speakers_names)
        self.continuous_coref(utterances, utterances_speakers_id, speakers_names)
        return self.get_clusters()

    def continuous_coref(self, utterances, utterances_speakers_id=None, speakers_names=None):
        '''
        Same as one-shot coref but don't clear the history.
        Only resolve coreferences for the mentions in the utterances
        (but use the mentions in previously loaded utterances as possible antecedents)
        '''
        self.data.add_utterances(utterances, utterances_speakers_id, speakers_names)
        self.run_coref_on_utterances(last_utterances_added=True, follow_chains=True)
        return self.get_clusters()

    ###################################
    ###### INFORMATION RETRIEVAL ######
    ###################################

    def get_scores(self):
        ''' Retrieve single and pair scores'''
        return {"single_scores": self.mentions_single_scores,
                "pair_scores": self.mentions_pairs_scores}

    def get_clusters(self, remove_singletons=True, use_no_coref_list=True):
        ''' Retrieve cleaned clusters'''
        clusters = self.clusters
        remove_id = []
        if use_no_coref_list:
            for key, mentions in clusters.items():
                cleaned_list = []
                for mention_idx in mentions:
                    mention = self.data.mentions[mention_idx]
                    if mention.lower_ in NO_COREF_LIST:
                        cleaned_list.append(mention_idx)
                        self.mention_to_cluster[mention_idx] = None
                clusters[key] = cleaned_list
            # Also clean up keys so we can build coref chains in self.get_most_representative
            added = {}
            for key, mentions in clusters.items():
                if self.data.mentions[key].lower_ in NO_COREF_LIST:
                    remove_id.append(key)
                    self.mention_to_cluster[key] = None
                    if mentions:
                        added[mentions[0]] = mentions
            clusters.update(added)
        if remove_singletons:
            for key, mentions in clusters.items():
                if len(mentions) == 1:
                    remove_id.append(key)
                    self.mention_to_cluster[key] = None
        for rem in remove_id:
            del clusters[rem]

        return clusters

    def get_most_representative(self, last_utterances_added=True, use_no_coref_list=True):
        '''
        Find a most representative mention for each cluster

        Return:
            Dictionnary of {original_mention: most_representative_resolved_mention, ...}
        '''
        clusters = self.get_clusters(remove_singletons=True, use_no_coref_list=use_no_coref_list)
        coreferences = {}
        for key in self.data.get_candidate_mentions(last_utterances_added=last_utterances_added):
            if self.mention_to_cluster[key] is None:
                continue
            mentions = clusters.get(self.mention_to_cluster[key], None)
            if mentions is None:
                continue
            representative = self.data.mentions[key]
            for mention_idx in mentions[1:]:
                mention = self.data.mentions[mention_idx]
                if mention.mention_type is not representative.mention_type:
                    if mention.mention_type == MENTION_TYPE["PROPER"] \
                        or (mention.mention_type == MENTION_TYPE["NOMINAL"] and
                                representative.mention_type == MENTION_TYPE["PRONOMINAL"]):
                        coreferences[self.data.mentions[key]] = mention
                        representative = mention

        return coreferences
Пример #5
0
class Coref:
    '''
    Main coreference resolution algorithm
    '''
    def __init__(self, nlp=None, greedyness=0.5, max_dist=50, max_dist_match=500, conll=None, use_no_coref_list=True, debug=False):
        self.greedyness = greedyness
        self.max_dist = max_dist
        self.max_dist_match = max_dist_match
        self.debug = debug
        
        if nlp is None:
            print("Loading spacy model")
            try:
                spacy.info('en_core_web_sm')
                model = 'en_core_web_sm'
            except IOError:
                print("No spacy 2 model detected, using spacy1 'en' model")
                model = 'en'
            nlp = spacy.load(model)
        
        model_path = os.path.join(PACKAGE_DIRECTORY, "weights/conll/" if conll is not None else "weights/")
        embed_model_path = os.path.join(PACKAGE_DIRECTORY, "weights/")
        print("loading model from", model_path)
        self.data = Data(nlp, model_path=embed_model_path, conll=conll, use_no_coref_list=use_no_coref_list, consider_speakers=conll)
        self.coref_model = Model(model_path)

        self.clusters = {}
        self.mention_to_cluster = []
        self.mentions_single_scores = {}
        self.mentions_single_features = {}
        self.mentions_pairs_scores = {}
        self.mentions_pairs_features = {}

    ###################################
    #### ENTITY CLUSTERS FUNCTIONS ####
    ###################################

    def _prepare_clusters(self):
        '''
        Clean up and prepare one cluster for each mention
        '''
        self.mention_to_cluster = list(range(len(self.data.mentions)))
        self.clusters = dict((i, [i]) for i in self.mention_to_cluster)
        self.mentions_single_scores = {}
        self.mentions_single_features = {}
        self.mentions_pairs_scores = {}
        self.mentions_pairs_features = {}
        for mention in self.mention_to_cluster:
            self.mentions_single_scores[mention] = None
            self.mentions_single_features[mention] = None
            self.mentions_pairs_scores[mention] = {}
            self.mentions_pairs_features[mention] = {}

    def _merge_coreference_clusters(self, ant_idx, mention_idx):
        '''
        Merge two clusters together
        '''
        if self.mention_to_cluster[ant_idx] == self.mention_to_cluster[mention_idx]:
            return

        remove_id = self.mention_to_cluster[ant_idx]
        keep_id = self.mention_to_cluster[mention_idx]
        for idx in self.clusters[remove_id]:
            self.mention_to_cluster[idx] = keep_id
            self.clusters[keep_id].append(idx)

        del self.clusters[remove_id]

    def display_clusters(self):
        '''
        Print clusters informations
        '''
        print(self.clusters)
        for key, mentions in self.clusters.items():
            print("cluster", key, "(", ", ".join(str(self.data[m]) for m in mentions), ")")

    ###################################
    ####### MAIN COREF FUNCTIONS ######
    ###################################

    def run_coref_on_mentions(self, mentions):
        '''
        Run the coreference model on a mentions iterator or list
        '''
        best_ant = {}
        n_ant = 0
        for mention_idx, ant_list in self.data.get_candidate_pairs(mentions, self.max_dist, self.max_dist_match):
            mention = self.data[mention_idx]
            feats_, ana_feats = self.data.get_single_mention_features(mention)
            anaphoricity_score = self.coref_model.get_single_mention_score(mention.embedding, ana_feats)
            self.mentions_single_scores[mention_idx] = anaphoricity_score
            self.mentions_single_features[mention_idx] = {"spansEmbeddings": mention.spans_embeddings_, "wordsEmbeddings": mention.words_embeddings_, "features": feats_}

            best_score = anaphoricity_score - 50 * (self.greedyness - 0.5)
            for ant_idx in ant_list:
                antecedent = self.data[ant_idx]
                feats_, pwf = self.data.get_pair_mentions_features(antecedent, mention)
                score = self.coref_model.get_pair_mentions_score(antecedent, mention, pwf)
                self.mentions_pairs_scores[mention_idx][ant_idx] = score
                self.mentions_pairs_features[mention_idx][ant_idx] = {"pairFeatures": feats_, "antecedentSpansEmbeddings": antecedent.spans_embeddings_,
                                                                      "antecedentWordsEmbeddings": antecedent.words_embeddings_,
                                                                      "mentionSpansEmbeddings": mention.spans_embeddings_,
                                                                      "mentionWordsEmbeddings": mention.words_embeddings_ }

                if score > best_score:
                    best_score = score
                    best_ant[mention_idx] = ant_idx
            if mention_idx in best_ant:
                n_ant += 1
                self._merge_coreference_clusters(best_ant[mention_idx], mention_idx)
        return (n_ant, best_ant)

    def run_coref_on_utterances(self, last_utterances_added=False, follow_chains=True):
        ''' Run the coreference model on some utterances

        Arg:
            last_utterances_added: run the coreference model over the last utterances added to the data
            follow_chains: follow coreference chains over previous utterances
        '''
        self._prepare_clusters()
        mentions = list(self.data.get_candidate_mentions(last_utterances_added=last_utterances_added))
        n_ant, antecedents = self.run_coref_on_mentions(mentions)
        mentions = antecedents.values()
        if follow_chains and n_ant > 0:
            i = 0
            while i < MAX_FOLLOW_UP:
                i += 1
                n_ant, antecedents = self.run_coref_on_mentions(mentions)
                mentions = antecedents.values()
                if n_ant == 0:
                    break

    def one_shot_coref(self, utterances, utterances_speakers_id=None, context=None,
                       context_speakers_id=None, speakers_names=None):
        ''' Clear history, load a list of utterances and an optional context and run the coreference model on them

        Arg:
        - `utterances` : iterator or list of string corresponding to successive utterances (in a dialogue) or sentences.
            Can be a single string for non-dialogue text.
        - `utterances_speakers_id=None` : iterator or list of speaker id for each utterance (in the case of a dialogue).
            - if not provided, assume two speakers speaking alternatively.
            - if utterances and utterances_speaker are not of the same length padded with None
        - `context=None` : iterator or list of string corresponding to additionnal utterances/sentences sent prior to `utterances`. Coreferences are not computed for the mentions identified in `context`. The mentions in `context` are only used as possible antecedents to mentions in `uterrance`. Reduce the computations when we are only interested in resolving coreference in the last sentences/utterances.
        - `context_speakers_id=None` : same as `utterances_speakers_id` for `context`. 
        - `speakers_names=None` : dictionnary of list of acceptable speaker names (strings) for speaker_id in `utterances_speakers_id` and `context_speakers_id`
        Return:
            clusters of entities with coreference resolved
        '''
        self.data.set_utterances(context, context_speakers_id, speakers_names)
        self.continuous_coref(utterances, utterances_speakers_id, speakers_names)
        return self.get_clusters()

    def continuous_coref(self, utterances, utterances_speakers_id=None, speakers_names=None):
        '''
        Only resolve coreferences for the mentions in the utterances
        (but use the mentions in previously loaded utterances as possible antecedents)
        Arg:
            utterances : iterator or list of string corresponding to successive utterances
            utterances_speaker : iterator or list of speaker id for each utterance.
                If not provided, assume two speakers speaking alternatively.
                if utterances and utterances_speaker are not of the same length padded with None
            speakers_names : dictionnary of list of acceptable speaker names for each speaker id
        Return:
            clusters of entities with coreference resolved
        '''
        self.data.add_utterances(utterances, utterances_speakers_id, speakers_names)
        self.run_coref_on_utterances(last_utterances_added=True, follow_chains=True)
        return self.get_clusters()

    ###################################
    ###### INFORMATION RETRIEVAL ######
    ###################################

    def get_utterances(self, last_utterances_added=True):
        ''' Retrieve the list of parsed uterrances'''
        if last_utterances_added:
            return [self.data.utterances[idx] for idx in self.data.last_utterances_loaded]
        else:
            return self.data.utterances

    def get_resolved_utterances(self, last_utterances_added=True, use_no_coref_list=True):
        ''' Return a list of utterrances text where the '''
        coreferences = self.get_most_representative(last_utterances_added, use_no_coref_list)
        resolved_utterances = []
        for utt in self.get_utterances(last_utterances_added=last_utterances_added):
            resolved_utt = ""
            in_coref = None
            for token in utt:
                if in_coref is None:
                    for coref_original, coref_replace in coreferences.items():
                        if coref_original[0] == token:
                            in_coref = coref_original
                            resolved_utt += coref_replace.text.lower()
                            break
                    if in_coref is None:
                        resolved_utt += token.text_with_ws
                if in_coref is not None and token == in_coref[-1]:
                    resolved_utt += ' ' if token.whitespace_ and resolved_utt[-1] is not ' ' else ''
                    in_coref = None
            resolved_utterances.append(resolved_utt)
        return resolved_utterances

    def get_mentions(self):
        ''' Retrieve the list of mentions'''
        return self.data.mentions

    def get_scores(self):
        ''' Retrieve scores for single mentions and pair of mentions'''
        return {"single_scores": self.mentions_single_scores,
                "pair_scores": self.mentions_pairs_scores}

    def get_clusters(self, remove_singletons=True, use_no_coref_list=True):
        ''' Retrieve cleaned clusters'''
        clusters = self.clusters
        remove_id = []
        if use_no_coref_list:
            for key, mentions in clusters.items():
                cleaned_list = []
                for mention_idx in mentions:
                    mention = self.data.mentions[mention_idx]
                    if mention.lower_ not in NO_COREF_LIST:
                        cleaned_list.append(mention_idx)
                clusters[key] = cleaned_list
            # Also clean up keys so we can build coref chains in self.get_most_representative
            added = {}
            for key, mentions in clusters.items():
                if self.data.mentions[key].lower_ in NO_COREF_LIST:
                    remove_id.append(key)
                    self.mention_to_cluster[key] = None
                    if mentions:
                        added[mentions[0]] = mentions
            for rem in remove_id:
                del clusters[rem]
            clusters.update(added)

        if remove_singletons:
            remove_id = []
            for key, mentions in clusters.items():
                if len(mentions) == 1:
                    remove_id.append(key)
                    self.mention_to_cluster[key] = None
            for rem in remove_id:
                del clusters[rem]

        return clusters

    def get_most_representative(self, last_utterances_added=True, use_no_coref_list=True):
        '''
        Find a most representative mention for each cluster

        Return:
            Dictionnary of {original_mention: most_representative_resolved_mention, ...}
        '''
        clusters = self.get_clusters(remove_singletons=True, use_no_coref_list=use_no_coref_list)
        coreferences = {}
        for key in self.data.get_candidate_mentions(last_utterances_added=last_utterances_added):
            if self.mention_to_cluster[key] is None:
                continue
            mentions = clusters.get(self.mention_to_cluster[key], None)
            if mentions is None:
                continue
            representative = self.data.mentions[key]
            for mention_idx in mentions[1:]:
                mention = self.data.mentions[mention_idx]
                if mention.mention_type is not representative.mention_type:
                    if mention.mention_type == MENTION_TYPE["PROPER"] \
                        or (mention.mention_type == MENTION_TYPE["NOMINAL"] and
                                representative.mention_type == MENTION_TYPE["PRONOMINAL"]):
                        coreferences[self.data.mentions[key]] = mention
                        representative = mention

        return coreferences
Пример #6
0
class Coref:
    '''
    Main coreference resolution algorithm
    '''
    def __init__(self,
                 nlp=None,
                 spacy_model=SPACY_MODEL,
                 greedyness=GREEDYNESS,
                 max_dist=MAX_DIST,
                 max_dist_match=MAX_DIST_MATCH,
                 max_follow_up=MAX_FOLLOW_UP,
                 conll=None,
                 use_no_coref_list=True,
                 debug=False):
        self.greedyness = greedyness
        self.max_dist = max_dist
        self.max_dist_match = max_dist_match
        self.max_follow_up = max_follow_up
        self.debug = debug

        print(
            f'PARAMS: greedyness: {greedyness}, max_dist: {max_dist}, '
            f'max_dist_match: {max_dist_match}, max_follow_up {max_follow_up}')

        if nlp is None:
            print("Loading spacy model")
            nlp = spacy.load(spacy_model)

        model_path = os.path.join(
            PACKAGE_DIRECTORY,
            "weights/conll/" if conll is not None else "weights/")
        embed_model_path = os.path.join(PACKAGE_DIRECTORY, "weights/")
        print("loading model from", model_path)
        self.data = Data(nlp,
                         model_path=embed_model_path,
                         conll=conll,
                         use_no_coref_list=use_no_coref_list,
                         consider_speakers=conll)
        self.coref_model = Model(model_path)

        self.clusters = {}
        self.mention_to_cluster = []
        self.mentions_single_scores = {}
        self.mentions_single_features = {}
        self.mentions_pairs_scores = {}
        self.mentions_pairs_features = {}

    ###################################
    #### ENTITY CLUSTERS FUNCTIONS ####
    ###################################

    def _prepare_clusters(self):
        '''
        Clean up and prepare one cluster for each mention
        '''
        self.mention_to_cluster = list(range(len(self.data.mentions)))
        self.clusters = dict((i, [i]) for i in self.mention_to_cluster)
        self.mentions_single_scores = {}
        self.mentions_single_features = {}
        self.mentions_pairs_scores = {}
        self.mentions_pairs_features = {}
        for mention in self.mention_to_cluster:
            self.mentions_single_scores[mention] = None
            self.mentions_single_features[mention] = None
            self.mentions_pairs_scores[mention] = {}
            self.mentions_pairs_features[mention] = {}

    def _merge_coreference_clusters(self, ant_idx, mention_idx):
        '''
        Merge two clusters together
        '''
        if self.mention_to_cluster[ant_idx] == self.mention_to_cluster[
                mention_idx]:
            return

        remove_id = self.mention_to_cluster[ant_idx]
        keep_id = self.mention_to_cluster[mention_idx]
        for idx in self.clusters[remove_id]:
            self.mention_to_cluster[idx] = keep_id
            self.clusters[keep_id].append(idx)

        del self.clusters[remove_id]

    def display_clusters(self):
        '''
        Print clusters informations
        '''
        print(self.clusters)
        for key, mentions in list(self.clusters.items()):
            print("cluster", key, "(",
                  ", ".join(str(self.data[m]) for m in mentions), ")")

    ###################################
    ####### MAIN COREF FUNCTIONS ######
    ###################################

    def run_coref_on_mentions(self, mentions):
        '''
        Run the coreference model on a mentions iterator or list
        '''
        best_ant = {}
        n_ant = 0
        for mention_idx, ant_list in self.data.get_candidate_pairs(
                mentions, self.max_dist, self.max_dist_match):
            mention = self.data[mention_idx]
            feats_, ana_feats = self.data.get_single_mention_features(mention)
            anaphoricity_score = self.coref_model.get_single_mention_score(
                mention.embedding, ana_feats)
            self.mentions_single_scores[mention_idx] = anaphoricity_score
            self.mentions_single_features[mention_idx] = {
                "spansEmbeddings": mention.spans_embeddings_,
                "wordsEmbeddings": mention.words_embeddings_,
                "features": feats_
            }

            best_score = anaphoricity_score - 50 * (self.greedyness - 0.5)
            for ant_idx in ant_list:
                antecedent = self.data[ant_idx]
                feats_, pwf = self.data.get_pair_mentions_features(
                    antecedent, mention)
                score = self.coref_model.get_pair_mentions_score(
                    antecedent, mention, pwf)
                self.mentions_pairs_scores[mention_idx][ant_idx] = score
                self.mentions_pairs_features[mention_idx][ant_idx] = {
                    "pairFeatures": feats_,
                    "antecedentSpansEmbeddings": antecedent.spans_embeddings_,
                    "antecedentWordsEmbeddings": antecedent.words_embeddings_,
                    "mentionSpansEmbeddings": mention.spans_embeddings_,
                    "mentionWordsEmbeddings": mention.words_embeddings_
                }

                if score > best_score:
                    best_score = score
                    best_ant[mention_idx] = ant_idx
            if mention_idx in best_ant:
                n_ant += 1
                self._merge_coreference_clusters(best_ant[mention_idx],
                                                 mention_idx)
        return (n_ant, best_ant)

    def run_coref_on_utterances(self,
                                last_utterances_added=False,
                                follow_chains=True):
        ''' Run the coreference model on some utterances

        Arg:
            last_utterances_added: run the coreference model over the last utterances added to the data
            follow_chains: follow coreference chains over previous utterances
        '''
        self._prepare_clusters()
        mentions = list(
            self.data.get_candidate_mentions(
                last_utterances_added=last_utterances_added))
        n_ant, antecedents = self.run_coref_on_mentions(mentions)
        mentions = list(antecedents.values())
        if follow_chains and n_ant > 0:
            i = 0
            while i < self.max_follow_up:
                i += 1
                n_ant, antecedents = self.run_coref_on_mentions(mentions)
                mentions = list(antecedents.values())
                if n_ant == 0:
                    break

    def one_shot_coref(self,
                       utterances,
                       utterances_speakers_id=None,
                       context=None,
                       context_speakers_id=None,
                       speakers_names=None):
        ''' Clear history, load a list of utterances and an optional context and run the coreference model on them

        Arg:
        - `utterances` : iterator or list of string corresponding to successive utterances (in a dialogue) or sentences.
            Can be a single string for non-dialogue text.
        - `utterances_speakers_id=None` : iterator or list of speaker id for each utterance (in the case of a dialogue).
            - if not provided, assume two speakers speaking alternatively.
            - if utterances and utterances_speaker are not of the same length padded with None
        - `context=None` : iterator or list of string corresponding to additionnal utterances/sentences sent prior to `utterances`. Coreferences are not computed for the mentions identified in `context`. The mentions in `context` are only used as possible antecedents to mentions in `uterrance`. Reduce the computations when we are only interested in resolving coreference in the last sentences/utterances.
        - `context_speakers_id=None` : same as `utterances_speakers_id` for `context`. 
        - `speakers_names=None` : dictionnary of list of acceptable speaker names (strings) for speaker_id in `utterances_speakers_id` and `context_speakers_id`
        Return:
            clusters of entities with coreference resolved
        '''
        self.data.set_utterances(context, context_speakers_id, speakers_names)
        self.continuous_coref(utterances, utterances_speakers_id,
                              speakers_names)
        return self.get_clusters()

    def continuous_coref(self,
                         utterances,
                         utterances_speakers_id=None,
                         speakers_names=None):
        '''
        Only resolve coreferences for the mentions in the utterances
        (but use the mentions in previously loaded utterances as possible antecedents)
        Arg:
            utterances : iterator or list of string corresponding to successive utterances
            utterances_speaker : iterator or list of speaker id for each utterance.
                If not provided, assume two speakers speaking alternatively.
                if utterances and utterances_speaker are not of the same length padded with None
            speakers_names : dictionnary of list of acceptable speaker names for each speaker id
        Return:
            clusters of entities with coreference resolved
        '''
        self.data.add_utterances(utterances, utterances_speakers_id,
                                 speakers_names)
        self.run_coref_on_utterances(last_utterances_added=True,
                                     follow_chains=True)
        return self.get_clusters()

    ###################################
    ###### INFORMATION RETRIEVAL ######
    ###################################

    def get_utterances(self, last_utterances_added=True):
        ''' Retrieve the list of parsed uterrances'''
        if last_utterances_added:
            return [
                self.data.utterances[idx]
                for idx in self.data.last_utterances_loaded
            ]
        else:
            return self.data.utterances

    def get_resolved_utterances(self,
                                last_utterances_added=True,
                                use_no_coref_list=True):
        ''' Return a list of utterrances text where the '''
        coreferences = self.get_most_representative(last_utterances_added,
                                                    use_no_coref_list)
        resolved_utterances = []
        for utt in self.get_utterances(
                last_utterances_added=last_utterances_added):
            resolved_utt = ""
            in_coref = None
            for token in utt:
                if in_coref is None:
                    for coref_original, coref_replace in list(
                            coreferences.items()):
                        if coref_original[0] == token:
                            in_coref = coref_original
                            resolved_utt += coref_replace.text.lower()
                            break
                    if in_coref is None:
                        resolved_utt += token.text_with_ws
                if in_coref is not None and token == in_coref[-1]:
                    resolved_utt += ' ' if token.whitespace_ and resolved_utt[
                        -1] is not ' ' else ''
                    in_coref = None
            resolved_utterances.append(resolved_utt)
        return resolved_utterances

    def get_mentions(self):
        ''' Retrieve the list of mentions'''
        return self.data.mentions

    def get_scores(self):
        ''' Retrieve scores for single mentions and pair of mentions'''
        return {
            "single_scores": self.mentions_single_scores,
            "pair_scores": self.mentions_pairs_scores
        }

    def get_clusters(self, remove_singletons=True, use_no_coref_list=True):
        ''' Retrieve cleaned clusters'''
        clusters = self.clusters
        remove_id = []
        if use_no_coref_list:
            for key, mentions in list(clusters.items()):
                cleaned_list = []
                for mention_idx in mentions:
                    mention = self.data.mentions[mention_idx]
                    if mention.lower_ not in NO_COREF_LIST:
                        cleaned_list.append(mention_idx)
                clusters[key] = cleaned_list
            # Also clean up keys so we can build coref chains in self.get_most_representative
            added = {}
            for key, mentions in list(clusters.items()):
                if self.data.mentions[key].lower_ in NO_COREF_LIST:
                    remove_id.append(key)
                    self.mention_to_cluster[key] = None
                    if mentions:
                        added[mentions[0]] = mentions
            for rem in remove_id:
                del clusters[rem]
            clusters.update(added)

        if remove_singletons:
            remove_id = []
            for key, mentions in list(clusters.items()):
                if len(mentions) == 1:
                    remove_id.append(key)
                    self.mention_to_cluster[key] = None
            for rem in remove_id:
                del clusters[rem]

        return clusters

    def get_most_representative(self,
                                last_utterances_added=True,
                                use_no_coref_list=True):
        '''
        Find a most representative mention for each cluster

        Return:
            Dictionnary of {original_mention: most_representative_resolved_mention, ...}
        '''
        clusters = self.get_clusters(remove_singletons=True,
                                     use_no_coref_list=use_no_coref_list)
        coreferences = {}
        for key in self.data.get_candidate_mentions(
                last_utterances_added=last_utterances_added):
            if self.mention_to_cluster[key] is None:
                continue
            mentions = clusters.get(self.mention_to_cluster[key], None)
            if mentions is None:
                continue
            representative = self.data.mentions[key]
            for mention_idx in mentions[1:]:
                mention = self.data.mentions[mention_idx]
                if mention.mention_type is not representative.mention_type:
                    if mention.mention_type == MENTION_TYPE["PROPER"] \
                            or (mention.mention_type == MENTION_TYPE["NOMINAL"] and
                                representative.mention_type == MENTION_TYPE[
                                    "PRONOMINAL"]):
                        coreferences[self.data.mentions[key]] = mention
                        representative = mention

        return coreferences