Python Data примеры использования

Язык программирования: Python

Пространство имен/Пакет: neuralcoref.data

Класс/Тип: Data

Примеров на hotexamples.com: 6

Python Data - 6 примеров найдено. Это лучшие примеры Python кода для neuralcoref.data.Data, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

Data(2)

add_utterances(2)

get_candidate_mentions(2)

get_candidate_pairs(2)

get_pair_mentions_features(2)

get_single_mention_features(2)

set_utterances(2)

Пример #1

Показать файл

    def __init__(self, nlp=None, greedyness=0.5, max_dist=50, max_dist_match=500, conll=None, use_no_coref_list=True, debug=False):
        '''
        Store the tuning parameters, load a spaCy pipeline when none is given
        and build the Data and Model helpers

        Arg:
            nlp: spaCy pipeline; when None one is loaded here
            greedyness, max_dist, max_dist_match, debug: stored on the instance
            conll: when not None the Model weights are read from "weights/conll/"
                instead of "weights/"; also forwarded to Data as both `conll`
                and `consider_speakers`
            use_no_coref_list: forwarded to Data
        '''
        self.greedyness, self.max_dist = greedyness, max_dist
        self.max_dist_match, self.debug = max_dist_match, debug

        if nlp is None:
            print("Loading spacy model")
            # Probe for the preferred model first; an IOError from the probe
            # selects the fallback named in the message below.
            spacy_model_name = 'en_core_web_sm'
            try:
                spacy.info(spacy_model_name)
            except IOError:
                print("No spacy 2 model detected, using spacy1 'en' model")
                spacy_model_name = 'en'
            nlp = spacy.load(spacy_model_name)

        # Data always reads from the base weights directory; only the Model
        # switches to the conll sub-directory.
        weights_dir = os.path.join(PACKAGE_DIRECTORY, "weights/")
        if conll is None:
            scorer_dir = weights_dir
        else:
            scorer_dir = os.path.join(PACKAGE_DIRECTORY, "weights/conll/")
        print("loading model from", scorer_dir)
        self.data = Data(nlp, model_path=weights_dir, conll=conll,
                         use_no_coref_list=use_no_coref_list, consider_speakers=conll)
        self.coref_model = Model(scorer_dir)

        # Empty resolution state
        self.clusters = {}
        self.mention_to_cluster = []
        self.mentions_single_scores, self.mentions_single_features = {}, {}
        self.mentions_pairs_scores, self.mentions_pairs_features = {}, {}

Пример #2

Показать файл

Файл: algorithm.py Проект: lxwithgod/neuralcoref

    def __init__(self, nlp=None, greedyness=0.5, max_dist=50, max_dist_match=500, conll=None, use_no_coref_list=True, debug=False):
        '''
        Keep the tuning parameters, make sure a spaCy pipeline is available
        and create the Data and Model objects

        Arg:
            nlp: spaCy pipeline; loaded here when None
            greedyness, max_dist, max_dist_match, debug: kept as attributes
            conll: selects the "weights/conll/" directory for the Model when
                not None; passed to Data as `conll` and `consider_speakers`
            use_no_coref_list: passed to Data
        '''
        self.greedyness, self.max_dist = greedyness, max_dist
        self.max_dist_match, self.debug = max_dist_match, debug

        if nlp is None:
            print("Loading spacy model")
            # spacy.info() raising IOError switches to the fallback model name
            model_name = 'en_core_web_sm'
            try:
                spacy.info(model_name)
            except IOError:
                print("No spacy 2 model detected, using spacy1 'en' model")
                model_name = 'en'
            nlp = spacy.load(model_name)

        # The embeddings directory handed to Data does not depend on conll
        embeddings_dir = os.path.join(PACKAGE_DIRECTORY, "weights/")
        if conll is None:
            weights_dir = embeddings_dir
        else:
            weights_dir = os.path.join(PACKAGE_DIRECTORY, "weights/conll/")
        print("loading model from", weights_dir)
        self.data = Data(nlp, model_path=embeddings_dir, conll=conll,
                         use_no_coref_list=use_no_coref_list, consider_speakers=conll)
        self.coref_model = Model(weights_dir)

        # Nothing resolved yet
        self.clusters = {}
        self.mention_to_cluster = []
        self.mentions_single_scores, self.mentions_single_features = {}, {}
        self.mentions_pairs_scores, self.mentions_pairs_features = {}, {}

Пример #3

Показать файл

Файл: algorithm.py Проект: unendin/neuralcoref

    def __init__(self,
                 nlp=None,
                 spacy_model=SPACY_MODEL,
                 greedyness=GREEDYNESS,
                 max_dist=MAX_DIST,
                 max_dist_match=MAX_DIST_MATCH,
                 max_follow_up=MAX_FOLLOW_UP,
                 conll=None,
                 use_no_coref_list=True,
                 debug=False):
        '''
        Store the tuning parameters, print them, load a spaCy pipeline when
        none is given and build the Data and Model helpers

        Arg:
            nlp: spaCy pipeline; when None, `spacy_model` is loaded
            spacy_model: model name handed to spacy.load when nlp is None
            greedyness, max_dist, max_dist_match, max_follow_up, debug:
                stored on the instance
            conll: when not None the Model weights are read from "weights/conll/"
                instead of "weights/"; also forwarded to Data as both `conll`
                and `consider_speakers`
            use_no_coref_list: forwarded to Data
        '''
        self.greedyness, self.max_dist = greedyness, max_dist
        self.max_dist_match, self.max_follow_up = max_dist_match, max_follow_up
        self.debug = debug

        params_report = (
            f'PARAMS: greedyness: {greedyness}, max_dist: {max_dist}, '
            f'max_dist_match: {max_dist_match}, max_follow_up {max_follow_up}')
        print(params_report)

        if nlp is None:
            print("Loading spacy model")
            nlp = spacy.load(spacy_model)

        # Data always reads from the base weights directory; only the Model
        # switches to the conll sub-directory.
        weights_dir = os.path.join(PACKAGE_DIRECTORY, "weights/")
        if conll is None:
            scorer_dir = weights_dir
        else:
            scorer_dir = os.path.join(PACKAGE_DIRECTORY, "weights/conll/")
        print("loading model from", scorer_dir)
        data_options = dict(model_path=weights_dir,
                            conll=conll,
                            use_no_coref_list=use_no_coref_list,
                            consider_speakers=conll)
        self.data = Data(nlp, **data_options)
        self.coref_model = Model(scorer_dir)

        # Empty resolution state
        self.clusters = {}
        self.mention_to_cluster = []
        self.mentions_single_scores, self.mentions_single_features = {}, {}
        self.mentions_pairs_scores, self.mentions_pairs_features = {}, {}

Пример #4

Показать файл

class Coref:
    '''
    Main coreference resolution algorithm

    Wraps a Data object (utterances and mentions) and a Model object (scores)
    and keeps the clusters of mentions built from the model scores.
    '''
    def __init__(self, nlp=None, greedyness=0.5, max_dist=50, max_dist_match=500, conll=None, use_no_coref_list=True, debug=False):
        '''
        Store the tuning parameters, load a spaCy pipeline when none is given
        and build the Data and Model helpers

        Arg:
            nlp: spaCy pipeline; when None one is loaded here
            greedyness: shifts the score an antecedent has to beat in
                run_coref_on_mentions (a higher value lowers the bar)
            max_dist, max_dist_match: handed to Data.get_candidate_pairs
            conll: when not None the Model weights are read from "weights/conll/"
                instead of "weights/"; also forwarded to Data as both `conll`
                and `consider_speakers`
            use_no_coref_list: forwarded to Data
            debug: stored on the instance
        '''
        self.greedyness = greedyness
        self.max_dist = max_dist
        self.max_dist_match = max_dist_match
        self.debug = debug

        if nlp is None:
            print("Loading spacy model")
            try:
                spacy.info('en_core_web_sm')
                model = 'en_core_web_sm'
            except IOError:
                print("No spacy 2 model detected, using spacy1 'en' model")
                model = 'en'
            nlp = spacy.load(model)

        model_path = os.path.join(PACKAGE_DIRECTORY, "weights/conll/" if conll is not None else "weights/")
        embed_model_path = os.path.join(PACKAGE_DIRECTORY, "weights/")
        print("loading model from", model_path)
        self.data = Data(nlp, model_path=embed_model_path, conll=conll, use_no_coref_list=use_no_coref_list, consider_speakers=conll)
        self.coref_model = Model(model_path)

        self.clusters = {}
        self.mention_to_cluster = []
        self.mentions_single_scores = {}
        self.mentions_single_features = {}
        self.mentions_pairs_scores = {}
        self.mentions_pairs_features = {}

    ###################################
    #### ENTITY CLUSTERS FUNCTIONS ####
    ###################################

    def _prepare_clusters(self):
        '''
        Clean up and prepare one cluster for each mention

        Every mention index of self.data.mentions starts in its own cluster
        and all score/feature caches are reset.
        '''
        self.mention_to_cluster = list(range(len(self.data.mentions)))
        self.clusters = {i: [i] for i in self.mention_to_cluster}
        self.mentions_single_scores = {}
        self.mentions_single_features = {}
        self.mentions_pairs_scores = {}
        self.mentions_pairs_features = {}
        for mention in self.mention_to_cluster:
            self.mentions_single_scores[mention] = None
            self.mentions_single_features[mention] = None
            self.mentions_pairs_scores[mention] = {}
            self.mentions_pairs_features[mention] = {}

    def _merge_coreference_clusters(self, ant_idx, mention_idx):
        '''
        Merge two clusters together

        The cluster of `ant_idx` is emptied into the cluster of `mention_idx`
        and then deleted. Nothing happens when both are already in the same cluster.
        '''
        if self.mention_to_cluster[ant_idx] == self.mention_to_cluster[mention_idx]:
            return

        remove_id = self.mention_to_cluster[ant_idx]
        keep_id = self.mention_to_cluster[mention_idx]
        for idx in self.clusters[remove_id]:
            self.mention_to_cluster[idx] = keep_id
            self.clusters[keep_id].append(idx)

        del self.clusters[remove_id]

    def display_clusters(self):
        '''
        Print clusters information
        '''
        print(self.clusters)
        for key, mentions in self.clusters.items():
            print("cluster", key, "(", ", ".join(str(self.data[m]) for m in mentions), ")")

    ###################################
    ####### MAIN COREF FUNCTIONS ######
    ###################################

    def run_coref_on_mentions(self, mentions):
        '''
        Run the coreference model on a mentions iterator or list

        For every (mention, candidate antecedents) pair produced by
        Data.get_candidate_pairs, the scores and features are cached and the
        mention is merged with its best antecedent when one beats the
        greedyness-adjusted single-mention score.

        Return:
            (number of mentions that got an antecedent, {mention_idx: antecedent_idx})
        '''
        best_ant = {}
        n_ant = 0
        for mention_idx, ant_list in self.data.get_candidate_pairs(mentions, self.max_dist, self.max_dist_match):
            mention = self.data[mention_idx]
            feats_, ana_feats = self.data.get_single_mention_features(mention)
            anaphoricity_score = self.coref_model.get_single_mention_score(mention.embedding, ana_feats)
            self.mentions_single_scores[mention_idx] = anaphoricity_score
            self.mentions_single_features[mention_idx] = {"spansEmbeddings": mention.spans_embeddings_, "wordsEmbeddings": mention.words_embeddings_, "features": feats_}

            # Score to beat: the "no antecedent" score, lowered as greedyness rises above 0.5
            best_score = anaphoricity_score - 50 * (self.greedyness - 0.5)
            for ant_idx in ant_list:
                antecedent = self.data[ant_idx]
                feats_, pwf = self.data.get_pair_mentions_features(antecedent, mention)
                score = self.coref_model.get_pair_mentions_score(antecedent, mention, pwf)
                self.mentions_pairs_scores[mention_idx][ant_idx] = score
                self.mentions_pairs_features[mention_idx][ant_idx] = {"pairFeatures": feats_, "antecedentSpansEmbeddings": antecedent.spans_embeddings_,
                                                                      "antecedentWordsEmbeddings": antecedent.words_embeddings_,
                                                                      "mentionSpansEmbeddings": mention.spans_embeddings_,
                                                                      "mentionWordsEmbeddings": mention.words_embeddings_ }

                if score > best_score:
                    best_score = score
                    best_ant[mention_idx] = ant_idx
            if mention_idx in best_ant:
                n_ant += 1
                self._merge_coreference_clusters(best_ant[mention_idx], mention_idx)
        return (n_ant, best_ant)

    def run_coref_on_utterances(self, last_utterances_added=False, follow_chains=True):
        ''' Run the coreference model on some utterances

        Arg:
            last_utterances_added: run the coreference model over the last utterances added to the data
            follow_chains: follow coreference chains over previous utterances
                (the antecedents found are resolved in turn, at most MAX_FOLLOW_UP times)
        '''
        self._prepare_clusters()
        mentions = list(self.data.get_candidate_mentions(last_utterances_added=last_utterances_added))
        n_ant, antecedents = self.run_coref_on_mentions(mentions)
        mentions = antecedents.values()
        if follow_chains and n_ant > 0:
            i = 0
            while i < MAX_FOLLOW_UP:
                i += 1
                n_ant, antecedents = self.run_coref_on_mentions(mentions)
                mentions = antecedents.values()
                if n_ant == 0:
                    break

    def one_shot_coref(self, utterances, utterances_speakers_id=None, context=None,
                       context_speakers_id=None, speakers_names=None):
        ''' Clear history, load a list of utterances and run the coreference model on them

        Arg:
            utterances : iterator or list of string corresponding to successive utterances
            utterances_speakers_id : iterator or list of speaker id for each utterance.
                If not provided, assume two speakers speaking alternatively.
                if utterances and utterances_speakers_id are not of the same length padded with None
            context : same as utterances but coreferences are not computed for this,
                      only used as possible antecedent to utterances mentions
            context_speakers_id : same as utterances_speakers_id, for context
            speakers_names : dictionary of list of acceptable speaker names for each speaker id
        Return:
            clusters of entities with coreference resolved
        '''
        self.data.set_utterances(context, context_speakers_id, speakers_names)
        self.continuous_coref(utterances, utterances_speakers_id, speakers_names)
        return self.get_clusters()

    def continuous_coref(self, utterances, utterances_speakers_id=None, speakers_names=None):
        '''
        Same as one-shot coref but don't clear the history.
        Only resolve coreferences for the mentions in the utterances
        (but use the mentions in previously loaded utterances as possible antecedents)

        Return:
            clusters of entities with coreference resolved
        '''
        self.data.add_utterances(utterances, utterances_speakers_id, speakers_names)
        self.run_coref_on_utterances(last_utterances_added=True, follow_chains=True)
        return self.get_clusters()

    ###################################
    ###### INFORMATION RETRIEVAL ######
    ###################################

    def get_scores(self):
        ''' Retrieve single and pair scores'''
        return {"single_scores": self.mentions_single_scores,
                "pair_scores": self.mentions_pairs_scores}

    def get_clusters(self, remove_singletons=True, use_no_coref_list=True):
        ''' Retrieve cleaned clusters

        self.clusters is cleaned in place and returned.

        Arg:
            remove_singletons: drop the clusters left with a single mention
            use_no_coref_list: drop the mentions whose lower-cased text is in NO_COREF_LIST
        Return:
            the cleaned self.clusters dictionary {cluster key: [mention indices]}
        '''
        clusters = self.clusters
        if use_no_coref_list:
            for key, mentions in clusters.items():
                cleaned_list = []
                for mention_idx in mentions:
                    if self.data.mentions[mention_idx].lower_ in NO_COREF_LIST:
                        # Excluded mention: it no longer belongs to any cluster
                        self.mention_to_cluster[mention_idx] = None
                    else:
                        cleaned_list.append(mention_idx)
                clusters[key] = cleaned_list
            # Also clean up keys so we can build coref chains in self.get_most_representative
            remove_id = []
            added = {}
            for key, mentions in clusters.items():
                if self.data.mentions[key].lower_ in NO_COREF_LIST:
                    remove_id.append(key)
                    self.mention_to_cluster[key] = None
                    if mentions:
                        # Re-key the surviving mentions under their first member
                        added[mentions[0]] = mentions
            for rem in remove_id:
                del clusters[rem]
            clusters.update(added)

        if remove_singletons:
            # Collected on its own (after the removals above) so that no key is deleted twice
            singleton_ids = [key for key, mentions in clusters.items() if len(mentions) == 1]
            for rem in singleton_ids:
                self.mention_to_cluster[rem] = None
                del clusters[rem]

        return clusters

    def get_most_representative(self, last_utterances_added=True, use_no_coref_list=True):
        '''
        Find a most representative mention for each cluster

        A mention of another type replaces the current representative when it is
        PROPER, or when it is NOMINAL and the current representative is PRONOMINAL.

        Return:
            Dictionary of {original_mention: most_representative_resolved_mention, ...}
        '''
        clusters = self.get_clusters(remove_singletons=True, use_no_coref_list=use_no_coref_list)
        coreferences = {}
        for key in self.data.get_candidate_mentions(last_utterances_added=last_utterances_added):
            if self.mention_to_cluster[key] is None:
                continue
            mentions = clusters.get(self.mention_to_cluster[key], None)
            if mentions is None:
                continue
            representative = self.data.mentions[key]
            for mention_idx in mentions[1:]:
                mention = self.data.mentions[mention_idx]
                if mention.mention_type != representative.mention_type:
                    if mention.mention_type == MENTION_TYPE["PROPER"] \
                        or (mention.mention_type == MENTION_TYPE["NOMINAL"] and
                                representative.mention_type == MENTION_TYPE["PRONOMINAL"]):
                        coreferences[self.data.mentions[key]] = mention
                        representative = mention

        return coreferences

Пример #5

Показать файл

Файл: algorithm.py Проект: lxwithgod/neuralcoref

class Coref:
    '''
    Main coreference resolution algorithm

    Wraps a Data object (utterances and mentions) and a Model object (scores)
    and keeps the clusters of mentions built from the model scores.
    '''
    def __init__(self, nlp=None, greedyness=0.5, max_dist=50, max_dist_match=500, conll=None, use_no_coref_list=True, debug=False):
        '''
        Store the tuning parameters, load a spaCy pipeline when none is given
        and build the Data and Model helpers

        Arg:
            nlp: spaCy pipeline; when None one is loaded here
            greedyness: shifts the score an antecedent has to beat in
                run_coref_on_mentions (a higher value lowers the bar)
            max_dist, max_dist_match: handed to Data.get_candidate_pairs
            conll: when not None the Model weights are read from "weights/conll/"
                instead of "weights/"; also forwarded to Data as both `conll`
                and `consider_speakers`
            use_no_coref_list: forwarded to Data
            debug: stored on the instance
        '''
        self.greedyness = greedyness
        self.max_dist = max_dist
        self.max_dist_match = max_dist_match
        self.debug = debug

        if nlp is None:
            print("Loading spacy model")
            try:
                spacy.info('en_core_web_sm')
                model = 'en_core_web_sm'
            except IOError:
                print("No spacy 2 model detected, using spacy1 'en' model")
                model = 'en'
            nlp = spacy.load(model)

        model_path = os.path.join(PACKAGE_DIRECTORY, "weights/conll/" if conll is not None else "weights/")
        embed_model_path = os.path.join(PACKAGE_DIRECTORY, "weights/")
        print("loading model from", model_path)
        self.data = Data(nlp, model_path=embed_model_path, conll=conll, use_no_coref_list=use_no_coref_list, consider_speakers=conll)
        self.coref_model = Model(model_path)

        self.clusters = {}
        self.mention_to_cluster = []
        self.mentions_single_scores = {}
        self.mentions_single_features = {}
        self.mentions_pairs_scores = {}
        self.mentions_pairs_features = {}

    ###################################
    #### ENTITY CLUSTERS FUNCTIONS ####
    ###################################

    def _prepare_clusters(self):
        '''
        Clean up and prepare one cluster for each mention

        Every mention index of self.data.mentions starts in its own cluster
        and all score/feature caches are reset.
        '''
        self.mention_to_cluster = list(range(len(self.data.mentions)))
        self.clusters = {i: [i] for i in self.mention_to_cluster}
        self.mentions_single_scores = {}
        self.mentions_single_features = {}
        self.mentions_pairs_scores = {}
        self.mentions_pairs_features = {}
        for mention in self.mention_to_cluster:
            self.mentions_single_scores[mention] = None
            self.mentions_single_features[mention] = None
            self.mentions_pairs_scores[mention] = {}
            self.mentions_pairs_features[mention] = {}

    def _merge_coreference_clusters(self, ant_idx, mention_idx):
        '''
        Merge two clusters together

        The cluster of `ant_idx` is emptied into the cluster of `mention_idx`
        and then deleted. Nothing happens when both are already in the same cluster.
        '''
        if self.mention_to_cluster[ant_idx] == self.mention_to_cluster[mention_idx]:
            return

        remove_id = self.mention_to_cluster[ant_idx]
        keep_id = self.mention_to_cluster[mention_idx]
        for idx in self.clusters[remove_id]:
            self.mention_to_cluster[idx] = keep_id
            self.clusters[keep_id].append(idx)

        del self.clusters[remove_id]

    def display_clusters(self):
        '''
        Print clusters information
        '''
        print(self.clusters)
        for key, mentions in self.clusters.items():
            print("cluster", key, "(", ", ".join(str(self.data[m]) for m in mentions), ")")

    ###################################
    ####### MAIN COREF FUNCTIONS ######
    ###################################

    def run_coref_on_mentions(self, mentions):
        '''
        Run the coreference model on a mentions iterator or list

        For every (mention, candidate antecedents) pair produced by
        Data.get_candidate_pairs, the scores and features are cached and the
        mention is merged with its best antecedent when one beats the
        greedyness-adjusted single-mention score.

        Return:
            (number of mentions that got an antecedent, {mention_idx: antecedent_idx})
        '''
        best_ant = {}
        n_ant = 0
        for mention_idx, ant_list in self.data.get_candidate_pairs(mentions, self.max_dist, self.max_dist_match):
            mention = self.data[mention_idx]
            feats_, ana_feats = self.data.get_single_mention_features(mention)
            anaphoricity_score = self.coref_model.get_single_mention_score(mention.embedding, ana_feats)
            self.mentions_single_scores[mention_idx] = anaphoricity_score
            self.mentions_single_features[mention_idx] = {"spansEmbeddings": mention.spans_embeddings_, "wordsEmbeddings": mention.words_embeddings_, "features": feats_}

            # Score to beat: the "no antecedent" score, lowered as greedyness rises above 0.5
            best_score = anaphoricity_score - 50 * (self.greedyness - 0.5)
            for ant_idx in ant_list:
                antecedent = self.data[ant_idx]
                feats_, pwf = self.data.get_pair_mentions_features(antecedent, mention)
                score = self.coref_model.get_pair_mentions_score(antecedent, mention, pwf)
                self.mentions_pairs_scores[mention_idx][ant_idx] = score
                self.mentions_pairs_features[mention_idx][ant_idx] = {"pairFeatures": feats_, "antecedentSpansEmbeddings": antecedent.spans_embeddings_,
                                                                      "antecedentWordsEmbeddings": antecedent.words_embeddings_,
                                                                      "mentionSpansEmbeddings": mention.spans_embeddings_,
                                                                      "mentionWordsEmbeddings": mention.words_embeddings_ }

                if score > best_score:
                    best_score = score
                    best_ant[mention_idx] = ant_idx
            if mention_idx in best_ant:
                n_ant += 1
                self._merge_coreference_clusters(best_ant[mention_idx], mention_idx)
        return (n_ant, best_ant)

    def run_coref_on_utterances(self, last_utterances_added=False, follow_chains=True):
        ''' Run the coreference model on some utterances

        Arg:
            last_utterances_added: run the coreference model over the last utterances added to the data
            follow_chains: follow coreference chains over previous utterances
                (the antecedents found are resolved in turn, at most MAX_FOLLOW_UP times)
        '''
        self._prepare_clusters()
        mentions = list(self.data.get_candidate_mentions(last_utterances_added=last_utterances_added))
        n_ant, antecedents = self.run_coref_on_mentions(mentions)
        mentions = antecedents.values()
        if follow_chains and n_ant > 0:
            i = 0
            while i < MAX_FOLLOW_UP:
                i += 1
                n_ant, antecedents = self.run_coref_on_mentions(mentions)
                mentions = antecedents.values()
                if n_ant == 0:
                    break

    def one_shot_coref(self, utterances, utterances_speakers_id=None, context=None,
                       context_speakers_id=None, speakers_names=None):
        ''' Clear history, load a list of utterances and an optional context and run the coreference model on them

        Arg:
        - `utterances` : iterator or list of string corresponding to successive utterances (in a dialogue) or sentences.
            Can be a single string for non-dialogue text.
        - `utterances_speakers_id=None` : iterator or list of speaker id for each utterance (in the case of a dialogue).
            - if not provided, assume two speakers speaking alternatively.
            - if utterances and utterances_speakers_id are not of the same length padded with None
        - `context=None` : iterator or list of string corresponding to additional utterances/sentences sent prior to `utterances`. Coreferences are not computed for the mentions identified in `context`. The mentions in `context` are only used as possible antecedents to mentions in `utterances`. Reduce the computations when we are only interested in resolving coreference in the last sentences/utterances.
        - `context_speakers_id=None` : same as `utterances_speakers_id` for `context`.
        - `speakers_names=None` : dictionary of list of acceptable speaker names (strings) for speaker_id in `utterances_speakers_id` and `context_speakers_id`
        Return:
            clusters of entities with coreference resolved
        '''
        self.data.set_utterances(context, context_speakers_id, speakers_names)
        self.continuous_coref(utterances, utterances_speakers_id, speakers_names)
        return self.get_clusters()

    def continuous_coref(self, utterances, utterances_speakers_id=None, speakers_names=None):
        '''
        Only resolve coreferences for the mentions in the utterances
        (but use the mentions in previously loaded utterances as possible antecedents)
        Arg:
            utterances : iterator or list of string corresponding to successive utterances
            utterances_speakers_id : iterator or list of speaker id for each utterance.
                If not provided, assume two speakers speaking alternatively.
                if utterances and utterances_speakers_id are not of the same length padded with None
            speakers_names : dictionary of list of acceptable speaker names for each speaker id
        Return:
            clusters of entities with coreference resolved
        '''
        self.data.add_utterances(utterances, utterances_speakers_id, speakers_names)
        self.run_coref_on_utterances(last_utterances_added=True, follow_chains=True)
        return self.get_clusters()

    ###################################
    ###### INFORMATION RETRIEVAL ######
    ###################################

    def get_utterances(self, last_utterances_added=True):
        ''' Retrieve the list of parsed utterances

        Arg:
            last_utterances_added: only return the utterances listed in
                self.data.last_utterances_loaded instead of all of them
        '''
        if last_utterances_added:
            return [self.data.utterances[idx] for idx in self.data.last_utterances_loaded]
        else:
            return self.data.utterances

    def get_resolved_utterances(self, last_utterances_added=True, use_no_coref_list=True):
        ''' Return a list of utterances text where each mention that has a more
        representative mention (see get_most_representative) is replaced by the
        lower-cased text of that mention
        '''
        coreferences = self.get_most_representative(last_utterances_added, use_no_coref_list)
        resolved_utterances = []
        for utt in self.get_utterances(last_utterances_added=last_utterances_added):
            resolved_utt = ""
            in_coref = None
            for token in utt:
                if in_coref is None:
                    # Does a mention to replace start on this token?
                    for coref_original, coref_replace in coreferences.items():
                        if coref_original[0] == token:
                            in_coref = coref_original
                            resolved_utt += coref_replace.text.lower()
                            break
                    if in_coref is None:
                        resolved_utt += token.text_with_ws
                if in_coref is not None and token == in_coref[-1]:
                    # Last token of the replaced mention: restore its trailing
                    # whitespace unless the text already ends with a space.
                    # endswith() compares by value and is safe on an empty string.
                    if token.whitespace_ and not resolved_utt.endswith(' '):
                        resolved_utt += ' '
                    in_coref = None
            resolved_utterances.append(resolved_utt)
        return resolved_utterances

    def get_mentions(self):
        ''' Retrieve the list of mentions'''
        return self.data.mentions

    def get_scores(self):
        ''' Retrieve scores for single mentions and pair of mentions'''
        return {"single_scores": self.mentions_single_scores,
                "pair_scores": self.mentions_pairs_scores}

    def get_clusters(self, remove_singletons=True, use_no_coref_list=True):
        ''' Retrieve cleaned clusters

        self.clusters is cleaned in place and returned.

        Arg:
            remove_singletons: drop the clusters left with a single mention
            use_no_coref_list: drop the mentions whose lower-cased text is in NO_COREF_LIST
        Return:
            the cleaned self.clusters dictionary {cluster key: [mention indices]}
        '''
        clusters = self.clusters
        remove_id = []
        if use_no_coref_list:
            for key, mentions in clusters.items():
                cleaned_list = []
                for mention_idx in mentions:
                    mention = self.data.mentions[mention_idx]
                    if mention.lower_ not in NO_COREF_LIST:
                        cleaned_list.append(mention_idx)
                clusters[key] = cleaned_list
            # Also clean up keys so we can build coref chains in self.get_most_representative
            added = {}
            for key, mentions in clusters.items():
                if self.data.mentions[key].lower_ in NO_COREF_LIST:
                    remove_id.append(key)
                    self.mention_to_cluster[key] = None
                    if mentions:
                        # Re-key the surviving mentions under their first member
                        added[mentions[0]] = mentions
            for rem in remove_id:
                del clusters[rem]
            clusters.update(added)

        if remove_singletons:
            remove_id = []
            for key, mentions in clusters.items():
                if len(mentions) == 1:
                    remove_id.append(key)
                    self.mention_to_cluster[key] = None
            for rem in remove_id:
                del clusters[rem]

        return clusters

    def get_most_representative(self, last_utterances_added=True, use_no_coref_list=True):
        '''
        Find a most representative mention for each cluster

        A mention of another type replaces the current representative when it is
        PROPER, or when it is NOMINAL and the current representative is PRONOMINAL.

        Return:
            Dictionary of {original_mention: most_representative_resolved_mention, ...}
        '''
        clusters = self.get_clusters(remove_singletons=True, use_no_coref_list=use_no_coref_list)
        coreferences = {}
        for key in self.data.get_candidate_mentions(last_utterances_added=last_utterances_added):
            if self.mention_to_cluster[key] is None:
                continue
            mentions = clusters.get(self.mention_to_cluster[key], None)
            if mentions is None:
                continue
            representative = self.data.mentions[key]
            for mention_idx in mentions[1:]:
                mention = self.data.mentions[mention_idx]
                if mention.mention_type != representative.mention_type:
                    if mention.mention_type == MENTION_TYPE["PROPER"] \
                        or (mention.mention_type == MENTION_TYPE["NOMINAL"] and
                                representative.mention_type == MENTION_TYPE["PRONOMINAL"]):
                        coreferences[self.data.mentions[key]] = mention
                        representative = mention

        return coreferences

Пример #6

Показать файл

Файл: algorithm.py Проект: unendin/neuralcoref

class Coref:
    '''
    Main coreference resolution algorithm
    '''
    def __init__(self,
                 nlp=None,
                 spacy_model=SPACY_MODEL,
                 greedyness=GREEDYNESS,
                 max_dist=MAX_DIST,
                 max_dist_match=MAX_DIST_MATCH,
                 max_follow_up=MAX_FOLLOW_UP,
                 conll=None,
                 use_no_coref_list=True,
                 debug=False):
        '''
        Store the resolution parameters, build the `Data` and `Model` helpers
        and start with empty cluster and score bookkeeping.

        Arg:
            nlp: parser handed to `Data`; when None, `spacy.load(spacy_model)` is used
            spacy_model: name passed to `spacy.load` when `nlp` is None
            greedyness: stored on the instance (used when scoring mentions)
            max_dist: stored on the instance (passed to `get_candidate_pairs`)
            max_dist_match: stored on the instance (passed to `get_candidate_pairs`)
            max_follow_up: stored on the instance (bounds the chain-following loop)
            conll: when not None, model weights are read from "weights/conll/";
                also forwarded to `Data` as `conll` and `consider_speakers`
            use_no_coref_list: forwarded to `Data`
            debug: stored on the instance
        '''
        self.greedyness = greedyness
        self.max_dist = max_dist
        self.max_dist_match = max_dist_match
        self.max_follow_up = max_follow_up
        self.debug = debug

        params_summary = (
            f'PARAMS: greedyness: {greedyness}, max_dist: {max_dist}, '
            f'max_dist_match: {max_dist_match}, max_follow_up {max_follow_up}')
        print(params_summary)

        if nlp is None:
            print("Loading spacy model")
            nlp = spacy.load(spacy_model)

        # The scoring model has dedicated weights for the conll setting,
        # while the embeddings always come from the base weights folder.
        weights_subdir = "weights/" if conll is None else "weights/conll/"
        model_path = os.path.join(PACKAGE_DIRECTORY, weights_subdir)
        embed_model_path = os.path.join(PACKAGE_DIRECTORY, "weights/")
        print("loading model from", model_path)
        self.data = Data(
            nlp,
            model_path=embed_model_path,
            conll=conll,
            use_no_coref_list=use_no_coref_list,
            consider_speakers=conll,
        )
        self.coref_model = Model(model_path)

        # Bookkeeping filled in by _prepare_clusters / run_coref_on_mentions
        self.clusters = {}
        self.mention_to_cluster = []
        self.mentions_single_scores = {}
        self.mentions_single_features = {}
        self.mentions_pairs_scores = {}
        self.mentions_pairs_features = {}

    ###################################
    #### ENTITY CLUSTERS FUNCTIONS ####
    ###################################

    def _prepare_clusters(self):
        '''
        Clean up and prepare one cluster for each mention
        '''
        self.mention_to_cluster = list(range(len(self.data.mentions)))
        self.clusters = dict((i, [i]) for i in self.mention_to_cluster)
        self.mentions_single_scores = {}
        self.mentions_single_features = {}
        self.mentions_pairs_scores = {}
        self.mentions_pairs_features = {}
        for mention in self.mention_to_cluster:
            self.mentions_single_scores[mention] = None
            self.mentions_single_features[mention] = None
            self.mentions_pairs_scores[mention] = {}
            self.mentions_pairs_features[mention] = {}

    def _merge_coreference_clusters(self, ant_idx, mention_idx):
        '''
        Merge two clusters together
        '''
        if self.mention_to_cluster[ant_idx] == self.mention_to_cluster[
                mention_idx]:
            return

        remove_id = self.mention_to_cluster[ant_idx]
        keep_id = self.mention_to_cluster[mention_idx]
        for idx in self.clusters[remove_id]:
            self.mention_to_cluster[idx] = keep_id
            self.clusters[keep_id].append(idx)

        del self.clusters[remove_id]

    def display_clusters(self):
        '''
        Print clusters informations
        '''
        print(self.clusters)
        for key, mentions in list(self.clusters.items()):
            print("cluster", key, "(",
                  ", ".join(str(self.data[m]) for m in mentions), ")")

    ###################################
    ####### MAIN COREF FUNCTIONS ######
    ###################################

    def run_coref_on_mentions(self, mentions):
        '''
        Run the coreference model on a mentions iterator or list
        '''
        best_ant = {}
        n_ant = 0
        for mention_idx, ant_list in self.data.get_candidate_pairs(
                mentions, self.max_dist, self.max_dist_match):
            mention = self.data[mention_idx]
            feats_, ana_feats = self.data.get_single_mention_features(mention)
            anaphoricity_score = self.coref_model.get_single_mention_score(
                mention.embedding, ana_feats)
            self.mentions_single_scores[mention_idx] = anaphoricity_score
            self.mentions_single_features[mention_idx] = {
                "spansEmbeddings": mention.spans_embeddings_,
                "wordsEmbeddings": mention.words_embeddings_,
                "features": feats_
            }

            best_score = anaphoricity_score - 50 * (self.greedyness - 0.5)
            for ant_idx in ant_list:
                antecedent = self.data[ant_idx]
                feats_, pwf = self.data.get_pair_mentions_features(
                    antecedent, mention)
                score = self.coref_model.get_pair_mentions_score(
                    antecedent, mention, pwf)
                self.mentions_pairs_scores[mention_idx][ant_idx] = score
                self.mentions_pairs_features[mention_idx][ant_idx] = {
                    "pairFeatures": feats_,
                    "antecedentSpansEmbeddings": antecedent.spans_embeddings_,
                    "antecedentWordsEmbeddings": antecedent.words_embeddings_,
                    "mentionSpansEmbeddings": mention.spans_embeddings_,
                    "mentionWordsEmbeddings": mention.words_embeddings_
                }

                if score > best_score:
                    best_score = score
                    best_ant[mention_idx] = ant_idx
            if mention_idx in best_ant:
                n_ant += 1
                self._merge_coreference_clusters(best_ant[mention_idx],
                                                 mention_idx)
        return (n_ant, best_ant)

    def run_coref_on_utterances(self,
                                last_utterances_added=False,
                                follow_chains=True):
        ''' Run the coreference model on some utterances

        Arg:
            last_utterances_added: run the coreference model over the last utterances added to the data
            follow_chains: follow coreference chains over previous utterances
        '''
        self._prepare_clusters()
        mentions = list(
            self.data.get_candidate_mentions(
                last_utterances_added=last_utterances_added))
        n_ant, antecedents = self.run_coref_on_mentions(mentions)
        mentions = list(antecedents.values())
        if follow_chains and n_ant > 0:
            i = 0
            while i < self.max_follow_up:
                i += 1
                n_ant, antecedents = self.run_coref_on_mentions(mentions)
                mentions = list(antecedents.values())
                if n_ant == 0:
                    break

    def one_shot_coref(self,
                       utterances,
                       utterances_speakers_id=None,
                       context=None,
                       context_speakers_id=None,
                       speakers_names=None):
        ''' Clear history, load a list of utterances and an optional context and run the coreference model on them

        Arg:
        - `utterances` : iterator or list of string corresponding to successive utterances (in a dialogue) or sentences.
            Can be a single string for non-dialogue text.
        - `utterances_speakers_id=None` : iterator or list of speaker id for each utterance (in the case of a dialogue).
            - if not provided, assume two speakers speaking alternatively.
            - if utterances and utterances_speaker are not of the same length padded with None
        - `context=None` : iterator or list of string corresponding to additionnal utterances/sentences sent prior to `utterances`. Coreferences are not computed for the mentions identified in `context`. The mentions in `context` are only used as possible antecedents to mentions in `uterrance`. Reduce the computations when we are only interested in resolving coreference in the last sentences/utterances.
        - `context_speakers_id=None` : same as `utterances_speakers_id` for `context`. 
        - `speakers_names=None` : dictionnary of list of acceptable speaker names (strings) for speaker_id in `utterances_speakers_id` and `context_speakers_id`
        Return:
            clusters of entities with coreference resolved
        '''
        self.data.set_utterances(context, context_speakers_id, speakers_names)
        self.continuous_coref(utterances, utterances_speakers_id,
                              speakers_names)
        return self.get_clusters()

    def continuous_coref(self,
                         utterances,
                         utterances_speakers_id=None,
                         speakers_names=None):
        '''
        Only resolve coreferences for the mentions in the utterances
        (but use the mentions in previously loaded utterances as possible antecedents)
        Arg:
            utterances : iterator or list of string corresponding to successive utterances
            utterances_speaker : iterator or list of speaker id for each utterance.
                If not provided, assume two speakers speaking alternatively.
                if utterances and utterances_speaker are not of the same length padded with None
            speakers_names : dictionnary of list of acceptable speaker names for each speaker id
        Return:
            clusters of entities with coreference resolved
        '''
        self.data.add_utterances(utterances, utterances_speakers_id,
                                 speakers_names)
        self.run_coref_on_utterances(last_utterances_added=True,
                                     follow_chains=True)
        return self.get_clusters()

    ###################################
    ###### INFORMATION RETRIEVAL ######
    ###################################

    def get_utterances(self, last_utterances_added=True):
        ''' Retrieve the list of parsed uterrances'''
        if last_utterances_added:
            return [
                self.data.utterances[idx]
                for idx in self.data.last_utterances_loaded
            ]
        else:
            return self.data.utterances

    def get_resolved_utterances(self,
                                last_utterances_added=True,
                                use_no_coref_list=True):
        ''' Return a list of utterrances text where the '''
        coreferences = self.get_most_representative(last_utterances_added,
                                                    use_no_coref_list)
        resolved_utterances = []
        for utt in self.get_utterances(
                last_utterances_added=last_utterances_added):
            resolved_utt = ""
            in_coref = None
            for token in utt:
                if in_coref is None:
                    for coref_original, coref_replace in list(
                            coreferences.items()):
                        if coref_original[0] == token:
                            in_coref = coref_original
                            resolved_utt += coref_replace.text.lower()
                            break
                    if in_coref is None:
                        resolved_utt += token.text_with_ws
                if in_coref is not None and token == in_coref[-1]:
                    resolved_utt += ' ' if token.whitespace_ and resolved_utt[
                        -1] is not ' ' else ''
                    in_coref = None
            resolved_utterances.append(resolved_utt)
        return resolved_utterances

    def get_mentions(self):
        ''' Retrieve the list of mentions'''
        return self.data.mentions

    def get_scores(self):
        ''' Retrieve scores for single mentions and pair of mentions'''
        return {
            "single_scores": self.mentions_single_scores,
            "pair_scores": self.mentions_pairs_scores
        }

    def get_clusters(self, remove_singletons=True, use_no_coref_list=True):
        ''' Retrieve cleaned clusters'''
        clusters = self.clusters
        remove_id = []
        if use_no_coref_list:
            for key, mentions in list(clusters.items()):
                cleaned_list = []
                for mention_idx in mentions:
                    mention = self.data.mentions[mention_idx]
                    if mention.lower_ not in NO_COREF_LIST:
                        cleaned_list.append(mention_idx)
                clusters[key] = cleaned_list
            # Also clean up keys so we can build coref chains in self.get_most_representative
            added = {}
            for key, mentions in list(clusters.items()):
                if self.data.mentions[key].lower_ in NO_COREF_LIST:
                    remove_id.append(key)
                    self.mention_to_cluster[key] = None
                    if mentions:
                        added[mentions[0]] = mentions
            for rem in remove_id:
                del clusters[rem]
            clusters.update(added)

        if remove_singletons:
            remove_id = []
            for key, mentions in list(clusters.items()):
                if len(mentions) == 1:
                    remove_id.append(key)
                    self.mention_to_cluster[key] = None
            for rem in remove_id:
                del clusters[rem]

        return clusters

    def get_most_representative(self,
                                last_utterances_added=True,
                                use_no_coref_list=True):
        '''
        Find a most representative mention for each cluster

        Return:
            Dictionnary of {original_mention: most_representative_resolved_mention, ...}
        '''
        clusters = self.get_clusters(remove_singletons=True,
                                     use_no_coref_list=use_no_coref_list)
        coreferences = {}
        for key in self.data.get_candidate_mentions(
                last_utterances_added=last_utterances_added):
            if self.mention_to_cluster[key] is None:
                continue
            mentions = clusters.get(self.mention_to_cluster[key], None)
            if mentions is None:
                continue
            representative = self.data.mentions[key]
            for mention_idx in mentions[1:]:
                mention = self.data.mentions[mention_idx]
                if mention.mention_type is not representative.mention_type:
                    if mention.mention_type == MENTION_TYPE["PROPER"] \
                            or (mention.mention_type == MENTION_TYPE["NOMINAL"] and
                                representative.mention_type == MENTION_TYPE[
                                    "PRONOMINAL"]):
                        coreferences[self.data.mentions[key]] = mention
                        representative = mention

        return coreferences