Example #1
    def __init__(self,
                 model: str = 'bert-large-uncased',
                 custom_model: PreTrainedModel = None,
                 custom_tokenizer: PreTrainedTokenizer = None,
                 hidden: Union[List[int], int] = -2,
                 reduce_option: str = 'mean',
                 sentence_handler: SentenceHandler = SentenceHandler(),
                 random_state: int = 12345,
                 hidden_concat: bool = False):
        """
        Base initializer for Bert summarizer models; new summarizer variants
        should build on this class.

        :param model: Name of a pretrained model from the transformers library.
        :param custom_model: Optional pre-trained model instance to use instead of `model`.
        :param custom_tokenizer: Optional tokenizer instance to pair with the custom model.
        :param hidden: Index (or list of indices) of the BERT hidden layer(s) to use as embeddings.
        :param reduce_option: How to reduce the BERT output into a single embedding (e.g. 'mean').
        :param sentence_handler: Handler that splits text into sentences; pass a
            CoreferenceHandler instance to enable coreference resolution.
        :param random_state: Seed applied so summarizations are reproducible.
        :param hidden_concat: If True, concatenate multiple hidden layers instead of using one.
        """
        # Seed numpy up front so anything stochastic downstream is reproducible.
        np.random.seed(random_state)
        self.random_state = random_state
        self.model = BertParent(model, custom_model, custom_tokenizer)
        self.sentence_handler = sentence_handler
        self.hidden = hidden
        self.hidden_concat = hidden_concat
        self.reduce_option = reduce_option
    def __init__(self,
                 model: str = 'bert-large-uncased',
                 custom_model: PreTrainedModel = None,
                 custom_tokenizer: PreTrainedTokenizer = None,
                 hidden: int = -2,
                 reduce_option: str = 'mean',
                 greedyness: float = 0.45,
                 language=English,
                 random_state: int = 12345):
        """
        Base initializer for Bert summarizer models with coreference support;
        new summarizer variants should build on this class.

        :param model: Name of a pretrained model from the transformers library.
        :param custom_model: Optional pre-trained model instance to use instead of `model`.
        :param custom_tokenizer: Optional tokenizer instance to pair with the custom model.
        :param hidden: Index of the BERT hidden layer to use as embeddings.
        :param reduce_option: How to reduce the BERT output into a single embedding (e.g. 'mean').
        :param greedyness: Greediness knob forwarded to the neuralcoref library.
        :param language: spaCy language class used to build the NLP pipeline.
        :param random_state: Seed applied so summarizations are reproducible.
        """
        # Seed numpy up front so anything stochastic downstream is reproducible.
        np.random.seed(random_state)
        self.random_state = random_state
        self.model = BertParent(model, custom_model, custom_tokenizer)
        self.hidden = hidden
        self.reduce_option = reduce_option
        # Build the spaCy pipeline: sentence splitting plus neural coreference.
        self.nlp = language()
        self.nlp.add_pipe(self.nlp.create_pipe('sentencizer'))
        neuralcoref.add_to_pipe(self.nlp, greedyness=greedyness)
Example #3
def PreProcessor(body, summary_length, min_length: int = 40):
    """
    Summarize *body* by clustering BERT sentence embeddings with k-means.

    :param body: Raw text to summarize.
    :param summary_length: Number of sentences to select for the summary.
    :param min_length: Minimum character length for a sentence to be considered.
    :return: The selected sentences joined into a single summary string.
    """
    model = BertParent('bert-large-uncased')
    algorithm = 'kmeans'
    sentence_handler = SentenceHandler()
    random_state = 12345

    # BUG FIX: min_length was previously hard-coded to 40 here, silently
    # ignoring the function's min_length parameter.
    sentences = sentence_handler(body, min_length=min_length, max_length=600)
    print(len(sentences))

    if sentences:
        # hidden is an (n, 1024) matrix of sentence embeddings from BERT,
        # where n is the number of sentences.
        hidden = model(sentences)
        # Cluster the embeddings and keep the sentences nearest the centroids.
        hidden_args = cluster(hidden, algorithm, random_state, summary_length)
        sentences = [sentences[j] for j in hidden_args]

    return ' '.join(sentences)
Example #4
def create_embedding_vector(centroid_words):
    """
    Embed *centroid_words* using a bert-large-uncased BertParent model.

    :param centroid_words: Text units to embed.
    :return: Hidden-state embeddings produced by the model.
    """
    bert = BertParent('bert-large-uncased')
    return bert(centroid_words)