示例#1
0
def get_weight_sentence(document):
    """
    Build both summaries for ``document`` and store them as a ``Note``.

    The document's ``text``, ``theme`` and ``language`` attributes drive the
    work. A language of ``'italian'`` selects the Italian helpers; any other
    value falls through to the Russian helpers. The extracted ``args`` are
    written back to ``document.args`` (as a string) and the document is saved
    before the sentences are ranked.

    :param document: Object exposing ``text``, ``theme``, ``language``,
        a writable ``args`` attribute and a ``save()`` method.
    :return: None. The result is persisted through ``Note(...).save()``.
    """
    text, theme, lang = document.text, document.theme, document.language
    nlp = nlp_it
    is_italian = lang == 'italian'

    # Pick the ML summarizer and extract the keyword arguments per language.
    if is_italian:
        handler = SentenceHandler(language=Italian)
        model = Summarizer(sentence_handler=handler)
        args = get_words(text, nlp)
    else:
        handler = SentenceHandler(language=Russian)
        model = Summarizer(sentence_handler=handler)
        args = get_words_for_ru(text)

    # Persist the extracted arguments before ranking the sentences.
    document.args = str(args)
    document.save()

    sentences = sent_tokenize(text)
    if is_italian:
        ranked = get_sorted_sentence(sentences, nlp, text, args, theme, lang)
    else:
        ranked = get_sorted_sentence_for_ru(sentences, text, args, theme, lang)

    algo_note = generate(sentences, ranked)
    ml_note = model(text)

    Note(
        document_id=document,
        text=algo_note,
        text_for_algo="",
        text_for_ml=ml_note,
    ).save()
示例#2
0
    def __init__(
        self,
        model: str = 'bert-large-uncased',
        custom_model: PreTrainedModel = None,
        custom_tokenizer: PreTrainedTokenizer = None,
        hidden: Union[List[int], int] = -2,
        reduce_option: str = 'mean',
        sentence_handler: SentenceHandler = SentenceHandler(),
        random_state: int = 12345,
        hidden_concat: bool = False
    ):
        """
        Main Bert Summarizer constructor; every argument is forwarded, in order, to the parent initializer.

        :param model: This parameter is associated with the inherit string parameters from the transformers library.
        :param custom_model: If you have a pre-trained model, you can add the model class here.
        :param custom_tokenizer: If you have a custom tokenizer, you can add the tokenizer here.
        :param hidden: This signifies which layer of the BERT model you would like to use as embeddings.
        :param reduce_option: Given the output of the bert model, this param determines how you want to reduce results.
        :param sentence_handler: Sentence handler instance passed through to the parent initializer.
        :param random_state: The random state to reproduce summarizations.
        :param hidden_concat: Whether or not to concat multiple hidden layers.
        """
        # Positional order matters: it mirrors the order of this signature.
        parent_args = (
            model,
            custom_model,
            custom_tokenizer,
            hidden,
            reduce_option,
            sentence_handler,
            random_state,
            hidden_concat,
        )
        super(Summarizer, self).__init__(*parent_args)
示例#3
0
    def __init__(
        self,
        model: str = 'bert-large-uncased',
        custom_model: PreTrainedModel = None,
        custom_tokenizer: PreTrainedTokenizer = None,
        hidden: Union[List[int], int] = -2,
        reduce_option: str = 'mean',
        sentence_handler: SentenceHandler = SentenceHandler(),
        random_state: int = 12345,
        hidden_concat: bool = False
    ):
        """
        Parent Bert Summarizer model; new methods should build on this class.

        Seeds numpy's global random generator with ``random_state``, wraps the
        requested model in a ``BertParent`` and stores the remaining settings
        on the instance.

        :param model: This parameter is associated with the inherit string parameters from the transformers library.
        :param custom_model: If you have a pre-trained model, you can add the model class here.
        :param custom_tokenizer: If you have a custom tokenizer, you can add the tokenizer here.
        :param hidden: This signifies which layer(s) of the BERT model you would like to use as embeddings.
        :param reduce_option: Given the output of the bert model, this param determines how you want to reduce results.
        :param sentence_handler: The handler to process sentences. To use coreference, instantiate and pass a
            CoreferenceHandler instance.
        :param random_state: The random state to reproduce summarizations.
        :param hidden_concat: Whether or not to concat multiple hidden layers.
        """
        # Seed first so anything random during model construction is reproducible.
        np.random.seed(random_state)
        self.model = BertParent(model, custom_model, custom_tokenizer)

        self.hidden, self.reduce_option = hidden, reduce_option
        self.sentence_handler = sentence_handler
        self.random_state, self.hidden_concat = random_state, hidden_concat
    def __init__(
        self,
        transformer_type: str = 'Bert',
        transformer_model_key: str = 'bert-base-uncased',
        transformer_tokenizer_key: str = None,
        hidden: int = -2,
        reduce_option: str = 'mean',
        sentence_handler: SentenceHandler = SentenceHandler(),
        random_state: int = 12345
    ):
        """
        Load a transformer model and tokenizer by type and hand them to the parent initializer.

        :param transformer_type: Key into ``self.MODEL_DICT`` selecting the model and tokenizer classes.
        :param transformer_model_key: Identifier passed to the model class's ``from_pretrained``.
        :param transformer_tokenizer_key: Identifier passed to the tokenizer class's ``from_pretrained``;
            when ``None`` the model key is used instead.
        :param hidden: Forwarded unchanged to the parent initializer.
        :param reduce_option: Forwarded unchanged to the parent initializer.
        :param sentence_handler: Forwarded unchanged to the parent initializer.
        :param random_state: Forwarded unchanged to the parent initializer.
        """
        # Best-effort registration of extra model families; any failure is
        # ignored on purpose (older transformer version).
        try:
            registry = self.MODEL_DICT
            registry['Roberta'] = (RobertaModel, RobertaTokenizer)
            registry['Albert'] = (AlbertModel, AlbertTokenizer)
            registry['Camembert'] = (CamembertModel, CamembertTokenizer)
        except Exception:
            pass

        model_cls, tokenizer_cls = self.MODEL_DICT[transformer_type]
        loaded_model = model_cls.from_pretrained(
            transformer_model_key, output_hidden_states=True
        )

        # Fall back to the model key when no dedicated tokenizer key was given.
        tokenizer_key = transformer_tokenizer_key
        if tokenizer_key is None:
            tokenizer_key = transformer_model_key
        loaded_tokenizer = tokenizer_cls.from_pretrained(tokenizer_key)

        super().__init__(
            None,
            loaded_model,
            loaded_tokenizer,
            hidden,
            reduce_option,
            sentence_handler,
            random_state,
        )
示例#5
0
    def __init__(
        self,
        sentence_handler: SentenceHandler = SentenceHandler(language=English),
        transformer_type: str = 'Bert',
        transformer_model_key: str = 'bert-base-uncased',
        transformer_tokenizer_key: str = None,
        hidden: Union[List[int], int] = -2,
        reduce_option: str = 'mean',
        random_state: int = 12345,
        hidden_concat: bool = False
    ):
        """
        Load a transformer model and tokenizer by type and hand them to the parent initializer.

        :param sentence_handler: Forwarded unchanged as the first argument of the parent initializer.
        :param transformer_type: Key into ``self.MODEL_DICT`` selecting the model and tokenizer classes.
        :param transformer_model_key: Identifier passed to the model class's ``from_pretrained``.
        :param transformer_tokenizer_key: Identifier passed to the tokenizer class's ``from_pretrained``;
            when ``None`` the model key is used instead.
        :param hidden: Forwarded unchanged to the parent initializer.
        :param reduce_option: Forwarded unchanged to the parent initializer.
        :param random_state: Forwarded unchanged to the parent initializer.
        :param hidden_concat: Forwarded unchanged to the parent initializer.
        """
        # Best-effort registration of extra model families; any failure is
        # ignored on purpose (older transformer version).
        try:
            registry = self.MODEL_DICT
            registry['Roberta'] = (RobertaModel, RobertaTokenizer)
            registry['Albert'] = (AlbertModel, AlbertTokenizer)
            registry['Camembert'] = (CamembertModel, CamembertTokenizer)
            registry['Bart'] = (BartModel, BartTokenizer)
            registry['Longformer'] = (LongformerModel, LongformerTokenizer)
        except Exception:
            pass

        model_cls, tokenizer_cls = self.MODEL_DICT[transformer_type]
        loaded_model = model_cls.from_pretrained(transformer_model_key, output_hidden_states=True)

        # Fall back to the model key when no dedicated tokenizer key was given.
        tokenizer_key = transformer_tokenizer_key
        if tokenizer_key is None:
            tokenizer_key = transformer_model_key
        loaded_tokenizer = tokenizer_cls.from_pretrained(tokenizer_key)

        # This parent takes the sentence handler first, then the model slot (None here).
        super().__init__(
            sentence_handler,
            None,
            loaded_model,
            loaded_tokenizer,
            hidden,
            reduce_option,
            random_state,
            hidden_concat,
        )
 def __init__(self,
              model='bert-large-uncased',
              custom_model: PreTrainedModel = None,
              custom_tokenizer: PreTrainedTokenizer = None,
              hidden: int = -2,
              reduce_option: str = 'mean',
              sentence_handler: SentenceHandler = SentenceHandler(),
              random_state: int = 12345):
     """
     Forward every argument, by keyword, to the parent initializer.

     :param model: Passed to the parent as ``model``.
     :param custom_model: Passed to the parent as ``custom_model``.
     :param custom_tokenizer: Passed to the parent as ``custom_tokenizer``.
     :param hidden: Passed to the parent as ``hidden``.
     :param reduce_option: Passed to the parent as ``reduce_option``.
     :param sentence_handler: Passed to the parent as ``sentence_handler``.
     :param random_state: Passed to the parent as ``random_state``.
     """
     parent_kwargs = {
         'model': model,
         'custom_model': custom_model,
         'custom_tokenizer': custom_tokenizer,
         'hidden': hidden,
         'reduce_option': reduce_option,
         'sentence_handler': sentence_handler,
         'random_state': random_state,
     }
     super(SingleModel, self).__init__(**parent_kwargs)
示例#7
0
def PreProcessor(body, summary_length, min_length: int = 40, max_length: int = 600):
    """
    Select sentences from ``body`` by clustering their BERT embeddings.

    The text is split by a ``SentenceHandler``; the resulting sentences are
    embedded with ``BertParent('bert-large-uncased')`` and passed to
    ``cluster`` with the ``'kmeans'`` algorithm and a fixed random state. The
    sentences at the indices returned by ``cluster`` are joined with single
    spaces.

    :param body: Raw text to summarize.
    :param summary_length: Forwarded to ``cluster`` as its fourth argument.
    :param min_length: Forwarded to the sentence handler as ``min_length``.
        Previously this parameter was accepted but ignored in favour of a
        hard-coded 40.
    :param max_length: Forwarded to the sentence handler as ``max_length``;
        defaults to the formerly hard-coded 600.
    :return: The selected sentences joined by spaces, or an empty string when
        the handler yields no sentences.
    """
    algorithm = 'kmeans'
    random_state = 12345

    sentence_handler = SentenceHandler()
    sentences = sentence_handler(body, min_length=min_length, max_length=max_length)
    print(len(sentences))

    if sentences:
        # Load the model only when there is something to embed, so an empty
        # input does not pay for instantiating BERT.
        model = BertParent('bert-large-uncased')
        # Per the original author's note: an n*1024 matrix of embeddings
        # returned by the BERT model, where n is the number of sentences.
        hidden = model(sentences)
        # Run k-means over the embeddings; the result indexes into `sentences`.
        hidden_args = cluster(hidden, algorithm, random_state, summary_length)
        sentences = [sentences[j] for j in hidden_args]

    return ' '.join(sentences)
示例#8
0
    def __init__(
        self,
        transformer_type: str = 'Bert',
        transformer_model_key: str = 'bert-base-uncased',
        transformer_tokenizer_key: str = None,
        hidden: Union[List[int], int] = -2,
        reduce_option: str = 'mean',
        sentence_handler: SentenceHandler = SentenceHandler(),
        random_state: int = 12345,
        hidden_concat: bool = False,
    ):
        """
        Load a transformer model and tokenizer by type and pass them to the parent initializer.

        :param transformer_type: The Transformer type, such as Bert, GPT2, DistilBert, etc.
        :param transformer_model_key: The transformer model key. This is the directory for the model.
        :param transformer_tokenizer_key: The transformer tokenizer key. This is the tokenizer directory.
            When ``None`` the model key is used for the tokenizer as well.
        :param hidden: The hidden output layers to use for the summarization.
        :param reduce_option: The reduce option, such as mean, max, min, median, etc.
        :param sentence_handler: The sentence handler class to process the raw text.
        :param random_state: The random state to use.
        :param hidden_concat: Deprecated hidden concat option.
        """
        # Best-effort registration of extra model families; any failure is
        # ignored on purpose (older transformer version).
        try:
            registry = self.MODEL_DICT
            registry['Roberta'] = (RobertaModel, RobertaTokenizer)
            registry['Albert'] = (AlbertModel, AlbertTokenizer)
            registry['Camembert'] = (CamembertModel, CamembertTokenizer)
            registry['Bart'] = (BartModel, BartTokenizer)
            registry['Longformer'] = (LongformerModel, LongformerTokenizer)
        except Exception:
            pass

        model_cls, tokenizer_cls = self.MODEL_DICT[transformer_type]
        loaded_model = model_cls.from_pretrained(
            transformer_model_key, output_hidden_states=True
        )

        # Fall back to the model key when no dedicated tokenizer key was given.
        tokenizer_key = transformer_tokenizer_key
        if tokenizer_key is None:
            tokenizer_key = transformer_model_key
        loaded_tokenizer = tokenizer_cls.from_pretrained(tokenizer_key)

        super().__init__(
            None,
            loaded_model,
            loaded_tokenizer,
            hidden,
            reduce_option,
            sentence_handler,
            random_state,
            hidden_concat,
        )
def test_num_sentences(summarizer, passage):
    """Check that asking for three sentences yields a summary that splits into exactly three."""
    summary = summarizer(passage, num_sentences=3)
    # Re-split the summary with a fresh handler and count the pieces.
    handler = SentenceHandler()
    assert len(handler.process(summary)) == 3
def sentence_handler():
    """Return a new ``SentenceHandler`` built with its default arguments."""
    handler = SentenceHandler()
    return handler