def get_weight_sentence(document):
    """Build extractive notes for *document* and persist them as a Note row."""
    text = document.text
    theme = document.theme
    lang = document.language
    nlp = nlp_it

    # Choose the language-specific summarizer and keyword extractor.
    if lang == 'italian':
        model = Summarizer(sentence_handler=SentenceHandler(language=Italian))
        args = get_words(text, nlp)
    else:
        model = Summarizer(sentence_handler=SentenceHandler(language=Russian))
        args = get_words_for_ru(text)

    # Store the extracted keywords on the document itself.
    document.args = str(args)
    document.save()

    sentences = sent_tokenize(text)
    if lang == 'italian':
        sorted_sentences = get_sorted_sentence(sentences, nlp, text, args, theme, lang)
    else:
        sorted_sentences = get_sorted_sentence_for_ru(sentences, text, args, theme, lang)

    note = generate(sentences, sorted_sentences)
    note_with_ml = model(text)
    note_item = Note(
        document_id=document,
        text=note,
        text_for_algo="",
        text_for_ml=note_with_ml,
    )
    note_item.save()
def __init__(self, model: str = 'bert-large-uncased', custom_model: PreTrainedModel = None,
             custom_tokenizer: PreTrainedTokenizer = None, hidden: Union[List[int], int] = -2,
             reduce_option: str = 'mean', sentence_handler: SentenceHandler = SentenceHandler(),
             random_state: int = 12345, hidden_concat: bool = False):
    """
    This is the main Bert Summarizer class.

    :param model: This parameter is associated with the inherit string parameters from the transformers library.
    :param custom_model: If you have a pre-trained model, you can add the model class here.
    :param custom_tokenizer: If you have a custom tokenizer, you can add the tokenizer here.
    :param hidden: This signifies which layer of the BERT model you would like to use as embeddings.
    :param reduce_option: Given the output of the bert model, this param determines how you want to reduce results.
    :param sentence_handler: The handler to process sentences. If want to use coreference, instantiate and pass a
        CoreferenceHandler instance.
    :param random_state: The random state to reproduce summarizations.
    :param hidden_concat: Whether or not to concat multiple hidden layers.
    """
    # NOTE(review): the default `SentenceHandler()` instance is created once at def-time
    # and shared across calls — apparently intentional project-wide; confirm it is stateless.
    super(Summarizer, self).__init__(model, custom_model, custom_tokenizer, hidden, reduce_option,
                                     sentence_handler, random_state, hidden_concat)
def __init__(self, model: str = 'bert-large-uncased', custom_model: PreTrainedModel = None,
             custom_tokenizer: PreTrainedTokenizer = None, hidden: Union[List[int], int] = -2,
             reduce_option: str = 'mean', sentence_handler: SentenceHandler = SentenceHandler(),
             random_state: int = 12345, hidden_concat: bool = False):
    """
    Parent extractive Bert summarizer; new methods should build on this class.

    :param model: Pretrained model name understood by the transformers library.
    :param custom_model: Optional pre-instantiated model used instead of *model*.
    :param custom_tokenizer: Optional custom tokenizer to pair with the model.
    :param hidden: Which hidden layer(s) of the BERT model to use as embeddings.
    :param reduce_option: How to reduce the model output (e.g. 'mean').
    :param sentence_handler: Handler that splits raw text into sentences; pass a
        CoreferenceHandler instance to use coreference resolution.
    :param random_state: Seed used to make summarizations reproducible.
    :param hidden_concat: Whether or not to concatenate multiple hidden layers.
    """
    # Seed numpy up front so downstream randomness is reproducible.
    np.random.seed(random_state)
    self.random_state = random_state
    self.model = BertParent(model, custom_model, custom_tokenizer)
    self.hidden = hidden
    self.reduce_option = reduce_option
    self.sentence_handler = sentence_handler
    self.hidden_concat = hidden_concat
def __init__(self, transformer_type: str = 'Bert', transformer_model_key: str = 'bert-base-uncased',
             transformer_tokenizer_key: str = None, hidden: int = -2, reduce_option: str = 'mean',
             sentence_handler: SentenceHandler = SentenceHandler(), random_state: int = 12345):
    """
    Generic transformer summarizer selected by *transformer_type*.

    Falls back on the tokenizer key of the model when no dedicated tokenizer
    key is supplied.
    """
    # Register newer architectures; on an older transformers version the
    # class names above are undefined, so swallow the failure and keep the
    # base MODEL_DICT as-is.
    try:
        self.MODEL_DICT['Roberta'] = (RobertaModel, RobertaTokenizer)
        self.MODEL_DICT['Albert'] = (AlbertModel, AlbertTokenizer)
        self.MODEL_DICT['Camembert'] = (CamembertModel, CamembertTokenizer)
    except Exception:
        pass  # older transformer version

    model_clz, tokenizer_clz = self.MODEL_DICT[transformer_type]
    model = model_clz.from_pretrained(transformer_model_key, output_hidden_states=True)
    if transformer_tokenizer_key is not None:
        tokenizer = tokenizer_clz.from_pretrained(transformer_tokenizer_key)
    else:
        tokenizer = tokenizer_clz.from_pretrained(transformer_model_key)
    super().__init__(None, model, tokenizer, hidden, reduce_option, sentence_handler, random_state)
def __init__(
        self,
        sentence_handler: SentenceHandler = SentenceHandler(language=English),
        transformer_type: str = 'Bert',
        transformer_model_key: str = 'bert-base-uncased',
        transformer_tokenizer_key: str = None,
        hidden: Union[List[int], int] = -2,
        reduce_option: str = 'mean',
        random_state: int = 12345,
        hidden_concat: bool = False,
):
    """
    Transformer-based summarizer with an English sentence handler by default.

    Resolves the model/tokenizer classes from MODEL_DICT via *transformer_type*
    and forwards everything to the parent constructor.
    """
    # Register newer architectures; the names are undefined on an older
    # transformers version, in which case we keep the base MODEL_DICT.
    try:
        self.MODEL_DICT['Roberta'] = (RobertaModel, RobertaTokenizer)
        self.MODEL_DICT['Albert'] = (AlbertModel, AlbertTokenizer)
        self.MODEL_DICT['Camembert'] = (CamembertModel, CamembertTokenizer)
        self.MODEL_DICT['Bart'] = (BartModel, BartTokenizer)
        self.MODEL_DICT['Longformer'] = (LongformerModel, LongformerTokenizer)
    except Exception:
        pass  # older transformer version

    model_clz, tokenizer_clz = self.MODEL_DICT[transformer_type]
    model = model_clz.from_pretrained(transformer_model_key, output_hidden_states=True)
    if transformer_tokenizer_key is not None:
        tokenizer = tokenizer_clz.from_pretrained(transformer_tokenizer_key)
    else:
        tokenizer = tokenizer_clz.from_pretrained(transformer_model_key)

    super().__init__(
        sentence_handler, None, model, tokenizer, hidden,
        reduce_option, random_state, hidden_concat,
    )
def __init__(self, model='bert-large-uncased', custom_model: PreTrainedModel = None,
             custom_tokenizer: PreTrainedTokenizer = None, hidden: int = -2,
             reduce_option: str = 'mean', sentence_handler: SentenceHandler = SentenceHandler(),
             random_state: int = 12345):
    """Single-model summarizer; simply forwards every option to the parent class."""
    super(SingleModel, self).__init__(
        model=model,
        custom_model=custom_model,
        custom_tokenizer=custom_tokenizer,
        hidden=hidden,
        reduce_option=reduce_option,
        sentence_handler=sentence_handler,
        random_state=random_state,
    )
def PreProcessor(body, summary_length, min_length: int = 40):
    """
    Summarize *body* by clustering BERT sentence embeddings with k-means.

    :param body: Raw input text to summarize.
    :param summary_length: Number of sentences (k-means clusters) to keep.
    :param min_length: Minimum sentence length accepted by the sentence handler.
    :return: The selected sentences joined into a single string.
    """
    model = BertParent('bert-large-uncased')
    algorithm = 'kmeans'
    sentence_handler = SentenceHandler()
    random_state = 12345
    # Bug fix: honour the min_length parameter — it was accepted but the call
    # hard-coded min_length=40, silently ignoring the caller's value.
    sentences = sentence_handler(body, min_length=min_length, max_length=600)
    print(len(sentences))
    if sentences:
        # hidden is an n x 1024 matrix of BERT word embeddings, n = number of sentences.
        hidden = model(sentences)
        # Run k-means over the embeddings and keep one representative sentence per cluster.
        hidden_args = cluster(hidden, algorithm, random_state, summary_length)
        sentences = [sentences[j] for j in hidden_args]
    return ' '.join(sentences)
def __init__( self, transformer_type: str = 'Bert', transformer_model_key: str = 'bert-base-uncased', transformer_tokenizer_key: str = None, hidden: Union[List[int], int] = -2, reduce_option: str = 'mean', sentence_handler: SentenceHandler = SentenceHandler(), random_state: int = 12345, hidden_concat: bool = False, ): """ :param transformer_type: The Transformer type, such as Bert, GPT2, DistilBert, etc. :param transformer_model_key: The transformer model key. This is the directory for the model. :param transformer_tokenizer_key: The transformer tokenizer key. This is the tokenizer directory. :param hidden: The hidden output layers to use for the summarization. :param reduce_option: The reduce option, such as mean, max, min, median, etc. :param sentence_handler: The sentence handler class to process the raw text. :param random_state: The random state to use. :param hidden_concat: Deprecated hidden concat option. """ try: self.MODEL_DICT['Roberta'] = (RobertaModel, RobertaTokenizer) self.MODEL_DICT['Albert'] = (AlbertModel, AlbertTokenizer) self.MODEL_DICT['Camembert'] = (CamembertModel, CamembertTokenizer) self.MODEL_DICT['Bart'] = (BartModel, BartTokenizer) self.MODEL_DICT['Longformer'] = (LongformerModel, LongformerTokenizer) except Exception: pass # older transformer version model_clz, tokenizer_clz = self.MODEL_DICT[transformer_type] model = model_clz.from_pretrained(transformer_model_key, output_hidden_states=True) tokenizer = tokenizer_clz.from_pretrained( transformer_tokenizer_key if transformer_tokenizer_key is not None else transformer_model_key) super().__init__(None, model, tokenizer, hidden, reduce_option, sentence_handler, random_state, hidden_concat)
def test_num_sentences(summarizer, passage):
    """The summarizer must return exactly the requested number of sentences."""
    summary = summarizer(passage, num_sentences=3)
    summary_sentences = SentenceHandler().process(summary)
    assert len(summary_sentences) == 3
def sentence_handler():
    """Provide a default-configured SentenceHandler."""
    handler = SentenceHandler()
    return handler