def __init__(self, article, abstract_sentences, vocab, hps):
  self.hps = hps

  # Get ids of special tokens
  start_decoding = vocab.word2id(data.START_DECODING)
  stop_decoding = vocab.word2id(data.STOP_DECODING)

  article_words = article.split()
  if len(article_words) > hps.max_enc_steps:
    article_words = article_words[:hps.max_enc_steps]
  self.enc_len = len(article_words)
  self.enc_input = [vocab.word2id(w) for w in article_words]

  abstract = ' '.join(abstract_sentences)
  abstract_words = abstract.split()
  abs_ids = [vocab.word2id(w) for w in abstract_words]
  self.dec_input, self.target = self.get_dec_inp_targ_seqs(
      abs_ids, hps.max_dec_steps, start_decoding, stop_decoding)
  self.dec_len = len(self.dec_input)

  self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)
  abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)
  _, self.target = self.get_dec_inp_targ_seqs(
      abs_ids_extend_vocab, hps.max_dec_steps, start_decoding, stop_decoding)

  self.original_article = article
  self.original_abstract = abstract
  self.original_abstract_sents = abstract_sentences
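# Note: every variant in this listing calls self.get_dec_inp_targ_seqs(...) but the method body is
# not shown here. A minimal sketch consistent with how it is called (decoder input is START + ids,
# target is ids + STOP, both capped at max_len) is given below; the exact behavior in any
# particular fork may differ.
def get_dec_inp_targ_seqs(self, sequence, max_len, start_id, stop_id):
  """Build the decoder input ([start] + sequence) and target (sequence + [stop]),
  truncating both to max_len so they always have the same length."""
  inp = [start_id] + sequence[:]
  target = sequence[:]
  if len(inp) > max_len:      # truncate; the stop token is dropped when we truncate
    inp = inp[:max_len]
    target = target[:max_len]
  else:                       # no truncation: append the stop token to the target
    target.append(stop_id)
  assert len(inp) == len(target)
  return inp, target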
def __init__(self, article, abstract_sentences, vocab, concept_vocab):
  start_decoding = vocab.word2id(data.START_DECODING)
  stop_decoding = vocab.word2id(data.STOP_DECODING)

  article = ' '.join(article)
  article_words = article.split()
  if len(article_words) > config.max_enc_steps:
    article_words = article_words[:config.max_enc_steps]
  self.enc_len = len(article_words)
  self.enc_input = [vocab.word2id(w) for w in article_words]

  abstract = ' '.join(abstract_sentences)
  abstract_words = abstract.split()
  abs_ids = [vocab.word2id(w) for w in abstract_words]
  self.dec_input, self.target = self.get_dec_inp_targ_seqs(
      abs_ids, config.max_dec_steps, start_decoding, stop_decoding)
  self.dec_len = len(self.dec_input)

  if config.pointer_gen:
    (self.enc_input_extend_vocab, self.article_oovs,
     self.enc_input_concept_extend_vocab, self.concept_p,
     self.position, self.concept_mask) = data.article2ids(article_words, vocab, concept_vocab)
    abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)
    _, self.target = self.get_dec_inp_targ_seqs(
        abs_ids_extend_vocab, config.max_dec_steps, start_decoding, stop_decoding)

  self.original_article = article
  self.original_abstract = abstract
  self.original_abstract_sents = abstract_sentences
def __init__(self, article, ner_path, abstract_sentences, vocab, hps):
  """Initializes the Example, performing tokenization and truncation to produce the encoder,
  decoder and target sequences, which are stored in self.

  Args:
    article: source text; a string. each token is separated by a single space.
    abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token
      is separated by a single space.
    vocab: Vocabulary object
    hps: hyperparameters
  """
  self.hps = hps

  # Get ids of special tokens
  start_decoding = vocab.word2id(data.START_DECODING)
  stop_decoding = vocab.word2id(data.STOP_DECODING)

  # Process the article
  article_words = article.split()
  if len(article_words) > hps.max_enc_steps:
    article_words = article_words[:hps.max_enc_steps]
  self.enc_len = len(article_words)  # store the length after truncation but before padding
  self.enc_input = [vocab.word2id(w) for w in article_words]  # list of word ids; OOVs are represented by the id for UNK token

  # Process the abstract
  abstract = ' '.join(abstract_sentences)  # string
  abstract_words = abstract.split()  # list of strings
  abs_ids = [vocab.word2id(w) for w in abstract_words]  # list of word ids; OOVs are represented by the id for UNK token

  # Get the decoder input sequence and target sequence
  self.dec_input, self.target = self.get_dec_inp_targ_seqs(
      abs_ids, hps.max_dec_steps, start_decoding, stop_decoding)
  self.dec_len = len(self.dec_input)

  # Process importance
  self.word_id_to_imp = data.get_word_id_to_importance(ner_path, vocab)

  # If using pointer-generator mode, we need to store some extra info
  if hps.pointer_gen:
    # Store a version of the enc_input where in-article OOVs are represented by their temporary
    # OOV id; also store the in-article OOV words themselves
    self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)

    # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
    abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

    # Overwrite decoder target sequence so it uses the temp article OOV ids
    _, self.target = self.get_dec_inp_targ_seqs(
        abs_ids_extend_vocab, hps.max_dec_steps, start_decoding, stop_decoding)

  # Store the original strings
  self.original_article = article
  self.original_abstract = abstract
  self.original_abstract_sents = abstract_sentences
def __init__(self, article, abstract_sentences, pos, vocab_in, vocab_out, config, types=None):
  self._config = config

  # Get ids of special tokens
  start_decoding = vocab_in.word2id(data.START_DECODING)
  stop_decoding = vocab_in.word2id(data.STOP_DECODING)

  # Process the article
  article_words = article.split()
  if len(article_words) > config.max_enc_steps:
    article_words = article_words[:config.max_enc_steps]
  self.enc_len = len(article_words)  # store the length after truncation but before padding
  self.enc_input = [vocab_in.word2id(w) for w in article_words]  # list of word ids; OOVs are represented by the id for UNK token

  if config.use_pos_tag:
    if pos is None:
      self.decode_pos, self.target_pos = [], []
      self.enc_pos = []
    else:
      pos_words = pos.split()
      if len(pos_words) > config.max_enc_steps:
        pos_words = pos_words[:config.max_enc_steps]
      assert len(pos_words) == len(article_words)
      # self.enc_pos = [vocab_in.tag2id[w] for w in pos_words]
      self.enc_pos = [vocab_out.vocab_tag.word2id(w) for w in pos_words]
      self.decode_pos, self.target_pos = self.get_dec_inp_targ_seqs(
          self.enc_pos, config.max_dec_steps, start_decoding, stop_decoding)

  if config.types:
    self.types = types

  # Process the abstract
  abstract = ' '.join(abstract_sentences)  # string
  abstract_words = abstract.split()  # list of strings
  abs_ids = [vocab_out.word2id(w) for w in abstract_words]  # list of word ids; OOVs are represented by the id for UNK token

  # Get the decoder input sequence and target sequence
  self.dec_input, self.target = self.get_dec_inp_targ_seqs(
      abs_ids, config.max_dec_steps, start_decoding, stop_decoding)
  self.dec_len = len(self.dec_input)

  # If using pointer-generator mode, we need to store some extra info
  if config.pointer_gen:
    # Store a version of the enc_input where in-article OOVs are represented by their temporary
    # OOV id; also store the in-article OOV words themselves
    self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab_out)

    # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
    abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab_out, self.article_oovs)

    # Overwrite decoder target sequence so it uses the temp article OOV ids
    _, self.target = self.get_dec_inp_targ_seqs(
        abs_ids_extend_vocab, config.max_dec_steps, start_decoding, stop_decoding)

  # Store the original strings
  self.original_article = article
  self.original_abstract = abstract
  self.original_abstract_sents = abstract_sentences
def __init__(self, article, abstract_sentences, vocab):
  # Get ids of special tokens
  start_decoding = vocab.word2id(data.START_DECODING)
  stop_decoding = vocab.word2id(data.STOP_DECODING)

  # Process the article; truncate if it exceeds the configured length
  article_words = article.split()
  if len(article_words) > config.max_enc_steps:
    article_words = article_words[:config.max_enc_steps]
  self.enc_len = len(article_words)  # store the length after truncation but before padding
  # Encode the article; OOV words are mapped to the UNK id here
  self.enc_input = [vocab.word2id(w) for w in article_words]  # list of word ids; OOVs are represented by the id for UNK token

  # Process the abstract
  abstract = ' '.join(abstract_sentences)  # string
  abstract_words = abstract.split()  # list of strings
  # Encode the abstract
  abs_ids = [vocab.word2id(w) for w in abstract_words]  # list of word ids; OOVs are represented by the id for UNK token

  # Get the decoder input sequence and target sequence
  self.dec_input, _ = self.get_dec_inp_targ_seqs(
      abs_ids, config.max_dec_steps, start_decoding, stop_decoding)
  self.dec_len = len(self.dec_input)

  # If using pointer-generator mode, we need to store some extra info
  # Store a version of the enc_input where in-article OOVs are represented by their temporary
  # OOV id; also store the in-article OOV words themselves
  self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)

  # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
  abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

  # Get the decoder target sequence, with OOV handling
  _, self.target = self.get_dec_inp_targ_seqs(
      abs_ids_extend_vocab, config.max_dec_steps, start_decoding, stop_decoding)

  # Store the original strings
  self.original_article = article
  self.original_abstract = abstract
  self.original_abstract_sents = abstract_sentences
def __init__(self, title, article, tags, abstract_sentences, abstract_sentences_all, vocab, hps, stop_words):
  """Initializes the Example, performing tokenization and truncation to produce the encoder,
  decoder and target sequences, which are stored in self.

  Args:
    article: source text; a string. each token is separated by a single space.
    abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token
      is separated by a single space.
    vocab: Vocabulary object
    hps: hyperparameters
  """
  self.hps = hps

  # Get ids of special tokens
  start_decoding = vocab.word2id(data.START_DECODING)
  stop_decoding = vocab.word2id(data.STOP_DECODING)

  # Process the article
  article_words = article.split()
  if len(article_words) > hps.max_enc_steps:
    article_words = article_words[:hps.max_enc_steps]
  self.enc_len = len(article_words)  # store the length after truncation but before padding
  self.enc_input = [vocab.word2id(w) for w in article_words]  # list of word ids; OOVs are represented by the id for UNK token

  # Process the abstract
  abstract = ' '.join(abstract_sentences)  # string
  abstract_words = abstract.split()  # list of strings
  abs_ids = [vocab.word2id(w) for w in abstract_words]  # list of word ids; OOVs are represented by the id for UNK token

  # Get the decoder input sequence and target sequence
  self.dec_input, self.target = self.get_dec_inp_targ_seqs(
      abs_ids, hps.max_dec_steps, start_decoding, stop_decoding)
  self.dec_len = len(self.dec_input)

  if hps.title_engaged or hps.title_guided:
    title_words = title.split()
    self.title_input = [vocab.word2id(w) for w in title_words[:hps.max_title_len]]
    self.title_len = len(self.title_input)

  # If using pointer-generator mode, we need to store some extra info
  if hps.pointer_gen:
    # Store a version of the enc_input where in-article OOVs are represented by their temporary
    # OOV id; also store the in-article OOV words themselves
    self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)

    if (hps.co_occurrence or hps.prev_relation or hps.co_occurrence_h or hps.co_occurrence_i
        or (hps.coverage and hps.coverage_weighted) or hps.attention_weighted
        or hps.markov_attention or hps.markov_attention_contribution):
      self.cooccurrence_matrix, self.cooccurrence_weight = data.get_cooccurrence_matrix(
          self.enc_input_extend_vocab,
          win_size=hps.occurrence_window_size,
          exclude_words=stop_words,
          need_weight=(hps.co_occurrence_i or (hps.coverage and hps.coverage_weighted)
                       or hps.attention_weighted or hps.markov_attention
                       or hps.markov_attention_contribution),
          top_ten_kept=hps.top_ten_kept)

    # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
    abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

    # Overwrite decoder target sequence so it uses the temp article OOV ids
    _, self.target = self.get_dec_inp_targ_seqs(
        abs_ids_extend_vocab, hps.max_dec_steps, start_decoding, stop_decoding)

  if tags is not None:
    self.tags = tags[:self.enc_len]

  # Store the original strings
  self.original_article = article
  self.original_abstract = abstract
  self.original_abstract_sents = abstract_sentences
  self.original_abstract_sents_all = abstract_sentences_all
def __init__(self, article, abstract_sentences, vocab):
  # Get ids of special tokens
  start_decoding = vocab.word2id(data.START_DECODING)
  stop_decoding = vocab.word2id(data.STOP_DECODING)

  # Process the article
  article = article.decode('utf-8')
  article_words = article.split()
  if len(article_words) > config.max_enc_steps:
    article_words = article_words[:config.max_enc_steps]
  self.enc_len = len(article_words)  # store the length after truncation but before padding
  self.enc_input = [vocab.word2id(w) for w in article_words]  # list of word ids; OOVs are represented by the id for UNK token

  # TextRank keyword scores for each article word
  twk = textrank.TextRankKeyword()
  twk.analyze(article_words, window_size=4, lower=False)
  self.word_rank = twk.makeword_rank()
  self.word_rank_data = []
  for w in article_words:
    self.word_rank_data.append(self.word_rank[w])

  # Process the abstract
  abstract = ' '.join(abstract_sentences)  # string
  abstract_words = abstract.split()  # list of strings
  abs_ids = [vocab.word2id(w) for w in abstract_words]  # list of word ids; OOVs are represented by the id for UNK token

  # Get the decoder input sequence and target sequence
  self.dec_input, self.target = self.get_dec_inp_targ_seqs(
      abs_ids, config.max_dec_steps, start_decoding, stop_decoding)
  self.dec_len = len(self.dec_input)

  # If using pointer-generator mode, we need to store some extra info
  if config.pointer_gen:
    # Store a version of the enc_input where in-article OOVs are represented by their temporary
    # OOV id; also store the in-article OOV words themselves
    self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)

    # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
    abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

    # Overwrite decoder target sequence so it uses the temp article OOV ids
    _, self.target = self.get_dec_inp_targ_seqs(
        abs_ids_extend_vocab, config.max_dec_steps, start_decoding, stop_decoding)

  # Store the original strings
  self.original_article = article
  self.original_abstract = abstract
  self.original_abstract_sents = abstract_sentences
def __init__(self, content_text, field_text, summary_sentences, vocab, hps):
  """Initializes the Example, performing tokenization and truncation to produce the encoder,
  decoder and target sequences, which are stored in self.

  Args:
    content_text, field_text: source text; a string. each token is separated by a single space.
    summary_sentences: list of strings, one per summary sentence. In each sentence, each token
      is separated by a single space.
    vocab: Vocabulary object
    hps: hyperparameters
  """
  self.hps = hps

  # Get ids of special tokens
  start_decoding = vocab.word2id(data.START_DECODING, field=False)
  stop_decoding = vocab.word2id(data.STOP_DECODING, field=False)

  # Process the content_text
  content_words = content_text.split()
  if len(content_words) > hps.max_enc_steps:
    content_words = content_words[:hps.max_enc_steps]
  self.enc_len_content = len(content_words)  # store the length after truncation but before padding
  self.enc_input_content = [vocab.word2id(w, field=False) for w in content_words]  # list of word ids; OOVs are represented by the id for UNK token

  # Process the field_text
  field_words = field_text.split()
  if len(field_words) > hps.max_enc_steps:
    field_words = field_words[:hps.max_enc_steps]
  self.enc_len_field = len(field_words)  # store the length after truncation but before padding
  self.enc_input_field = [vocab.word2id(w, field=True) for w in field_words]  # list of word ids; OOVs are represented by the id for UNK token
  assert self.enc_len_field == self.enc_len_content

  # Process the summary
  summary_text = ' '.join(summary_sentences)  # string
  summary_words = summary_text.split()  # list of strings
  summary_ids = [vocab.word2id(w, field=False) for w in summary_words]  # list of word ids; OOVs are represented by the id for UNK token

  # Get the decoder input sequence and target sequence
  self.dec_input, self.target = self.get_dec_inp_targ_seqs(
      summary_ids, hps.max_dec_steps, start_decoding, stop_decoding)
  self.dec_len = len(self.dec_input)

  # If using pointer-generator mode, we need to store some extra info
  if hps.pointer_gen:
    # Store a version of the enc_input where in-article OOVs are represented by their temporary
    # OOV id; also store the in-article OOV words themselves
    self.enc_input_extend_vocab, self.article_oovs = data.article2ids(content_words, vocab)

    # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
    sum_ids_extend_vocab = data.abstract2ids(summary_words, vocab, self.article_oovs)

    # Overwrite decoder target sequence so it uses the temp article OOV ids
    _, self.target = self.get_dec_inp_targ_seqs(
        sum_ids_extend_vocab, hps.max_dec_steps, start_decoding, stop_decoding)

  # Store the original strings
  self.original_content = content_text
  self.original_field = field_text
  self.original_summary = summary_text
  self.original_summary_sents = summary_sentences
def __init__(self, article, abstract_sens, vocab, hps):
  """Initializes the Example, performing tokenization and truncation to produce the encoder,
  decoder and target sequences.

  Args:
    article: source text; a string. each token is separated by a single space.
    abstract_sens: list of strings, one per abstract sentence. In each sentence, each token is
      separated by a single space.
    vocab: Vocabulary object
    hps: hyperparameters
  """
  self.__hps = hps

  start_id = vocab.w2i(data.START_TOKEN)
  stop_id = vocab.w2i(data.STOP_TOKEN)

  # article
  article_words = article.split()
  if len(article_words) > hps.max_enc_steps:
    article_words = article_words[:hps.max_enc_steps]
  self.enc_len = len(article_words)
  self.enc_input = [vocab.w2i(w) for w in article_words]

  # abstract
  abstract = ' '.join(abstract_sens)
  abstract_words = abstract.split()
  if len(abstract_words) > hps.max_dec_steps:
    abstract_words = abstract_words[:hps.max_dec_steps]

  # pointer generator
  # Store a version of the enc_input where in-article OOVs are represented by their temporary
  # OOV id, plus the in-article OOV words themselves
  self.enc_input_ext_vocab, self.article_oovs = data.article2ids(article_words, vocab)

  # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
  abs_ids = [vocab.w2i(w) for w in abstract_words]
  self.dec_input, self.dec_target = self.__get_dec_input_target_seqs(
      abs_ids, hps.max_dec_steps, start_id, stop_id)
  self.dec_len = len(self.dec_input)

  abs_ids_ext_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)
  _, self.ext_dec_target = self.__get_dec_input_target_seqs(
      abs_ids_ext_vocab, hps.max_dec_steps, start_id, stop_id)

  # origin backup
  self.origin_article = article
  self.origin_abstract = abstract
  self.origin_abstract_sens = abstract_sens
def __init__(self, paragraph, question, answer, answer_positions, vocab, max_enc_steps, max_dec_steps, dynamic_vocab=False):
  self.dynamic_vocab = dynamic_vocab

  # Get ids of special tokens
  start_decoding = vocab.word2id(data.START_DECODING)
  stop_decoding = vocab.word2id(data.STOP_DECODING)

  paragraph_words = word_tokenize(paragraph)
  question_words = word_tokenize(question)
  answer_start_idx, answer_end_idx = answer_positions
  # assert ' '.join(paragraph_words[answer_start_idx:answer_end_idx]) == answer

  # Process the paragraph
  if len(paragraph_words) > max_enc_steps:
    if answer_end_idx <= max_enc_steps:
      paragraph_words = paragraph_words[:max_enc_steps]
    else:
      answer_mid_idx = (answer_start_idx + answer_end_idx) // 2
      # assume len(answer_words) <= len(paragraph_words)
      paragraph_trunc_end = min(answer_mid_idx + max_enc_steps // 2, len(paragraph_words))
      paragraph_trunc_start = paragraph_trunc_end - max_enc_steps + 1
      assert (paragraph_trunc_start <= answer_start_idx) and (paragraph_trunc_end >= answer_end_idx)
      paragraph_words = paragraph_words[paragraph_trunc_start:paragraph_trunc_end]
      answer_start_idx -= paragraph_trunc_start
      answer_end_idx -= paragraph_trunc_start
  self.enc_len = len(paragraph_words)  # store the length after truncation but before padding
  self.enc_input = [vocab.word2id(w) for w in paragraph_words]  # list of word ids; OOVs are represented by the id for UNK token

  # Process the question
  question_ids = [vocab.word2id(w) for w in question_words]  # list of word ids; OOVs are represented by the id for UNK token

  # Get the decoder input sequence and target sequence
  self.dec_input, self.target = self.get_dec_inp_targ_seqs(
      question_ids, max_dec_steps, start_decoding, stop_decoding)
  self.dec_len = len(self.dec_input)

  # If using pointer-generator mode, we need to store some extra info
  if self.dynamic_vocab:
    # Store a version of the enc_input where in-article OOVs are represented by their temporary
    # OOV id; also store the in-article OOV words themselves
    self.enc_input_extend_vocab, self.enc_oovs = data.article2ids(paragraph_words, vocab)

    # Get a version of the reference question where in-article OOVs are represented by their temporary article OOV id
    question_ids_extend_vocab = data.abstract2ids(question_words, vocab, self.enc_oovs)

    # Overwrite decoder target sequence so it uses the temp article OOV ids
    self.dec_input_extend_vocab, self.target = self.get_dec_inp_targ_seqs(
        question_ids_extend_vocab, max_dec_steps, start_decoding, stop_decoding)

  # Store the original strings
  self.original_paragraph = paragraph
  self.original_question = question
  self.original_answer = answer  # ' '.join(paragraph_words[answer_start_idx:answer_end_idx])
  self.answer_start_idx = answer_start_idx
  self.answer_end_idx = answer_end_idx
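# The truncation branch above keeps the answer span inside the encoder window by centring a window
# of max_enc_steps - 1 tokens on the answer midpoint. A standalone restatement of that arithmetic
# (hypothetical helper name, same formulas) with a worked example:
def truncate_around_answer(n_words, answer_start, answer_end, max_enc_steps):
  """Return (trunc_start, trunc_end, new_answer_start, new_answer_end)."""
  if answer_end <= max_enc_steps:       # answer already fits in the head of the paragraph
    return 0, max_enc_steps, answer_start, answer_end
  mid = (answer_start + answer_end) // 2
  trunc_end = min(mid + max_enc_steps // 2, n_words)
  trunc_start = trunc_end - max_enc_steps + 1     # window holds max_enc_steps - 1 tokens
  return trunc_start, trunc_end, answer_start - trunc_start, answer_end - trunc_start

# e.g. a 300-token paragraph, answer at [140, 150), max_enc_steps = 100:
# mid = 145, trunc_end = min(195, 300) = 195, trunc_start = 96,
# so the window is words[96:195] and the answer shifts to [44, 54).
print(truncate_around_answer(300, 140, 150, 100))  # (96, 195, 44, 54)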
def __init__(self, article, abstract_sentences, vocab):
  # Get ids of special tokens
  start_decoding = vocab.word2id(data.START_DECODING)  # START_DECODING = '[START]'
  stop_decoding = vocab.word2id(data.STOP_DECODING)    # STOP_DECODING = '[STOP]'

  # Process the article
  article_words = article.split()
  if len(article_words) > config.max_enc_steps:  # max_enc_steps=400
    article_words = article_words[:config.max_enc_steps]
  # store the length after truncation but before padding
  self.enc_len = len(article_words)
  # list of word ids; OOVs are represented by the id for UNK token
  self.enc_input = [vocab.word2id(w) for w in article_words]  # word --> id

  # Process the abstract
  abstract = ' '.join(abstract_sentences)  # string
  abstract_words = abstract.split()  # list of strings
  abs_ids = [vocab.word2id(w) for w in abstract_words]  # list of word ids; OOVs are represented by the id for UNK token

  # Get the decoder input sequence and target sequence
  self.dec_input, self.target = self.get_dec_inp_targ_seqs(
      abs_ids, config.max_dec_steps, start_decoding, stop_decoding)
  self.dec_len = len(self.dec_input)

  # If using pointer-generator mode, we need to store some extra info
  if config.pointer_gen:  # pointer_gen=True
    # Store a version of the enc_input where in-article OOVs are represented by their temporary
    # OOV id; also store the in-article OOV words themselves
    self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)

    # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
    abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

    # Overwrite decoder target sequence so it uses the temp article OOV ids
    _, self.target = self.get_dec_inp_targ_seqs(
        abs_ids_extend_vocab, config.max_dec_steps, start_decoding, stop_decoding)

  # Store the original strings
  self.original_article = article
  self.original_abstract = abstract
  self.original_abstract_sents = abstract_sentences
def __init__(self, content, query, summary, vocab):
  # Get ids of special tokens
  start_decoding = vocab.word2id(data.START_DECODING)
  stop_decoding = vocab.word2id(data.STOP_DECODING)

  # Process the content, query and summary
  content_words = content.split()
  query_words = query.split()
  summary_words = summary.split()
  if len(content_words) > config.max_enc_steps:
    content_words = content_words[:config.max_enc_steps]
  self.enc_len = len(content_words)  # store the length after truncation but before padding
  self.enc_input = [vocab.word2id(w) for w in content_words]  # list of word ids; OOVs are represented by the id for UNK token
  self.query_enc_input = [vocab.word2id(w) for w in query_words]

  # Process the summary
  summary_ids = [vocab.word2id(w) for w in summary_words]  # list of word ids; OOVs are represented by the id for UNK token

  # Get the decoder input sequence and target sequence
  self.dec_input, self.target = self.get_dec_inp_targ_seqs(
      summary_ids, config.max_dec_steps, start_decoding, stop_decoding)
  self.dec_len = len(self.dec_input)

  # If using pointer-generator mode, we need to store some extra info
  if config.pointer_gen:
    # Store a version of the enc_input where in-article OOVs are represented by their temporary
    # OOV id; also store the in-article OOV words themselves
    self.enc_input_extend_vocab, self.content_oovs = data.article2ids(content_words, vocab)

    # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
    abs_ids_extend_vocab = data.abstract2ids(summary_words, vocab, self.content_oovs)

    # Overwrite decoder target sequence so it uses the temp article OOV ids
    _, self.target = self.get_dec_inp_targ_seqs(
        abs_ids_extend_vocab, config.max_dec_steps, start_decoding, stop_decoding)

  # Store the original strings
  self.original_content = content
  self.original_query = query
  self.original_summary = summary
def __init__(self, article, abstract_sentence, vocab, config):
  # Get ids of special tokens
  start_decoding = vocab.word2id(data.BOS_WORD)
  stop_decoding = vocab.word2id(data.EOS_WORD)

  # Process the article
  # if article == 'nan':
  #   article_words = ['']
  # else:
  article_words = article.split()
  if len(article_words) > config.max_enc_steps:
    article_words = article_words[:config.max_enc_steps]
  self.enc_len = len(article_words)  # store the length after truncation but before padding
  self.enc_input = [vocab.word2id(w) for w in article_words]  # list of word ids; OOVs are represented by the id for UNK token

  # Process the abstract
  abstract_words = abstract_sentence.split()  # list of strings
  abs_ids = [vocab.word2id(w) for w in abstract_words]  # list of word ids; OOVs are represented by the id for UNK token

  # Get the decoder input sequence and target sequence
  self.dec_input, self.target = self.get_dec_inp_seqs(
      abs_ids, config.max_dec_steps, start_decoding, stop_decoding)

  # If using pointer-generator mode, we need to store some extra info
  if config.pointer_gen:
    # Store a version of the enc_input where in-article OOVs are represented by their temporary
    # OOV id; also store the in-article OOV words themselves
    self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)

    # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
    abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

    # Overwrite decoder target sequence so it uses the temp article OOV ids
    _, self.target = self.get_dec_inp_seqs(
        abs_ids_extend_vocab, config.max_dec_steps, start_decoding, stop_decoding)

  # Store the original strings
  # self.original_article = article
  self.original_abstract = abstract_sentence
def __init__(self, article, abstract_sentences, vocab):
  # Get ids of special tokens
  start_decoding = vocab.word2id(data.START_DECODING)
  stop_decoding = vocab.word2id(data.STOP_DECODING)

  # Process the article; truncate if it exceeds the configured length
  article_words = list(article)  # Chinese text: split into characters (character-level segmentation)
  if len(article_words) > config.max_enc_steps:
    article_words = article_words[:config.max_enc_steps]
  self.enc_len = len(article_words)  # store the length after truncation but before padding

  # Encode the article; OOV words are mapped to the UNK id here
  self.enc_input = [vocab.word2id(w) for w in article_words]

  # Process the abstract
  abstract = ' '.join(abstract_sentences)  # string
  abstract_words = list(abstract)  # list of characters

  # Encode the abstract
  abs_ids = [vocab.word2id(w) for w in abstract_words]

  # Build the decoder input and target sequences: "start w1 w2" and "w1 w2 end"; both must have the same length
  self.dec_input, self.target = self.get_dec_inp_targ_seqs(
      abs_ids, config.max_dec_steps, start_decoding, stop_decoding)
  self.dec_len = len(self.dec_input)

  # If using pointer-generator mode, we need some extra info
  if config.pointer_gen:
    # The encoder needs both the article ids and the ids of in-article OOV words
    self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)

    # Get the reference-summary ids, where OOV words are represented by their in-article OOV ids
    abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

    # Build the target sequence with OOV handling
    _, self.target = self.get_dec_inp_targ_seqs(
        abs_ids_extend_vocab, config.max_dec_steps, start_decoding, stop_decoding)

  # Store the original strings
  self.original_article = article
  self.original_abstract = abstract  # the abstract before encoding
  self.original_abstract_sents = abstract_sentences
def __init__(self, article, abstract_sentences, vocab, hps):
  """Initializes the Example, performing tokenization and truncation to produce the encoder,
  decoder and target sequences, which are stored in self.

  Args:
    article: source text; a string. each token is separated by a single space.
    abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token
      is separated by a single space.
    vocab: Vocabulary object
    hps: hyperparameters
  """
  self.hps = hps

  # Get ids of special tokens
  start_decoding = vocab.word2id(data.START_DECODING)
  stop_decoding = vocab.word2id(data.STOP_DECODING)

  # Process the article
  article_words = article.split()
  if len(article_words) > hps.max_enc_steps:
    article_words = article_words[:hps.max_enc_steps]
  self.enc_len = len(article_words)  # store the length after truncation but before padding
  self.enc_input = [vocab.word2id(w) for w in article_words]  # list of word ids; OOVs are represented by the id for UNK token

  # Process the abstract
  abstract = ' '.join(abstract_sentences)  # string
  abstract_words = abstract.split()  # list of strings
  abs_ids = [vocab.word2id(w) for w in abstract_words]  # list of word ids; OOVs are represented by the id for UNK token

  # Get the decoder input sequence and target sequence
  self.dec_input, self.target = self.get_dec_inp_targ_seqs(
      abs_ids, hps.max_dec_steps, start_decoding, stop_decoding)
  self.dec_len = len(self.dec_input)

  # If using pointer-generator mode, we need to store some extra info
  if hps.pointer_gen:
    # Store a version of the enc_input where in-article OOVs are represented by their temporary
    # OOV id; also store the in-article OOV words themselves
    self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)

    # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
    abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

    # Overwrite decoder target sequence so it uses the temp article OOV ids
    _, self.target = self.get_dec_inp_targ_seqs(
        abs_ids_extend_vocab, hps.max_dec_steps, start_decoding, stop_decoding)

  # Store the original strings
  self.original_article = article
  self.original_abstract = abstract
  self.original_abstract_sents = abstract_sentences
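# For reference, the data.article2ids / data.abstract2ids helpers used throughout assign each
# in-article OOV a temporary id vocab_size + k. The sketch below assumes a vocab.size() method and
# an UNKNOWN_TOKEN constant as in the original pointer-generator data module; forks that pass extra
# arguments (concept vocabs, shared OOV lists, entity flags) extend these signatures.
def article2ids(article_words, vocab):
  """Map article words to ids; the k-th distinct in-article OOV gets id vocab.size() + k."""
  ids, oovs = [], []
  unk_id = vocab.word2id(UNKNOWN_TOKEN)
  for w in article_words:
    i = vocab.word2id(w)
    if i == unk_id:                               # w is OOV
      if w not in oovs:
        oovs.append(w)
      ids.append(vocab.size() + oovs.index(w))    # temporary in-article OOV id
    else:
      ids.append(i)
  return ids, oovs

def abstract2ids(abstract_words, vocab, article_oovs):
  """Map abstract words to ids; OOVs that also occur in the article reuse its temporary id."""
  ids = []
  unk_id = vocab.word2id(UNKNOWN_TOKEN)
  for w in abstract_words:
    i = vocab.word2id(w)
    if i == unk_id:
      if w in article_oovs:                       # in-article OOV: use the temporary id
        ids.append(vocab.size() + article_oovs.index(w))
      else:                                       # out-of-article OOV: stays UNK
        ids.append(unk_id)
    else:
      ids.append(i)
  return ids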
def __init__(self, article, abstract, vocab, hps):
  """ """
  self.hps = hps
  start_id = vocab._word2id(data.DECODING_START)
  end_id = vocab._word2id(data.DECODING_END)

  article_words = article.split()
  if len(article_words) > hps.max_enc_steps:
    article_words = article_words[:hps.max_enc_steps]
  self.enc_len = len(article_words)
  self.enc_input = [vocab._word2id(w) for w in article_words]

  abstract_words = abstract.split()
  abs_ids = [vocab._word2id(w) for w in abstract_words]
  self.dec_input = [start_id] + abs_ids
  self.dec_target = abs_ids + [end_id]

  self.pad_id = vocab._word2id(data.PAD_TOKEN)

  if hps.pointer:
    self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)
    abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)
    self.dec_target = abs_ids_extend_vocab + [end_id]
    while len(self.enc_input_extend_vocab) < hps.max_enc_steps:
      self.enc_input_extend_vocab.append(self.pad_id)

  while len(self.enc_input) < hps.max_enc_steps:
    self.enc_input.append(self.pad_id)
  while len(self.dec_input) < hps.max_dec_steps:
    self.dec_input.append(self.pad_id)
    self.dec_target.append(self.pad_id)
  if len(self.dec_input) > hps.max_dec_steps:
    self.dec_input = self.dec_input[:hps.max_dec_steps - 1]
    self.dec_target = self.dec_target[:hps.max_dec_steps - 1]
  self.dec_len = len(self.dec_input)

  self.original_article = article
  self.original_abstract = abstract
def __init__(self, article, abstract, vocab):
  # Start and stop ids
  start_decoding = vocab.word2id(data.START_DECODING)
  stop_decoding = vocab.word2id(data.STOP_DECODING)

  article_words = article.split()
  if len(article_words) > config.max_enc_steps:
    article_words = article_words[:config.max_enc_steps]
  self.enc_len = len(article_words)

  # Encode the article; words not in the vocab are mapped to the UNK id. This is the encoder input.
  self.enc_input = [vocab.word2id(w) for w in article_words]

  # Process the abstract; words not in the vocab are mapped to the UNK id
  abstract_words = abstract.split()
  abs_ids = [vocab.word2id(w) for w in abstract_words]

  # Decoder input and target sequences
  self.dec_input, self.target = self.get_dec_inp_targ_seqs(
      abs_ids, config.max_dec_steps, start_decoding, stop_decoding)
  self.dec_len = len(self.dec_input)

  # In pointer-generator mode, the UNK ids in enc_input are replaced by
  # vocab_size + the word's position in the in-article OOV list
  if config.pointer_gen:
    # Encoder input extended with OOV ids (UNKs now get their own ids), plus the OOV words themselves
    self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)

    # Get the reference-summary ids, where OOV words are represented by their in-article OOV ids
    abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

    # New target sequence in which in-article OOV words have their own ids
    _, self.target = self.get_dec_inp_targ_seqs(
        abs_ids_extend_vocab, config.max_dec_steps, start_decoding, stop_decoding)

  # Store the original strings
  self.original_article = article
  self.original_abstract = abstract
  self.original_abstract_sents = abstract  # the abstract before encoding
def __init__(self, article, abstract_sentences, vocab):
  # Get ids of special tokens
  start_decoding = vocab.word2id(data.START_DECODING)
  stop_decoding = vocab.word2id(data.STOP_DECODING)

  # Process the article
  article_words = article.split()
  if len(article_words) > config.max_enc_steps:
    article_words = article_words[:config.max_enc_steps]
  self.enc_len = len(article_words)  # store the length after truncation but before padding
  self.enc_input = [vocab.word2id(w) for w in article_words]  # list of word ids; OOVs are represented by the id for UNK token

  # Process the abstract
  abstract = ' '.join(abstract_sentences)  # string
  abstract_words = abstract.split()  # list of strings
  abs_ids = [vocab.word2id(w) for w in abstract_words]  # list of word ids; OOVs are represented by the id for UNK token

  # Get the decoder input sequence and target sequence
  self.dec_input, self.target = self.get_dec_inp_targ_seqs(
      abs_ids, config.max_dec_steps, start_decoding, stop_decoding)
  self.dec_len = len(self.dec_input)

  # If using pointer-generator mode, we need to store some extra info
  if config.pointer_gen:
    # Store a version of the enc_input where in-article OOVs are represented by their temporary
    # OOV id; also store the in-article OOV words themselves
    self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)

    # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
    abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

    # Overwrite decoder target sequence so it uses the temp article OOV ids
    _, self.target = self.get_dec_inp_targ_seqs(
        abs_ids_extend_vocab, config.max_dec_steps, start_decoding, stop_decoding)

  # Store the original strings
  self.original_article = article
  self.original_abstract = abstract
  self.original_abstract_sents = abstract_sentences
def __init__(self, article, abstract_sentences, vocab, hps, log_path):
  """Initializes the Example, performing tokenization and truncation to produce the encoder,
  decoder and target sequences, which are stored in self.

  Args:
    article: source text; a string. each token is separated by a single space.
    abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token
      is separated by a single space.
    vocab: Vocabulary object
    hps: hyperparameters
  """
  self.hps = hps

  # Get ids of special tokens
  start_decoding = vocab.word2id(data.START_DECODING)
  stop_decoding = vocab.word2id(data.STOP_DECODING)

  # Process the article
  article_words = article.split()
  if len(article_words) > hps.max_enc_steps:
    article_words = article_words[:hps.max_enc_steps]
  self.enc_len = len(article_words)  # store the length after truncation but before padding
  self.enc_input = [vocab.word2id(w) for w in article_words]  # list of word ids; OOVs are represented by the id for UNK token

  if log_path is not None:  # For testing
    reference_cluster_dir = os.path.join(log_path, "reference")
    if not os.path.exists(reference_cluster_dir):
      os.makedirs(reference_cluster_dir)

    # Process the abstract
    abstract = ' '.join(abstract_sentences)  # string
    abstract = run_coreference_resolution_for_testing(abstract, reference_cluster_dir)
    abstract_words = abstract.split()  # list of strings
    abs_ids = [vocab.word2id(w) for w in abstract_words]  # list of word ids; OOVs are represented by the id for UNK token
  else:
    # Process the abstract
    abstract = ' '.join(abstract_sentences)  # string
    abstract = run_coreference_resolution_for_training(abstract)
    abstract_words = abstract.split()  # list of strings
    abs_ids = [vocab.word2id(w) for w in abstract_words]  # list of word ids; OOVs are represented by the id for UNK token

  # Get the decoder input sequence and target sequence
  self.dec_input, self.target = self.get_dec_inp_targ_seqs(
      abs_ids, hps.max_dec_steps, start_decoding, stop_decoding)
  self.dec_len = len(self.dec_input)

  # If using pointer-generator mode, we need to store some extra info
  if hps.pointer_gen:
    # Store a version of the enc_input where in-article OOVs are represented by their temporary
    # OOV id; also store the in-article OOV words themselves
    self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)

    # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
    abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

    if hps.coreference_resolution:
      indices = []
      antecedent = get_antecedent(abstract)
      if antecedent is not None:
        for key in antecedent:
          positions = [i for i, j in enumerate(abstract_words) if j == key]
          if positions:
            positions = np.asarray(positions)
            closest_index = positions[(np.abs(positions - antecedent[key])).argmin()]
            indices.append(closest_index)
      idx = 0
      if indices:
        for i in range(len(abs_ids_extend_vocab)):
          if i == indices[idx]:
            if idx < len(indices) - 1:
              idx += 1
            else:
              continue
          else:
            abs_ids_extend_vocab[i] = 0
    else:
      # Overwrite decoder target sequence so it uses the temp article OOV ids
      _, self.target = self.get_dec_inp_targ_seqs(
          abs_ids_extend_vocab, hps.max_dec_steps, start_decoding, stop_decoding)

  # Store the original strings
  self.original_article = article
  self.original_abstract = abstract
  self.original_abstract_sents = abstract_sentences
def __init__(self, article_sentences, extract_ids, abstract_sentences, vocab, hps):
  """Initializes the Example, performing tokenization and truncation to produce the encoder,
  decoder and target sequences, which are stored in self.

  Args:
    article_sentences: source text; a list of strings. each token is separated by a single space.
    abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token
      is separated by a single space.
    vocab: Vocabulary object
    hps: hyperparameters
  """
  self.hps = hps

  # Store the original strings
  self.original_article_sents = article_sentences
  self.original_extract_ids = extract_ids
  self.original_abstract_sents = abstract_sentences

  if hps.model in ['rewriter', 'end2end']:
    # Get ids of special tokens
    start_decoding = vocab.word2id(data.START_DECODING)
    stop_decoding = vocab.word2id(data.STOP_DECODING)

    if hps.model == 'rewriter':
      # Process the extracted sentences
      extract_sentences = [article_sentences[idx] for idx in extract_ids]
      enc_input_words = ' '.join(extract_sentences).split()
    else:
      # Process the article sentences
      enc_input_words = ' '.join(article_sentences).split()
      self.enc_input_sent_ids = []
      for idx, sent in enumerate(article_sentences):
        sent_words = sent.split()
        for _ in range(len(sent_words)):
          if len(self.enc_input_sent_ids) < hps.max_enc_steps:
            self.art_len = idx + 1
            self.enc_input_sent_ids.append(idx)

    if len(enc_input_words) > hps.max_enc_steps:
      enc_input_words = enc_input_words[:hps.max_enc_steps]
    self.enc_len = len(enc_input_words)  # store the length after truncation but before padding
    self.enc_input = [vocab.word2id(w) for w in enc_input_words]  # list of word ids; OOVs are represented by the id for UNK token

    # Process the abstract
    abstract = ' '.join(abstract_sentences)  # string
    abstract_words = abstract.split()  # list of strings
    abs_ids = [vocab.word2id(w) for w in abstract_words]  # list of word ids; OOVs are represented by the id for UNK token

    # Get the decoder input sequence and target sequence
    self.dec_input, self.target = self.get_dec_inp_targ_seqs(
        abs_ids, hps.max_dec_steps, start_decoding, stop_decoding)
    self.dec_len = len(self.dec_input)

    # Store a version of the enc_input where in-article OOVs are represented by their temporary
    # OOV id; also store the in-article OOV words themselves
    self.enc_input_extend_vocab, self.article_oovs = data.article2ids(enc_input_words, vocab)

    # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
    abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

    # Overwrite decoder target sequence so it uses the temp article OOV ids
    _, self.target = self.get_dec_inp_targ_seqs(
        abs_ids_extend_vocab, hps.max_dec_steps, start_decoding, stop_decoding)

  if hps.model in ['selector', 'end2end']:
    # Process the article
    if hps.model == 'selector':
      if len(article_sentences) > hps.max_art_len:
        article_sentences = article_sentences[:hps.max_art_len]
      self.art_len = len(article_sentences)  # store the length after truncation but before padding
    elif hps.model == 'end2end':
      if self.art_len > hps.max_art_len:
        self.art_len = hps.max_art_len
      article_sentences = article_sentences[:self.art_len]

    self.art_ids = []
    self.sent_lens = []
    for sent in article_sentences:
      sent = sent.split()
      if len(sent) > hps.max_sent_len:
        sent = sent[:hps.max_sent_len]
      self.sent_lens.append(len(sent))
      self.art_ids.append([vocab.word2id(w) for w in sent])
def __init__(self, article, abstract_sentences, vocab, hps, word_edge_list=None, query=None,
             query_edge_list=None, epoch_num=None, bert_vocab=None):
  """Initializes the Example, performing tokenization and truncation to produce the encoder,
  decoder and target sequences, which are stored in self.

  Args:
    article: source text; a string. each token is separated by a single space.
    abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token
      is separated by a single space.
    vocab: Vocabulary object
    hps: hyperparameters
  """
  self.hps = hps

  # Get ids of special tokens
  start_decoding = vocab.word2id(data.START_DECODING)
  stop_decoding = vocab.word2id(data.STOP_DECODING)

  self.bert_vocab = bert_vocab
  self.epoch_num = epoch_num  # deprecated
  self.enc_pos_offset = None
  self.query_pos_offset = None

  # Process the article
  article_words = article.split()
  if len(article_words) > hps.max_enc_steps.value:
    article_words = article_words[:hps.max_enc_steps.value]
  self.enc_len = len(article_words)  # store the length after truncation but before padding
  self.enc_input = [vocab.word2id(w) for w in article_words]  # list of word ids; OOVs are represented by the id for UNK token
  # tf.logging.info(self.enc_len)
  if self.hps.use_elmo.value:
    self.enc_input_raw = article_words

  # Process the abstract
  abstract = ' '.join(abstract_sentences)  # string
  abstract_words = abstract.split()  # list of strings
  abs_ids = [vocab.word2id(w) for w in abstract_words]  # list of word ids; OOVs are represented by the id for UNK token

  # Process the query
  if hps.query_encoder.value:
    query_words = query.split()
    # query_words = word_features.get_tokens(query)
    if len(query_words) > hps.max_query_steps.value:
      # tf.logging.info('Before_query: %d Hps: %d' % (len(query_words), hps.max_query_steps.value))
      query_words = query_words[len(query_words) - hps.max_query_steps.value:]
      # tf.logging.info('Big_query : %d' % (len(query_words)))
      query = " ".join(q for q in query_words)
    self.query_len = len(query_words)  # store the length after truncation but before padding
    self.query_input = [vocab.word2id(w) for w in query_words]  # list of word ids; OOVs are represented by the id for UNK token
    if self.hps.use_query_elmo.value:
      self.query_input_raw = query_words  # tensorflow_hub requires raw text

  # Get the decoder input sequence and target sequence
  self.dec_input, self.target = self.get_dec_inp_targ_seqs(
      abs_ids, hps.max_dec_steps.value, start_decoding, stop_decoding)
  self.dec_len = len(self.dec_input)

  # If using pointer-generator mode, we need to store some extra info
  if hps.pointer_gen.value:
    # Store a version of the enc_input where in-article OOVs are represented by their temporary
    # OOV id; also store the in-article OOV words themselves
    self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)

    # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
    abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

    # Overwrite decoder target sequence so it uses the temp article OOV ids
    _, self.target = self.get_dec_inp_targ_seqs(
        abs_ids_extend_vocab, hps.max_dec_steps.value, start_decoding, stop_decoding)

  if hps.word_gcn.value:
    self.word_edge_list = word_edge_list
  if hps.query_gcn.value:
    self.query_edge_list = query_edge_list

  if hps.use_bert.value:
    self.enc_input, self.enc_pos_offset = bert_vocab.convert_glove_to_bert_indices(self.enc_input)
    self.enc_len = len(self.enc_input)
  if hps.use_query_bert.value:
    self.query_input, self.query_pos_offset = bert_vocab.convert_glove_to_bert_indices(self.query_input)
    self.query_len = len(self.query_input)

  # Store the original strings
  self.original_article = article
  self.original_abstract = abstract
  self.original_abstract_sents = abstract_sentences
  # if hps.query_encoder:
  self.original_query = query
def __init__(self, article, abstract, vocab, hps):
  """Initializes the Example, performing tokenization and truncation to produce the encoder,
  decoder and target sequences, which are stored in self.

  Args:
    article: source text; a string. each token is separated by a single space.
    abstract: reference summary; a string. each token is separated by a single space.
    vocab: Vocabulary object
    hps: hyperparameters
  """
  self.hps = hps

  # Get ids of special tokens
  start_decoding = vocab.word2id(data.START_DECODING, None)
  stop_decoding = vocab.word2id(data.STOP_DECODING, None)

  # Process the article
  article_words = [data.parse_word(word) for word in article.split()]
  if len(article_words) > hps.max_enc_steps:
    article_words = article_words[:hps.max_enc_steps]
  # Store the length after truncation but before padding
  self.enc_len = len(article_words)
  # List of word ids; OOVs and entities are represented by ids less than data.N_FREE_TOKENS
  self.enc_input = [vocab.word2id(w, word_type) for w, word_type in article_words]

  # Process the abstract
  abstract_words = [data.parse_word(word) for word in abstract.split()]
  # List of word ids; OOVs and entities are represented by ids less than data.N_FREE_TOKENS
  abs_ids = [vocab.word2id(w, word_type) for w, word_type in abstract_words]

  # Get the decoder input sequence and target sequence with non-article-specific ids.
  self.dec_input, target_orig = self.get_dec_inp_targ_seqs(
      abs_ids, hps.max_dec_steps, start_decoding, stop_decoding)
  self.dec_len = len(self.dec_input)

  # Store a version of the enc_input where in-article OOVs and entities are represented by
  # their temporary OOV id. Also store the in-article OOV words themselves and a mapping
  # from temporary OOV ids to vocab ids.
  self.enc_input_extend_vocab, self.article_oovs, self.article_id_to_word_id = (
      data.article2ids(article_words, vocab, hps.copy_only_entities))

  # Get set of words that can be copied.
  if hps.copy_only_entities:
    # article_oovs only has entities
    copyable_words = set(self.article_oovs)
  else:
    copyable_words = set([w for w, word_type in article_words])

  # Get a version of the reference summary where in-article OOVs are represented by their
  # temporary article OOV id
  abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs,
                                           copyable_words, hps.output_vocab_size)

  # Set decoder target sequence that uses the temp article OOV ids
  _, self.target = self.get_dec_inp_targ_seqs(
      abs_ids_extend_vocab, hps.max_dec_steps, start_decoding, stop_decoding)

  # Compute a mask for which tokens are people.
  people_tokens = {vocab.word2id('', token) for token in data.PERSON_TOKENS}
  self.target_people = [float(token in people_tokens) for token in target_orig]

  # Get list of people ids
  self.people_ids = []
  for article_id, word_id in self.article_id_to_word_id.iteritems():
    if word_id in people_tokens:
      self.people_ids.append(article_id)

  # Store the original strings
  self.original_article = article
  self.original_abstract = abstract
def __init__(self, article, abstract_sentences, article_id, sections, section_names, labels, vocab, hps):
  """Initializes the Example, performing tokenization and truncation to produce the encoder,
  decoder and target sequences, which are stored in self.

  Args:
    article: source text; a list of strings. each token is separated by a single space.
    abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token
      is separated by a single space.
    article_id: string
    sections: list of list of strings
    section_names: list of strings
    labels: list of strings, for extractive summarization training (TODO later)
    vocab: Vocabulary object
    hps: hyperparameters
  """
  self.hps = hps
  self.discard = False

  # Get ids of special tokens
  start_decoding = vocab.word2id(data.START_DECODING)
  stop_decoding = vocab.word2id(data.STOP_DECODING)

  # Clean section information: drop sections after the conclusion
  if hps.hier:
    end_loc = len(section_names)
    beg_loc = 0
    for i, s in enumerate(section_names):
      if 'conclu' in s.lower():
        end_loc = i + 1
      if 'intro' in s.lower() and beg_loc == 0:
        beg_loc = i
    if beg_loc < len(section_names) - end_loc:
      sections = sections[beg_loc:end_loc]

    try:
      intro_last = sections[beg_loc][-2:]  # last two sentences in the intro
    except IndexError:
      # print('article_id: {}, len(sections): {}, section_names: {}'.format(article_id, len(sections), section_names))
      self.discard = True
      return

    # intro_first = []
    i = 0
    # intro_last_len = _count_words(intro_last)
    # intro_len = intro_last_len
    # while intro_len < hps.max_intro_len:
    #   intro_first.append(sections[beg_loc][i])
    #   intro_len = _count_words(intro_first) + intro_last_len
    #   i += 1
    if not hps.split_intro:
      max_sents = hps.max_intro_sents - 2  # exclude the last two sents
      intro_first = sections[beg_loc][:max_sents]
      intro_last_words = _get_section_words(intro_last, pad=False)
      # flatten list of sents, get the string inside, count words
      intro_last_len = len(intro_last_words)
      discard_last = False
      if intro_last_len > hps.max_intro_len:
        discard_last = True
      len_limit = hps.max_intro_len - intro_last_len if not discard_last else hps.max_intro_len
      # Truncate the intro by len_limit (we consider the last 2 sentences from the intro to always be there).
      # Flatten the list of lists, get the first element (string), get words, take the first n words,
      # return a string, make it a list, extend it with intro_last.
      intro_words = _get_section_words(intro_first, len_limit, pad=False)
      try:
        if intro_words[-1] != '.':
          intro_words = intro_words[:-1] + ['.']
        if not discard_last:
          intro_words += intro_last_words
        intro_words = _pad_words(intro_words, hps.max_intro_len)
      except IndexError:
        print('No first section, Example discarded: ', article_id)
        self.discard = True
    else:
      intro_first = sections[beg_loc][:hps.max_intro_sents]
      intro_words = _get_section_words(intro_first, hps.max_intro_len, pad=True)

    try:
      conclusion_words = _get_section_words(
          sections[end_loc - beg_loc - 1][:hps.max_conclusion_sents], hps.max_conclusion_len)
    except:
      import pdb; pdb.set_trace()
      print("ERROR, pause and check")
      print('end_loc:', end_loc)
      print('section_names:', section_names)
      print('num_sections: {}'.format(len(sections)))
      print('len_sections_sents:', [len(e) for e in sections])

    # if not hps.intro_split:
    article_sections = [_get_section_words(s[:hps.max_section_sents], hps.max_section_len)
                        for s in sections[1:-1][:hps.num_sections - 2]]
    # else:
    #   tmp_sections = []
    #   remaining_sec = sections[1:-1]
    #   if len(remaining_sec) > hps.num_sections - 2:
    #     for i in range(hps.num_sections - 2):
    #       tmp_sections.append(remaining_sec[i])
    #     last_sec = []
    #     while i < len(remaining_sec):
    #       last_sec.extend(remaining_sec[i])
    #       i += 1
    #     tmp_sections.append(last_sec)
    #     remaining_sec = tmp_sections
    #   article_sections = [_get_section_words(s, hps.max_section_len)
    #                       for s in remaining_sec]

    sections = [intro_words] + article_sections + [conclusion_words]
    sec_len = len(sections)
    self.sec_len = sec_len
    self.num_words_section = [hps.max_section_len for e in sections]
    self.num_words_section_nopad = [len(e) for e in sections]
    # TODO: Assumption is that sections is a list of lists (sections, sentences); check if the assumption is true
    # TODO: Assumption is that the number of sections is greater than 2; check if the assumption is true

  # pad_id = vocab.word2id(data.PAD_TOKEN)
  article_text = ' '.join(article)

  # Process the article
  article_words = article_text.split()
  if len(article_words) > hps.max_enc_steps:
    article_words = article_words[:hps.max_enc_steps]
  # store the length after truncation but before padding
  self.enc_len = len(article_words)
  # list of word ids; OOVs are represented by the id for UNK token
  self.enc_input = [vocab.word2id(w) for w in article_words]

  if hps.hier:
    self.enc_sections = []
    for sec in sections:
      self.enc_sections.append([vocab.word2id(w) for w in sec])
    self.enc_sec_len = [len(e) for e in self.enc_sections]
    # self.enc_sec_len = sec_len  # TODO: check

  # Process the abstract
  abstract = ' '.join(abstract_sentences)  # string
  abstract_words = abstract.split()  # list of strings
  # list of word ids; OOVs are represented by the id for UNK token
  abs_ids = [vocab.word2id(w) for w in abstract_words]

  # Get the decoder input sequence and target sequence
  self.dec_input, self.target = self.get_dec_inp_targ_seqs(
      abs_ids, hps.max_dec_steps, start_decoding, stop_decoding)
  self.dec_len = len(self.dec_input)

  # If using pointer-generator mode, we need to store some extra info
  if hps.pointer_gen:
    # Store a version of the enc_input where in-article OOVs are represented by their temporary
    # OOV id; also store the in-article OOV words themselves
    self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)

    # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
    abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

    # Overwrite decoder target sequence so it uses the temp article OOV ids; the target now
    # includes words that are in the article but not in the vocab, represented by those OOV ids
    _, self.target = self.get_dec_inp_targ_seqs(
        abs_ids_extend_vocab, hps.max_dec_steps, start_decoding, stop_decoding)

  self.article_id = article_id
  self.sections = sections
  self.section_names = section_names
  self.labels = labels

  # Store the original strings
  self.original_article = article
  self.original_abstract = abstract
  self.original_abstract_sents = abstract_sentences
def __init__(self, article, abstract_sentences, vocab, hps):
  """Initializes the Example, performing tokenization and truncation to produce the encoder,
  decoder and target sequences, which are stored in self.

  Args:
    article: source text; a string. each token is separated by a single space.
    abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token
      is separated by a single space.
    vocab: Vocabulary object
    hps: hyperparameters
  """
  # Leena: change article and abstract_sentences here if you want to do a quick decoding test
  article = article.lower()
  self.hps = hps

  # Get ids of special tokens
  start_decoding = vocab.word2id(data.START_DECODING)
  stop_decoding = vocab.word2id(data.STOP_DECODING)

  # Process the article
  article_words = article.split()
  # Leena: trying to understand the data pipeline
  # print("article_words: %s" % article_words)  # word tokens; this would be used further in the code
  if len(article_words) > hps.max_enc_steps:
    article_words = article_words[:hps.max_enc_steps]
  self.enc_len = len(article_words)  # store the length after truncation but before padding
  self.enc_input = [vocab.word2id(w) for w in article_words]  # list of word ids; OOVs are represented by the id for UNK token

  # Process the abstract
  abstract = ' '.join(abstract_sentences)  # string
  # Leena: used this to save summaries in a file to train the LM
  # if hps.mode == 'train':
  #   with open("/home/leena/Documents/thesis/pointer-gen/pointer-generator-master/nlm/train_data.txt", "a") as myfile:
  #     myfile.write("\n%s\n" % abstract)
  # if hps.mode == 'eval':
  #   with open("/home/leena/Documents/thesis/pointer-gen/pointer-generator-master/nlm/valid_data.txt", "a") as myfile:
  #     myfile.write("\n%s\n" % abstract)
  # if hps.mode == 'decode':
  #   with open("/home/leena/Documents/thesis/pointer-gen/pointer-generator-master/nlm/test_data.txt", "a") as myfile:
  #     myfile.write("\n%s\n" % abstract)
  abstract_words = abstract.split()  # list of strings
  abs_ids = [vocab.word2id(w) for w in abstract_words]  # list of word ids; OOVs are represented by the id for UNK token

  # Get the decoder input sequence and target sequence
  self.dec_input, self.target = self.get_dec_inp_targ_seqs(
      abs_ids, hps.max_dec_steps, start_decoding, stop_decoding)
  self.dec_len = len(self.dec_input)

  # If using pointer-generator mode, we need to store some extra info
  if hps.pointer_gen:
    # Store a version of the enc_input where in-article OOVs are represented by their temporary
    # OOV id; also store the in-article OOV words themselves
    self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)

    # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
    abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

    # Overwrite decoder target sequence so it uses the temp article OOV ids
    _, self.target = self.get_dec_inp_targ_seqs(
        abs_ids_extend_vocab, hps.max_dec_steps, start_decoding, stop_decoding)

  # Store the original strings
  self.original_article = article
  self.original_abstract = abstract
  self.original_abstract_sents = abstract_sentences
def __init__(self, article, abstract_sentences, vocab, hps): """Initializes the Example, performing tokenization and truncation to produce the encoder, decoder and target sequences, which are stored in self. # example初始化,一个example是一个article-abstract对 Args: article: source text; a string. each token is separated by a single space. (src text,用string表示) abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token is separated by a single space.(abstract sentences列表,用string表示) vocab: Vocabulary object hps: hyperparameters """ self.hps = hps # Get ids of special tokens # 得到start<s>和stop</s>的id start_decoding = vocab.word2id(data.START_DECODING) stop_decoding = vocab.word2id(data.STOP_DECODING) # Process the article # article string数组 article_words = article.split() # 如果超过则裁剪 if len(article_words) > hps.max_enc_steps: article_words = article_words[:hps.max_enc_steps] # padding之前的长度 self.enc_len = len( article_words ) # store the length after truncation but before padding # article word ids,oov表示为unk self.enc_input = [ vocab.word2id(w) for w in article_words ] # list of word ids; OOVs are represented by the id for UNK token # Process the abstract abstract = ' '.join(abstract_sentences) # string abstract_words = abstract.split() # list of strings # abstract word ids,oov表示为unk abs_ids = [ vocab.word2id(w) for w in abstract_words ] # list of word ids; OOVs are represented by the id for UNK token # Get the decoder input sequence and target sequence # decoder的input是<s> + seq,target是seq + </s> self.dec_input, self.target = self.get_dec_inp_targ_seqs( abs_ids, hps.max_dec_steps, start_decoding, stop_decoding) # dec input的长度(padding前) self.dec_len = len(self.dec_input) # If using pointer-generator mode, we need to store some extra info if hps.pointer_gen: # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves #如果定义了pointer gen,则得到扩展vocab(id list)和article中的oov(str list) self.enc_input_extend_vocab, self.article_oovs = data.article2ids( article_words, vocab) # Get a verison of the reference summary where in-article OOVs are represented by their temporary article OOV id # 得到abstract的ids(extend版本) abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs) # Overwrite decoder target sequence so it uses the temp article OOV ids # 覆盖target _, self.target = self.get_dec_inp_targ_seqs( abs_ids_extend_vocab, hps.max_dec_steps, start_decoding, stop_decoding) # Store the original strings self.original_article = article self.original_abstract = abstract self.original_abstract_sents = abstract_sentences
def __init__(self, reviews, ratings, answer_sentences, question, label, vocab, hps): """Initializes the Example, performing tokenization and truncation to produce the encoder, decoder and target sequences, which are stored in self. Args: reviews: review text; a list. ratings: alist. answer_sentences: list of strings, one per abstract sentence. In each sentence, each token is separated by a single space. vocab: Vocabulary object hps: hyperparameters """ self.hps = hps # Get ids of special tokens start_decoding = vocab.word2id(data.START_DECODING) stop_decoding = vocab.word2id(data.STOP_DECODING) # Process the reviews self.r_lens = [] self.r_batch = [] self.rating_batch = ratings reviews_words = [] for review in reviews: review_words = review.split() if len(review_words) > hps.max_enc_steps: review_words = review_words[:hps.max_enc_steps] reviews_words.append(review_words) self.r_lens.append( len(review_words )) # store the length after truncation but before padding self.r_batch.append( [vocab.word2id(w) for w in review_words] ) # list of word ids; OOVs are represented by the id for UNK token # Process the abstract answer = ' '.join(answer_sentences) # string answer_words = answer.split() # list of strings ans_ids = [ vocab.word2id(w) for w in answer_words ] # list of word ids; OOVs are represented by the id for UNK token # Process the question question_words = question.split() self.q_lens = len(question_words) self.q_batch = [vocab.word2id(w) for w in question_words] # Process the label self.y_target = label # Get the decoder input sequence and target sequence self.dec_input, self.target = self.get_dec_inp_targ_seqs( ans_ids, hps.max_dec_steps, start_decoding, stop_decoding) self.dec_len = len(self.dec_input) assert self.dec_len > 0 # If using pointer-generator mode, we need to store some extra info if hps.pointer_gen: self.oovs = [] # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves self.reviews_extend_vocab = [] for review_words in reviews_words: review_extend_vocab, self.oovs = data.article2ids( review_words, vocab, self.oovs) self.reviews_extend_vocab.append(review_extend_vocab) # question OOV id. self.question_extend_vocab, self.oovs = data.article2ids( question_words, vocab, self.oovs) # Get a verison of the reference summary where in-article OOVs are represented by their temporary article OOV id ans_ids_extend_vocab = data.abstract2ids(answer_words, vocab, self.oovs) # Overwrite decoder target sequence so it uses the temp article OOV ids _, self.target = self.get_dec_inp_targ_seqs( ans_ids_extend_vocab, hps.max_dec_steps, start_decoding, stop_decoding) # Store the original strings self.original_reviews = reviews self.original_answer = answer self.original_answer_sents = answer_sentences self.original_question = question
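# The review/question variant above passes a shared self.oovs list into data.article2ids so
# that temporary OOV ids stay consistent across all reviews and the question. A hedged sketch
# of such an accumulating article2ids (an assumption about that fork's data.py; UNKNOWN_TOKEN
# and vocab.size() are assumed to match the usual pointer-generator vocabulary interface):
UNKNOWN_TOKEN = '[UNK]'  # assumed UNK symbol

def article2ids(article_words, vocab, oovs=None):
    ids = []
    oovs = [] if oovs is None else oovs
    unk_id = vocab.word2id(UNKNOWN_TOKEN)
    for w in article_words:
        i = vocab.word2id(w)
        if i == unk_id:  # w is out of vocabulary
            if w not in oovs:
                oovs.append(w)
            # Temporary OOV id: vocab size + index into the shared OOV list.
            ids.append(vocab.size() + oovs.index(w))
        else:
            ids.append(i)
    return ids, oovs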
def __init__(self, article, abstract_sentences, all_abstract_sentences, doc_indices, raw_article_sents, ssi, article_lcs_paths_list, vocab, hps): """Initializes the Example, performing tokenization and truncation to produce the encoder, decoder and target sequences, which are stored in self. Args: article: source text; a string. each token is separated by a single space. abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token is separated by a single space. vocab: Vocabulary object hps: hyperparameters """ self.hps = hps # Get ids of special tokens start_decoding = vocab.word2id(data.START_DECODING) stop_decoding = vocab.word2id(data.STOP_DECODING) # # Process the article # article_words = article.split() # if len(article_words) > hps.max_enc_steps: # article_words = article_words[:hps.max_enc_steps] # self.enc_input = [vocab.word2id(w) for w in article_words] # list of word ids; OOVs are represented by the id for UNK token # Process the abstract abstract = ' '.join(abstract_sentences) # string abstract_words = abstract.split() # list of strings abs_ids = [ vocab.word2id(w) for w in abstract_words ] # list of word ids; OOVs are represented by the id for UNK token # Get the decoder input sequence and target sequence self.dec_input, self.target = self.get_dec_inp_targ_seqs( abs_ids, hps.max_dec_steps, start_decoding, stop_decoding) self.dec_len = len(self.dec_input) # If using pointer-generator mode, we need to store some extra info if hps.pointer_gen: if raw_article_sents is not None and len(raw_article_sents) > 0: # self.tokenized_sents = [util.process_sent(sent) for sent in raw_article_sents] self.tokenized_sents = [ util.process_sent(sent, whitespace=True) for sent in raw_article_sents ] if self.hps.sep: for sent in self.tokenized_sents[:-1]: sent.append(data.SEP_TOKEN) # Process the article article_words = util.flatten_list_of_lists( self.tokenized_sents) if len(article_words) > hps.max_enc_steps: article_words = article_words[:hps.max_enc_steps] self.enc_input = [ vocab.word2id(w) for w in article_words ] # list of word ids; OOVs are represented by the id for UNK token if len(all_abstract_sentences) == 1: doc_indices = [0] * len(article_words) self.word_ids_sents, self.article_oovs = data.tokenizedarticle2ids( self.tokenized_sents, vocab) self.enc_input_extend_vocab = util.flatten_list_of_lists( self.word_ids_sents) if len(self.enc_input_extend_vocab) > hps.max_enc_steps: self.enc_input_extend_vocab = self.enc_input_extend_vocab[: hps . 
max_enc_steps] self.enc_len = len( self.enc_input_extend_vocab ) # store the length after truncation but before padding else: # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves article_str = util.to_unicode(article) raw_article_sents = nltk.tokenize.sent_tokenize(article_str) self.tokenized_sents = [ util.process_sent(sent) for sent in raw_article_sents ] # Process the article article_words = util.flatten_list_of_lists( self.tokenized_sents) if len(article_words) > hps.max_enc_steps: article_words = article_words[:hps.max_enc_steps] self.enc_input = [ vocab.word2id(w) for w in article_words ] # list of word ids; OOVs are represented by the id for UNK token if len(all_abstract_sentences) == 1: doc_indices = [0] * len(article_words) self.word_ids_sents, self.article_oovs = data.tokenizedarticle2ids( self.tokenized_sents, vocab) self.enc_input_extend_vocab = util.flatten_list_of_lists( self.word_ids_sents) # self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab) if len(self.enc_input_extend_vocab) > hps.max_enc_steps: self.enc_input_extend_vocab = self.enc_input_extend_vocab[: hps . max_enc_steps] self.enc_len = len( self.enc_input_extend_vocab ) # store the length after truncation but before padding if self.hps.word_imp_reg: self.enc_importances = self.get_enc_importances( self.tokenized_sents, abstract_words) # Get a verison of the reference summary where in-article OOVs are represented by their temporary article OOV id abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs) # Overwrite decoder target sequence so it uses the temp article OOV ids _, self.target = self.get_dec_inp_targ_seqs( abs_ids_extend_vocab, hps.max_dec_steps, start_decoding, stop_decoding) if ssi is not None: # Translate the similar source indices into masks over the encoder input self.ssi_masks = [] for source_indices in ssi: ssi_sent_mask = [0.] * len(raw_article_sents) for source_idx in source_indices: if source_idx >= len(ssi_sent_mask): a = 0 ssi_sent_mask[source_idx] = 1. 
ssi_mask = pg_mmr_functions.convert_to_word_level( ssi_sent_mask, self.tokenized_sents) self.ssi_masks.append(ssi_mask) summary_sent_tokens = [ sent.strip().split() for sent in abstract_sentences ] if self.hps.ssi_data_path is None and len( self.ssi_masks) != len(summary_sent_tokens): raise Exception( 'len(self.ssi_masks) != len(summary_sent_tokens)') self.sent_indices = pg_mmr_functions.convert_to_word_level( list(range(len(summary_sent_tokens))), summary_sent_tokens).tolist() if article_lcs_paths_list is not None: if len(article_lcs_paths_list) > 1: raise Exception('Need to implement for non-sent_dataset') article_lcs_paths = article_lcs_paths_list[0] imp_mask = [0] * len(article_words) to_add = 0 for source_idx, word_indices_list in enumerate(article_lcs_paths): if source_idx > 0: to_add += len(self.tokenized_sents[source_idx - 1]) for word_idx in word_indices_list: if word_idx + to_add >= len(imp_mask): if len(imp_mask) == hps.max_enc_steps: continue else: print(self.tokenized_sents, article_lcs_paths) raise Exception( 'word_idx + to_add (%d) is larger than imp_mask size (%d)' % (word_idx + to_add, len(imp_mask))) imp_mask[word_idx + to_add] = 1 self.importance_mask = imp_mask # Store the original strings self.original_article = article self.raw_article_sents = raw_article_sents self.original_abstract = abstract self.original_abstract_sents = abstract_sentences self.all_original_abstract_sents = all_abstract_sentences self.doc_indices = doc_indices self.ssi = ssi self.article_lcs_paths_list = article_lcs_paths_list
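# Hedged sketch (an assumption, not the source's pg_mmr_functions): convert_to_word_level as
# used above, i.e. each sentence-level value is repeated once per token of that sentence so the
# result aligns with the flattened token sequence (works for the 0/1 ssi masks and for the
# sentence-index vector alike).
import numpy as np

def convert_to_word_level(sent_values, tokenized_sents):
    word_values = []
    for value, sent in zip(sent_values, tokenized_sents):
        word_values.extend([value] * len(sent))
    return np.array(word_values)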
def __init__(self, article, abstract_sentences, vocab, hps): """Initializes the Example, performing tokenization and truncation to produce the encoder, decoder and target sequences, which are stored in self. Args: article: source text; a string. each token is separated by a single space. abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token is separated by a single space. vocab: Vocabulary object hps: hyperparameters """ self.hps = hps # Get ids of special tokens start_decoding = vocab.word2id(data.START_DECODING) stop_decoding = vocab.word2id(data.STOP_DECODING) # Process the article # process sent by viet nguyen sent_text = nltk.sent_tokenize(article) list_number_sent=[] article_words = [] for sentence in sent_text: split_sent = sentence.split() try: number_sent = int((split_sent[0][1:-1])) except ValueError: number_sent = 100 s = split_sent[1:] len_sent = len(s) for id in range(len_sent): list_number_sent.append(number_sent) article_words.append(s[id]) # article_words = article.split() # caculate tf word_dict = set(article_words) wordDictA = dict.fromkeys(word_dict, 0) for sentence in sent_text: split_sent = sentence.split() try: number_sent = int((split_sent[0][1:-1])) except ValueError: number_sent = 100 s = split_sent[1:] for word in s: wordDictA[word]+=1 tf_dict = {} tf_list=[] for word, count in wordDictA.items(): tf_dict[word] = count/float(len(article_words)) tf_list.append(tf_dict[word]) if len(tf_list) > hps.max_enc_steps: tf_list = tf_list[:hps.max_enc_steps] if len(list_number_sent) > hps.max_enc_steps: list_number_sent = list_number_sent[:hps.max_enc_steps] if len(article_words) > hps.max_enc_steps: article_words = article_words[:hps.max_enc_steps] self.enc_tf_list = tf_list self.enc_number_sent = list_number_sent self.enc_len = len(article_words) # store the length after truncation but before padding self.enc_input = [vocab.word2id(w) for w in article_words] # list of word ids; OOVs are represented by the id for UNK token # Process the abstract abstract = ' '.join(abstract_sentences) # string abstract_words = abstract.split() # list of strings abs_ids = [vocab.word2id(w) for w in abstract_words] # list of word ids; OOVs are represented by the id for UNK token # Get the decoder input sequence and target sequence self.dec_input, self.target = self.get_dec_inp_targ_seqs(abs_ids, hps.max_dec_steps, start_decoding, stop_decoding) self.dec_len = len(self.dec_input) # If using pointer-generator mode, we need to store some extra info if hps.pointer_gen: # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab) # Get a verison of the reference summary where in-article OOVs are represented by their temporary article OOV id abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs) # Overwrite decoder target sequence so it uses the temp article OOV ids _, self.target = self.get_dec_inp_targ_seqs(abs_ids_extend_vocab, hps.max_dec_steps, start_decoding, stop_decoding) # Store the original strings self.original_article = article self.original_abstract = abstract self.original_abstract_sents = abstract_sentences
def __init__(self, article, abstract_sentences, vocab, hps): """Initializes the Example, performing tokenization and truncation to produce the encoder, decoder and target sequences, which are stored in self. Args: article: source text; a string. each token is separated by a single space. abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token is separated by a single space. vocab: Vocabulary object hps: hyperparameters """ self.hps = hps # Get ids of special tokens start_decoding = vocab.word2id(data.START_DECODING) stop_decoding = vocab.word2id(data.STOP_DECODING) # Process the article article_words = article.split() # print ("Example __init__ article_words: ", len(article_words))# list of str # print ("__init__ hps.max_enc_steps: ", hps.max_enc_steps) # train flag if len(article_words) > hps.max_enc_steps.value: article_words = article_words[:hps.max_enc_steps.value] self.enc_len = len( article_words ) # store the length after truncation but before padding self.enc_input = [ vocab.word2id(w) for w in article_words ] # list of word ids; OOVs are represented by the id for UNK token # print ("Example __init__ self.enc_len: ", self.enc_len) # int # print ("Example __init__ self.enc_input: ", len(self.enc_input)) # list of int # Process the abstract # print ("abstract_sentences: ", abstract_sentences) if hps.use_doc_vec.value: abstract_sentences_list = abstract_sentences[0].split() subred_tag = abstract_sentences_list[0] # print ("subred_tag: ", subred_tag) abstract_sentences = [' '.join(abstract_sentences_list[1:])] # print ("abstract_sentences: ", abstract_sentences) abstract = ' '.join(abstract_sentences) # string abstract_words = abstract.split() # list of strings abs_ids = [ vocab.word2id(w) for w in abstract_words ] # list of word ids; OOVs are represented by the id for UNK token # Get the decoder input sequence and target sequence self.dec_input, self.target = self.get_dec_inp_targ_seqs( abs_ids, hps.max_dec_steps.value, start_decoding, stop_decoding) self.dec_len = len(self.dec_input) # If using pointer-generator mode, we need to store some extra info # print("__init__ hps.pointer_gen: ", hps.pointer_gen) # train flag if hps.pointer_gen.value: # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves self.enc_input_extend_vocab, self.article_oovs = data.article2ids( article_words, vocab) # Get a verison of the reference summary where in-article OOVs are represented by their temporary article OOV id abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs) # Overwrite decoder target sequence so it uses the temp article OOV ids # print ("__init__ hps.max_dec_steps: ", hps.max_dec_steps) # train flag _, self.target = self.get_dec_inp_targ_seqs( abs_ids_extend_vocab, hps.max_dec_steps.value, start_decoding, stop_decoding) # Store the original strings if hps.use_doc_vec.value: self.subred_tag = SUBRED_TABLE[subred_tag] self.original_article = article self.original_abstract = abstract self.original_abstract_sents = abstract_sentences
def __init__(self, article, abstract_sentences, vocab, hps): """Initializes the Example, performing tokenization and truncation to produce the encoder, decoder and target sequences, which are stored in self. Args: article: source text; a string. each token is separated by a single space. abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token is separated by a single space. vocab: Vocabulary object hps: hyperparameters """ self.hps = hps # Get ids of special tokens start_decoding = vocab.word2id(data.START_DECODING) stop_decoding = vocab.word2id(data.STOP_DECODING) # Process the article # Need to shuffle only the sentences within the hps.max_enc_steps. original_article_clean = article sentences = article.split('\n') if (hps.keep_stopwords < 1.0) or (hps.keep_word < 1.0): for i, sent in enumerate(sentences): sent_processed = [] for word in sent.split(' '): # Remove stopwords with specified probability if hps.keep_stopwords < 1.0: if (word.lower() in stopwords) and ( random.random() > hps.keep_stopwords): continue # Remove any word with specified probability. if hps.keep_word < 1.0: if (random.random() > hps.keep_word): continue sent_processed.append(word) sentences[i] = ' '.join(sent_processed) article = '\n'.join(sentences) if hps.shuffle_sentences: sentences = article.split('\n') token_counter = 0 for idx, sent in enumerate(sentences): token_counter += len(sent.split()) if token_counter >= hps.max_enc_steps: sentences[idx] = ' '.join(sent.split()[:hps.max_enc_steps - token_counter]) break sentences = sentences[:idx + 1] sentences = [ sent for sent in sentences if (sent != '\n' and sent != '') ] random.shuffle(sentences) article_words = ' '.join(sentences).split() else: article_words = article.split() if len(article_words) > hps.max_enc_steps: article_words = article_words[:hps.max_enc_steps] self.enc_len = len( article_words ) # store the length after truncation but before padding self.enc_input = [ vocab.word2id(w) for w in article_words ] # list of word ids; OOVs are represented by the id for UNK token # Process the abstract abstract = ' '.join(abstract_sentences) # string abstract_words = abstract.split() # list of strings abs_ids = [ vocab.word2id(w) for w in abstract_words ] # list of word ids; OOVs are represented by the id for UNK token # Get the decoder input sequence and target sequence self.dec_input, self.target = self.get_dec_inp_targ_seqs( abs_ids, hps.max_dec_steps, start_decoding, stop_decoding) self.dec_len = len(self.dec_input) # If using pointer-generator mode, we need to store some extra info if hps.pointer_gen: # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves self.enc_input_extend_vocab, self.article_oovs = data.article2ids( article_words, vocab) # Get a verison of the reference summary where in-article OOVs are represented by their temporary article OOV id abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs) # Overwrite decoder target sequence so it uses the temp article OOV ids _, self.target = self.get_dec_inp_targ_seqs( abs_ids_extend_vocab, hps.max_dec_steps, start_decoding, stop_decoding) # Store the original strings self.original_article_clean = original_article_clean self.original_article = article self.original_abstract = abstract self.original_abstract_sents = abstract_sentences
def get_sent_tokens(sent):
    # Note: relies on `vocab` and `art_oovs` from the enclosing scope.
    words = sent.split()
    return data.abstract2ids(words, vocab, art_oovs)
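# get_sent_tokens above maps a summary sentence through data.abstract2ids. A sketch of
# abstract2ids in the spirit of the original pointer-generator data.py (UNKNOWN_TOKEN is an
# assumed UNK symbol): OOV summary words that also appear in the article's OOV list get a
# temporary extended-vocab id; everything else falls back to the UNK id.
UNKNOWN_TOKEN = '[UNK]'  # assumed UNK symbol

def abstract2ids(abstract_words, vocab, article_oovs):
    ids = []
    unk_id = vocab.word2id(UNKNOWN_TOKEN)
    for w in abstract_words:
        i = vocab.word2id(w)
        if i == unk_id:  # w is out of vocabulary
            if w in article_oovs:
                ids.append(vocab.size() + article_oovs.index(w))  # copyable in-article OOV
            else:
                ids.append(unk_id)  # truly out-of-vocabulary
        else:
            ids.append(i)
    return ids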
def __init__(self, article, abstract_sentences, all_abstract_sentences, doc_indices, raw_article_sents, ssi, vocab, hps): """Initializes the Example, performing tokenization and truncation to produce the encoder, decoder and target sequences, which are stored in self. Args: article: source text; a string. each token is separated by a single space. abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token is separated by a single space. vocab: Vocabulary object hps: hyperparameters """ self.hps = hps # Get ids of special tokens start_decoding = vocab.word2id(data.START_DECODING) stop_decoding = vocab.word2id(data.STOP_DECODING) # # Process the article # article_words = article.split() # if len(article_words) > hps.max_enc_steps: # article_words = article_words[:hps.max_enc_steps] # self.enc_input = [vocab.word2id(w) for w in article_words] # list of word ids; OOVs are represented by the id for UNK token # Process the abstract abstract = ' '.join(abstract_sentences) # string abstract_words = abstract.split() # list of strings abs_ids = [vocab.word2id(w) for w in abstract_words] # list of word ids; OOVs are represented by the id for UNK token # Get the decoder input sequence and target sequence self.dec_input, self.target = self.get_dec_inp_targ_seqs(abs_ids, hps.max_dec_steps, start_decoding, stop_decoding) self.dec_len = len(self.dec_input) # If using pointer-generator mode, we need to store some extra info if hps.pointer_gen: if raw_article_sents is not None and len(raw_article_sents) > 0: # self.tokenized_sents = [util.process_sent(sent) for sent in raw_article_sents] self.tokenized_sents = [util.process_sent(sent, whitespace=True) for sent in raw_article_sents] # Process the article article_words = util.flatten_list_of_lists(self.tokenized_sents) if len(article_words) > hps.max_enc_steps: article_words = article_words[:hps.max_enc_steps] self.enc_input = [vocab.word2id(w) for w in article_words] # list of word ids; OOVs are represented by the id for UNK token if len(all_abstract_sentences) == 1: doc_indices = [0] * len(article_words) self.word_ids_sents, self.article_oovs = data.tokenizedarticle2ids(self.tokenized_sents, vocab) self.enc_input_extend_vocab = util.flatten_list_of_lists(self.word_ids_sents) if len(self.enc_input_extend_vocab) > hps.max_enc_steps: self.enc_input_extend_vocab = self.enc_input_extend_vocab[:hps.max_enc_steps] self.enc_len = len(self.enc_input_extend_vocab) # store the length after truncation but before padding else: # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves article_str = util.to_unicode(article) raw_article_sents = nltk.tokenize.sent_tokenize(article_str) self.tokenized_sents = [util.process_sent(sent) for sent in raw_article_sents] # Process the article article_words = util.flatten_list_of_lists(self.tokenized_sents) if len(article_words) > hps.max_enc_steps: article_words = article_words[:hps.max_enc_steps] self.enc_input = [vocab.word2id(w) for w in article_words] # list of word ids; OOVs are represented by the id for UNK token if len(all_abstract_sentences) == 1: doc_indices = [0] * len(article_words) self.word_ids_sents, self.article_oovs = data.tokenizedarticle2ids(self.tokenized_sents, vocab) self.enc_input_extend_vocab = util.flatten_list_of_lists(self.word_ids_sents) # self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab) if len(self.enc_input_extend_vocab) > hps.max_enc_steps: 
self.enc_input_extend_vocab = self.enc_input_extend_vocab[:hps.max_enc_steps] self.enc_len = len(self.enc_input_extend_vocab) # store the length after truncation but before padding # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs) # Overwrite decoder target sequence so it uses the temp article OOV ids _, self.target = self.get_dec_inp_targ_seqs(abs_ids_extend_vocab, hps.max_dec_steps, start_decoding, stop_decoding) # Store the original strings self.original_article = article self.raw_article_sents = raw_article_sents self.original_abstract = abstract self.original_abstract_sents = abstract_sentences self.all_original_abstract_sents = all_abstract_sentences self.doc_indices = doc_indices self.ssi = ssi
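# Hedged sketch (an assumption, not the source's data.py): tokenizedarticle2ids as used in the
# two sentence-aware variants above, i.e. article2ids applied sentence by sentence while
# sharing a single OOV list, so the flattened ids match what article2ids would produce on the
# flattened article.
def tokenizedarticle2ids(tokenized_sents, vocab):
    word_ids_sents = []
    article_oovs = []
    for sent in tokenized_sents:
        # Reuses the accumulating article2ids sketched earlier in this document.
        ids, article_oovs = article2ids(sent, vocab, article_oovs)
        word_ids_sents.append(ids)
    return word_ids_sents, article_oovs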