# Requires: List and Tuple from typing, web_tokenizer from segtok.tokenizer,
# and an InputInstance class defined elsewhere in the project.
def dataset_to_input_instances(dataset: List[Tuple[str, str, str]]) -> List[InputInstance]:
    # Each dataset entry is (sentence 1, sentence 2, extra field); the third element is unused here.
    input_instances = []
    for idx, (sent1, sent2, _) in enumerate(dataset):
        instance = InputInstance(id_=idx,
                                 sent1=web_tokenizer(sent1),
                                 sent2=web_tokenizer(sent2))
        input_instances.append(instance)
    return input_instances
def add_document(self, text):
    text = self.pre_filter(text)
    # Split into sentences, then into word tokens; drop leftover contraction
    # fragments (tokens starting with an apostrophe) and empty tokens.
    sentences_str = [
        [w for w in split_contractions(web_tokenizer(s))
         if not (w.startswith("'") and len(w) > 1) and len(w) > 0]
        for s in list(split_multi(text)) if len(s.strip()) > 0
    ]
    self.number_of_sentences += len(sentences_str)
    self.number_of_documents += 1
    pos_text = 0
    document_candidates = {}
    term_in_doc = {}
    sentences_obj = []
    block_of_word_obj = []
    sentence_obj_aux = []
    for (sentence_id, sentence) in enumerate(sentences_str):
        sentence_obj_aux = []
        block_of_word_obj = []
        for (pos_sent, word) in enumerate(sentence):
            if len([c for c in word if c in self.exclude]) == len(word):
                # The word consists only of excluded chars: close the current block of words.
                if len(block_of_word_obj) > 0:
                    sentence_obj_aux.append(block_of_word_obj)
                    cand = ComposedWord(block_of_word_obj)
                    cand = self.add_or_update_composed_word(cand)
                    if cand.unique_kw not in document_candidates:
                        document_candidates[cand.unique_kw] = cand
                    block_of_word_obj = []
            else:
                tag = self.get_tag(word, pos_sent)
                term_obj = self.get_term(word)
                term_in_doc[term_obj.unique_term] = term_obj
                term_obj.add_occurrence(tag, sentence_id, pos_sent, pos_text,
                                        self.number_of_documents)
                pos_text += 1
                # Update the co-occurrence matrix within the sliding window.
                if tag not in self.tagsToDiscard:
                    word_windows = list(
                        range(max(0, len(block_of_word_obj) - self.windowsSize),
                              len(block_of_word_obj)))
                    for w in word_windows:
                        if block_of_word_obj[w][0] not in self.tagsToDiscard:
                            self.add_cooccurrence(block_of_word_obj[w][2], term_obj)
                # Add the term to the block of words' buffer.
                block_of_word_obj.append((tag, word, term_obj))
        if len(block_of_word_obj) > 0:
            sentence_obj_aux.append(block_of_word_obj)
        if len(sentence_obj_aux) > 0:
            sentences_obj.append(sentence_obj_aux)
    if len(block_of_word_obj) > 0:
        sentence_obj_aux.append(block_of_word_obj)
    if len(sentence_obj_aux) > 0:
        sentences_obj.append(sentence_obj_aux)
    self.number_of_words += pos_text
    return document_candidates, term_in_doc
def __build_graph__(self):
    stopwords = get_stopwords(self.lan)
    stem = get_stem(self.lan).stem
    self.G = nx.Graph()
    # Split into sentences, then into word tokens; drop leftover contraction
    # fragments and empty tokens.
    sentences_str = [
        [w for w in split_contractions(web_tokenizer(s))
         if not (w.startswith("'") and len(w) > 1) and len(w) > 0]
        for s in list(split_multi(self.text)) if len(s.strip()) > 0
    ]
    for sentence in sentences_str:
        buffer = []
        for word in sentence:
            # Skip punctuation-only tokens, stopwords and numbers.
            if (len([c for c in word if c in EXCLUDE]) == len(word)
                    or word.lower() in stopwords
                    or word.replace('.', '').replace(',', '').replace('-', '').isnumeric()):
                continue
            else:
                #stemmed_word = lemma(word).lower()
                stemmed_word = stem(word)
                if stemmed_word not in self.G:
                    self.G.add_node(stemmed_word, TF=0)
                self.G.nodes[stemmed_word]['TF'] += 1  # Graph.node was removed in networkx >= 2.4; use Graph.nodes
                for (idx_cooccur, word_cooccur) in enumerate(buffer[-self.w:]):
                    self.__add_cooccur__(word_cooccur, stemmed_word, idx_cooccur + 1)
                buffer.append(stemmed_word)
    self.__build_linegraph__()
def tokenize(sentence):
    words = web_tokenizer(sentence)
    #words = sentence.split(" ")
    out = []
    for word in words:
        word = word.lower()
        if word != "_":
            out.append(word)
    return out
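# Hedged usage sketch for tokenize() (not part of the original code); it
# assumes web_tokenizer is imported from segtok.tokenizer, as the snippets
# above imply:
#
#   from segtok.tokenizer import web_tokenizer
#   tokens = tokenize("YAKE! extracts keywords from a single document.")
#   # tokens is a list of lowercased word tokens with "_" placeholders removed.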
def _build(self, text, windowsSize, n):
    text = self.pre_filter(text)
    # Split into sentences, then into word tokens; drop leftover contraction
    # fragments and empty tokens.
    self.sentences_str = [
        [w for w in split_contractions(web_tokenizer(s))
         if not (w.startswith("'") and len(w) > 1) and len(w) > 0]
        for s in list(split_multi(text)) if len(s.strip()) > 0
    ]
    self.number_of_sentences = len(self.sentences_str)
    pos_text = 0
    block_of_word_obj = []
    sentence_obj_aux = []
    for (sentence_id, sentence) in enumerate(self.sentences_str):
        sentence_obj_aux = []
        block_of_word_obj = []
        for (pos_sent, word) in enumerate(sentence):
            if len([c for c in word if c in self.exclude]) == len(word):
                # The word consists only of excluded chars: close the current block of words.
                if len(block_of_word_obj) > 0:
                    sentence_obj_aux.append(block_of_word_obj)
                    block_of_word_obj = []
            else:
                tag = self.getTag(word, pos_sent)
                term_obj = self.getTerm(word)
                term_obj.addOccur(tag, sentence_id, pos_sent, pos_text)
                pos_text += 1
                # Update the co-occurrence matrix within the sliding window.
                if tag not in self.tagsToDiscard:
                    word_windows = list(range(max(0, len(block_of_word_obj) - windowsSize),
                                              len(block_of_word_obj)))
                    for w in word_windows:
                        if block_of_word_obj[w][0] not in self.tagsToDiscard:
                            self.addCooccur(block_of_word_obj[w][2], term_obj)
                # Generate the candidate keyphrases ending at this term,
                # from unigrams up to n-grams of size n.
                candidate = [(tag, word, term_obj)]
                cand = composed_word(candidate)
                self.addOrUpdateComposedWord(cand)
                word_windows = list(range(max(0, len(block_of_word_obj) - (n - 1)),
                                          len(block_of_word_obj)))[::-1]
                for w in word_windows:
                    candidate.append(block_of_word_obj[w])
                    self.freq_ns[len(candidate)] += 1.
                    cand = composed_word(candidate[::-1])
                    self.addOrUpdateComposedWord(cand)
                # Add the term to the block of words' buffer.
                block_of_word_obj.append((tag, word, term_obj))
        if len(block_of_word_obj) > 0:
            sentence_obj_aux.append(block_of_word_obj)
        if len(sentence_obj_aux) > 0:
            self.sentences_obj.append(sentence_obj_aux)
    if len(block_of_word_obj) > 0:
        sentence_obj_aux.append(block_of_word_obj)
    if len(sentence_obj_aux) > 0:
        self.sentences_obj.append(sentence_obj_aux)
    self.number_of_words = pos_text
def build_candidate(self, candidate_string):
    sentences_str = [w for w in split_contractions(web_tokenizer(candidate_string.lower()))
                     if not (w.startswith("'") and len(w) > 1) and len(w) > 0]
    candidate_terms = []
    for (i, word) in enumerate(sentences_str):
        tag = self.getTag(word, i)
        term_obj = self.getTerm(word, save_non_seen=False)
        if term_obj.tf == 0:
            term_obj = None
        candidate_terms.append((tag, word, term_obj))
    if len([cand for cand in candidate_terms if cand[2] is not None]) == 0:
        # None of the candidate's words were seen in the document.
        invalid_virtual_cand = composed_word(None)
        return invalid_virtual_cand
    virtual_cand = composed_word(candidate_terms)
    return virtual_cand
def __nltk_stem__(self, word):
    return ' '.join([self.stem.stem(w) for w in split_contractions(web_tokenizer(word))])
def __polish_stem__(self, word):
    return ' '.join(
        self.stem.stemmer_convert([w for w in split_contractions(web_tokenizer(word))]))
def __simple_filter__(self, word):
    term = word.lower()
    for p in punctuation:
        term = term.replace(p, ' ')
    term = ' '.join([w for w in split_contractions(web_tokenizer(term))])
    return term.strip()
def split_sentence(sentence_str):
    try:
        return tokenizer.web_tokenizer(sentence_str)
    except Exception:
        # Fall back to plain whitespace splitting if the tokenizer fails.
        return sentence_str.split()
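# Hedged usage sketch for split_sentence() (not part of the original code); it
# assumes `tokenizer` refers to the segtok.tokenizer module imported under
# that name:
#
#   from segtok import tokenizer
#   split_sentence("Tokenize this sentence, please.")
#   # Returns segtok's web tokenization; if web_tokenizer raises, the function
#   # falls back to str.split() on whitespace.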