Example #1
from typing import List, Tuple
from segtok.tokenizer import web_tokenizer


def dataset_to_input_instances(dataset: List[Tuple[str, str, str]]) -> List[InputInstance]:
    # InputInstance comes from the surrounding project; each raw sentence in a
    # (sentence1, sentence2, label) triple is tokenized with segtok's web_tokenizer.
    input_instances = []
    for idx, (sent1, sent2, _) in enumerate(dataset):
        instance = InputInstance(id_=idx, sent1=web_tokenizer(sent1), sent2=web_tokenizer(sent2))
        input_instances.append(instance)

    return input_instances
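
A quick way to exercise the helper. The InputInstance below is a hypothetical stand-in (the real class lives in the surrounding project) and has to be declared before the function above so its return annotation resolves:

from dataclasses import dataclass
from typing import List

@dataclass
class InputInstance:  # hypothetical stand-in, for illustration only
    id_: int
    sent1: List[str]
    sent2: List[str]

pairs = [("The cat sat on the mat.", "A cat was sitting on a mat.", "entailment")]
print(dataset_to_input_instances(pairs))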
Example #2
    def add_document(self, text):
        text = self.pre_filter(text)
        sentences_str = [[
            w for w in split_contractions(web_tokenizer(s))
            if not (w.startswith("'") and len(w) > 1) and len(w) > 0
        ] for s in list(split_multi(text)) if len(s.strip()) > 0]
        self.number_of_sentences += len(sentences_str)
        self.number_of_documents += 1
        pos_text = 0
        document_candidates = {}
        term_in_doc = {}
        sentences_obj = []
        block_of_word_obj = []
        sentence_obj_aux = []
        for (sentence_id, sentence) in enumerate(sentences_str):
            sentence_obj_aux = []
            block_of_word_obj = []
            for (pos_sent, word) in enumerate(sentence):
                # Word consists only of excluded characters
                if len([c for c in word if c in self.exclude]) == len(word):
                    if len(block_of_word_obj) > 0:
                        sentence_obj_aux.append(block_of_word_obj)
                        cand = ComposedWord(block_of_word_obj)
                        cand = self.add_or_update_composed_word(cand)
                        if cand.unique_kw not in document_candidates:
                            document_candidates[cand.unique_kw] = cand
                        block_of_word_obj = []
                else:
                    tag = self.get_tag(word, pos_sent)
                    term_obj = self.get_term(word)
                    term_in_doc[term_obj.unique_term] = term_obj
                    term_obj.add_occurrence(tag, sentence_id, pos_sent,
                                            pos_text, self.number_of_documents)
                    pos_text += 1
                    # Create co-occurrence matrix
                    if tag not in self.tagsToDiscard:
                        word_windows = list(range(max(0, len(block_of_word_obj) - self.windowsSize),
                                                  len(block_of_word_obj)))
                        for w in word_windows:
                            if block_of_word_obj[w][0] not in self.tagsToDiscard:
                                self.add_cooccurrence(block_of_word_obj[w][2], term_obj)

                    # Add term to the block of words' buffer
                    block_of_word_obj.append((tag, word, term_obj))
            if len(block_of_word_obj) > 0:
                sentence_obj_aux.append(block_of_word_obj)
            if len(sentence_obj_aux) > 0:
                sentences_obj.append(sentence_obj_aux)
        if len(block_of_word_obj) > 0:
            sentence_obj_aux.append(block_of_word_obj)
        if len(sentence_obj_aux) > 0:
            sentences_obj.append(sentence_obj_aux)
        self.number_of_words += pos_text
        return document_candidates, term_in_doc
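
The sentence and token preprocessing at the top of add_document is plain segtok. A minimal standalone sketch of that shared pipeline (sentence splitting, web-aware tokenization, contraction splitting, dropping apostrophe-only fragments), assuming only that segtok is installed:

from segtok.segmenter import split_multi
from segtok.tokenizer import split_contractions, web_tokenizer

def sentences_to_tokens(text):
    # One token list per non-empty sentence, mirroring the filter used above
    return [
        [w for w in split_contractions(web_tokenizer(s))
         if not (w.startswith("'") and len(w) > 1) and len(w) > 0]
        for s in split_multi(text) if len(s.strip()) > 0
    ]

print(sentences_to_tokens("Keyword extraction doesn't need training data. It works per document."))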
Example #3
 def __build_graph__(self):
     stopwords = get_stopwords(self.lan)
     stem = get_stem(self.lan).stem
     self.G = nx.Graph()
     sentences_str = [[
         w for w in split_contractions(web_tokenizer(s))
         if not (w.startswith("'") and len(w) > 1) and len(w) > 0
     ] for s in list(split_multi(self.text)) if len(s.strip()) > 0]
     for sentence in sentences_str:
         buffer = []
         for word in sentence:
             # Skip punctuation-only tokens, stopwords, and numeric tokens
             if (len([c for c in word if c in EXCLUDE]) == len(word)
                     or word.lower() in stopwords
                     or word.replace('.', '').replace(',', '').replace('-', '').isnumeric()):
                 continue
             else:
                 #stemmed_word = lemma(word).lower()
                 stemmed_word = stem(word)
                 if stemmed_word not in self.G:
                     self.G.add_node(stemmed_word, TF=0)
                 self.G.nodes[stemmed_word]['TF'] += 1  # G.node was removed in modern networkx
                 # Add co-occurrence edges to the last self.w retained words
                 for (idx_cooccur, word_cooccur) in enumerate(buffer[-self.w:]):
                     self.__add_cooccur__(word_cooccur, stemmed_word, idx_cooccur + 1)
                 buffer.append(stemmed_word)
     self.__build_linegraph__()
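
For readers new to the windowed co-occurrence idea used above, here is a simplified, self-contained sketch with networkx; the unit edge weights are an assumption and do not reproduce __add_cooccur__ or __build_linegraph__ from the class itself:

import networkx as nx

def cooccurrence_graph(tokens, window=3):
    G = nx.Graph()
    buffer = []
    for word in tokens:
        if word not in G:
            G.add_node(word, TF=0)
        G.nodes[word]['TF'] += 1        # term frequency stored on the node
        for prev in buffer[-window:]:   # link to the last `window` kept words
            w = G[prev][word]['weight'] + 1 if G.has_edge(prev, word) else 1
            G.add_edge(prev, word, weight=w)
        buffer.append(word)
    return G

G = cooccurrence_graph(["graph", "based", "keyword", "extraction", "on", "a", "word", "graph"])
print(G.nodes(data='TF'), list(G.edges(data='weight')))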
Example #4
from segtok.tokenizer import web_tokenizer


def tokenize(sentence):
    # Tokenize with segtok's web-aware tokenizer, lowercase tokens, drop bare "_"
    words = web_tokenizer(sentence)
    #words = sentence.split(" ")
    out = []
    for word in words:
        word = word.lower()
        if word != "_":
            out.append(word)
    return out
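
A possible call, assuming web_tokenizer is segtok's (imported above):

print(tokenize("Don't over-split URLs such as https://example.com, please."))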
Example #5
    def _build(self, text, windowsSize, n):
        text = self.pre_filter(text)
        self.sentences_str = [
            [w for w in split_contractions(web_tokenizer(s))
             if not (w.startswith("'") and len(w) > 1) and len(w) > 0]
            for s in list(split_multi(text)) if len(s.strip()) > 0
        ]
        self.number_of_sentences = len(self.sentences_str)
        pos_text = 0
        block_of_word_obj = []
        sentence_obj_aux = []
        for (sentence_id, sentence) in enumerate(self.sentences_str):
            sentence_obj_aux = []
            block_of_word_obj = []
            for (pos_sent, word) in enumerate(sentence):
                if len([c for c in word if c in self.exclude]) == len(word): # If the word is based on exclude chars
                    if len(block_of_word_obj) > 0:
                        sentence_obj_aux.append( block_of_word_obj )
                        block_of_word_obj = []
                else:
                    tag = self.getTag(word, pos_sent)
                    term_obj = self.getTerm(word)
                    term_obj.addOccur(tag, sentence_id, pos_sent, pos_text)
                    pos_text += 1

                    #Create co-occurrence matrix
                    if tag not in self.tagsToDiscard:
                        word_windows = list(range( max(0, len(block_of_word_obj)-windowsSize), len(block_of_word_obj) ))
                        for w in word_windows:
                            if block_of_word_obj[w][0] not in self.tagsToDiscard: 
                                self.addCooccur(block_of_word_obj[w][2], term_obj)
                    #Generate candidate keyphrase list
                    candidate = [ (tag, word, term_obj) ]
                    cand = composed_word(candidate)
                    self.addOrUpdateComposedWord(cand)
                    word_windows = list(range( max(0, len(block_of_word_obj)-(n-1)), len(block_of_word_obj) ))[::-1]
                    for w in word_windows:
                        candidate.append(block_of_word_obj[w])
                        self.freq_ns[len(candidate)] += 1.
                        cand = composed_word(candidate[::-1])
                        self.addOrUpdateComposedWord(cand)

                    # Add term to the block of words' buffer
                    block_of_word_obj.append( (tag, word, term_obj) )

            if len(block_of_word_obj) > 0:
                sentence_obj_aux.append( block_of_word_obj )

            if len(sentence_obj_aux) > 0:
                self.sentences_obj.append(sentence_obj_aux)

        if len(block_of_word_obj) > 0:
            sentence_obj_aux.append( block_of_word_obj )

        if len(sentence_obj_aux) > 0:
            self.sentences_obj.append(sentence_obj_aux)

        self.number_of_words = pos_text
Example #6
 def build_candidate(self, candidate_string):
     sentences_str = [w for w in split_contractions(web_tokenizer(candidate_string.lower()))
                      if not (w.startswith("'") and len(w) > 1) and len(w) > 0]
     candidate_terms = []
     for (i, word) in enumerate(sentences_str):
         tag = self.getTag(word, i)
         term_obj = self.getTerm(word, save_non_seen=False)
         if term_obj.tf == 0:
             term_obj = None
         candidate_terms.append( (tag, word, term_obj) )
     if len([cand for cand in candidate_terms if cand[2] is not None]) == 0:
         invalid_virtual_cand = composed_word(None)
         return invalid_virtual_cand
     virtual_cand = composed_word(candidate_terms)
     return virtual_cand
Example #7
 def __nltk_stem__(self, word):
     return ' '.join([
         self.stem.stem(w) for w in split_contractions(web_tokenizer(word))
     ])
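
A standalone version of the same idea, assuming NLTK's PorterStemmer stands in for self.stem (the surrounding class presumably selects a language-specific stemmer):

from nltk.stem import PorterStemmer
from segtok.tokenizer import split_contractions, web_tokenizer

stemmer = PorterStemmer()

def nltk_stem(word):
    return ' '.join(stemmer.stem(w) for w in split_contractions(web_tokenizer(word)))

print(nltk_stem("machine learning pipelines"))  # e.g. 'machin learn pipelin'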
Example #8
 def __polish_stem__(self, word):
     return ' '.join(
         self.stem.stemmer_convert(
             [w for w in split_contractions(web_tokenizer(word))]))
Example #9
 def __simple_filter__(self, word):
     term = word.lower()
     for p in punctuation:
         term = term.replace(p, ' ')
     term = ' '.join([w for w in split_contractions(web_tokenizer(term))])
     return term.strip()
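
The punctuation constant here presumably comes from string.punctuation; a standalone equivalent of the filter, under that assumption:

from string import punctuation
from segtok.tokenizer import split_contractions, web_tokenizer

def simple_filter(word):
    term = word.lower()
    for p in punctuation:
        term = term.replace(p, ' ')  # blank out punctuation characters
    return ' '.join(split_contractions(web_tokenizer(term))).strip()

print(simple_filter("State-of-the-Art!"))  # 'state of the art'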
Example #10
from segtok import tokenizer  # assumption: `tokenizer` is segtok's tokenizer module


def split_sentence(sentence_str):
    try:
        return tokenizer.web_tokenizer(sentence_str)
    except Exception:
        # Fall back to plain whitespace splitting if tokenization fails
        return sentence_str.split()