def common_ngram_txt(self, tokens1, tokens2, size=15):
    print('Checking ngram length {}'.format(size))
    ng1 = set(nltk_ngrams(tokens1, size))
    ng2 = set(nltk_ngrams(tokens2, size))

    match = set.intersection(ng1, ng2)
    print('..found {}'.format(len(match)))

    return match
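For reference, a standalone sketch of the same overlap check; it assumes the nltk_ngrams alias refers to nltk.util.ngrams and that both inputs are already-tokenized lists.

from nltk.util import ngrams as nltk_ngrams

doc1 = "the quick brown fox jumps over the lazy dog".split()
doc2 = "yesterday the quick brown fox jumps over a fence".split()

# same idea as common_ngram_txt: build the set of n-grams per document and intersect
size = 5
shared = set(nltk_ngrams(doc1, size)) & set(nltk_ngrams(doc2, size))
print(len(shared))   # 2 shared 5-grams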
Example #2
def extract_ngram_from_text(text,
                            n,
                            remove_stopwords=True,
                            remove_punc=True,
                            mode='spacy'):
    """
    Function that retrieves all n-grams from the input string
    :param text: raw string
    :param n: integer that tells the model to retrieve all k-gram where k<=n
    :param remove_stopwords: whether or not to remove stopwords from lib
    :param remove_punc: whether or not to remove punctuation from lib
    :param mode: {'spacy', 'naive'}
    :return ngram_counter: a counter that maps n-gram to its frequency
    :return tokens: a list of parsed ngrams
    """
    tokens = tokenize(text,
                      remove_stopwords=remove_stopwords,
                      remove_punc=remove_punc,
                      mode=mode)
    all_ngrams = []
    for i in range(1, n + 1):
        cur_ngrams = nltk_ngrams(tokens, i)
        all_ngrams += cur_ngrams
    ngram_counter = Counter(all_ngrams)
    return ngram_counter, all_ngrams
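A minimal sketch of the same counting loop, with a plain whitespace split standing in for the module's tokenize helper (which is not shown above):

from collections import Counter
from nltk.util import ngrams as nltk_ngrams

text = "to be or not to be"
tokens = text.split()                 # stand-in for tokenize(...)

all_ngrams = []
for i in range(1, 3):                 # n = 2: collect unigrams and bigrams
    all_ngrams += nltk_ngrams(tokens, i)

ngram_counter = Counter(all_ngrams)
print(ngram_counter[('to', 'be')])    # 2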
Example #3
    def ngramize(self, o_tweet):
        ngram = []
        for i in self.ngram_combo:
            ngrams = nltk_ngrams(o_tweet.split(), i)
            for grams in ngrams:
                ngram.append(" ".join(grams))
        return ngram
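Standalone sketch of the same idea; ngram_combo is not shown above, so it is assumed here to be a sequence of n-gram sizes:

from nltk.util import ngrams as nltk_ngrams

def ngramize(tweet, ngram_combo=(1, 2)):
    out = []
    for n in ngram_combo:
        for grams in nltk_ngrams(tweet.split(), n):
            out.append(" ".join(grams))
    return out

print(ngramize("good morning world"))
# ['good', 'morning', 'world', 'good morning', 'morning world']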
Example #4
    def append_data(self, unigrams):
        bigrams = [bigram[0].lower()+" "+bigram[1].lower() for bigram in nltk_ngrams(unigrams, 2)
                   if len(bigram[0]) > 1 and len(bigram[1]) > 1]

        self.ngrams += Counter(bigrams)
        self.prune(0.35)

        self.save()
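The bigram-string construction on its own (prune and save are class-specific and omitted here):

from collections import Counter
from nltk.util import ngrams as nltk_ngrams

unigrams = ["New", "York", "is", "in", "New", "York"]
bigrams = [a.lower() + " " + b.lower()
           for a, b in nltk_ngrams(unigrams, 2)
           if len(a) > 1 and len(b) > 1]

print(Counter(bigrams)["new york"])   # 2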
Example #5
def get_ngrams_around_anchors(n, words, anchors):
    """ Generate ngrams only around certain words (anchors). """
    all_ngrams = []
    for anchor in anchors:
        for i in anchor.get_all_occurrences():
            start_index = max(0, i - n + 1)
            piece = words[start_index:min(i + n, len(words))]
            ngrams = enumerate(nltk_ngrams(piece, n), start=start_index)
            all_ngrams.extend(ngrams)

    return all_ngrams
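A usage sketch with a stand-in anchor object; the real anchor class is not shown above, only its get_all_occurrences() interface is assumed:

from nltk.util import ngrams as nltk_ngrams

class Anchor:
    """Hypothetical stand-in exposing the interface used above."""
    def __init__(self, positions):
        self.positions = positions

    def get_all_occurrences(self):
        return self.positions

words = "the cat sat on the mat near the door".split()
anchors = [Anchor([4])]            # anchor word at index 4 ("the")

for index, gram in get_ngrams_around_anchors(2, words, anchors):
    print(index, gram)
# 3 ('on', 'the')
# 4 ('the', 'mat')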
Example #8
    def transform(self, documents):
        repeated = []
        mirror = []
        repeated_phrase = []
        for doc in documents:
            repeated_count = 0
            mirror_count = 0
            repeated_phrase_count = 0
            stress_phrase_sents = doc.stress_markers
            for i in range(len(stress_phrase_sents)):
                if (i == 1 and stress_phrase_sents[i] == stress_phrase_sents[i - 1]
                        and len(stress_phrase_sents[i]) == 1):
                    repeated_count += 1
                else:
                    phrases_in_sent = stress_phrase_sents[i]
                    phrase_combinations = (list(nltk_ngrams(phrases_in_sent, 2))
                                           + list(nltk_ngrams(phrases_in_sent, 3)))
                    for pc in phrase_combinations:
                        # join adjacent phrase markers and test for a palindrome ("mirror")
                        joined = "".join(pc)
                        if joined == joined[::-1]:
                            mirror_count += 1
                        # "repeated phrase": first half equals second half
                        length = len(joined)
                        first_half = joined[:length // 2]
                        if length % 2 == 0:
                            second_half = joined[length // 2:]
                        else:
                            second_half = joined[length // 2 + 1:]
                        if first_half == second_half:
                            repeated_phrase_count += 1
            repeated.append(repeated_count)
            mirror.append(mirror_count)
            repeated_phrase.append(repeated_phrase_count)
        X = np.array([repeated, mirror, repeated_phrase]).T
        return X
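The two string tests in the inner loop, shown in isolation on toy phrase markers:

pc = ("ab", "ba")
joined = "".join(pc)                   # "abba"
print(joined == joined[::-1])          # True  -> counted as a "mirror"

pc = ("ab", "ab")
joined = "".join(pc)                   # "abab"
half = len(joined) // 2
print(joined[:half] == joined[half:])  # True  -> counted as a "repeated phrase"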
Example #9
    def ngrams(self, n, min_repetitions=2):
        """Returns a dictionary of ngrams repeated at least min_repetitions times

        Args:
            n (int): n in ngram
            min_repetitions (int): minimum number of occurrences an ngram needs to be kept

        Returns:
            dict: dictionary mapping each ngram to its count
        """

        # count all ngrams over the filtered text
        ngram_dict = dict(Counter(nltk_ngrams(self.filtered_text.split(), n)))

        # remove ngrams with fewer than min_repetitions occurrences
        ngram_dict = {
            key: value
            for key, value in ngram_dict.items() if value >= min_repetitions
        }

        return ngram_dict
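The same filtering expressed standalone, with a literal string standing in for self.filtered_text:

from collections import Counter
from nltk.util import ngrams as nltk_ngrams

filtered_text = "tea for two and two for tea for two"
counts = Counter(nltk_ngrams(filtered_text.split(), 2))
repeated = {k: v for k, v in counts.items() if v >= 2}
print(repeated)   # {('tea', 'for'): 2, ('for', 'two'): 2}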
Example #10
    def semanticize(self, sentence, normalize_dash=True,
                    normalize_accents=True, normalize_lower=False,
                    translations=True, counts=False,
                    sense_probability_threshold=None):
        if sense_probability_threshold is None:
            sense_probability_threshold = self.sense_probability_threshold
        result = {"links": []}
        ngrams = set()
        token_lists = [tokenize(sentence),
                       tokenize(sentence.replace('-', ' ')),
                       tokenize(sentence.replace('.', ' ')),
                       tokenize(sentence.replace('.', ''))]

        # get all ngrams for this sentence, limit to max_ngram_length
        # if applicable
        for token_list in token_lists:
            max_len = len(token_list) + 1
            if self.max_ngram_length is not None:
                max_len = min(max_len, self.max_ngram_length)

            for n in range(1, max_len):
                for ngram in nltk_ngrams(token_list, n):
                    ngrams.add(' '.join(ngram))

        normal_ngrams = map(wpmutil.normalize, ngrams)
        exist = self.wpm.normalized_entities_exist(normal_ngrams)

        for i, (ngram, normal_ngram) in enumerate(zip(ngrams, normal_ngrams)):
            if exist[i]:
                normalized_ngram = wpmutil.normalize(ngram, normalize_dash,
                                                     normalize_accents,
                                                     normalize_lower)
                anchors = self.wpm.get_all_entities(normal_ngram)
                for anchor in anchors:
                    normalized_anchor = wpmutil.normalize(anchor, normalize_dash,
                                                          normalize_accents,
                                                          normalize_lower)
                    if normalized_ngram == normalized_anchor:
                        if self.debug and not self.wpm.entity_exists(anchor):
                            raise LookupError("Data corrupted, cannot "
                                              + "find %s in the database" \
                                              % anchor)
                        entity = self.wpm.get_entity_data(anchor)
                        for sense in entity['senses']:
                            sense_str = str(sense)
                            sense_data = self.wpm.get_sense_data(anchor,
                                                                 sense_str)
                            if sense_data:
                                if entity['cnttextocc'] == 0:
                                    link_probability = 0
                                    sense_probability = 0
                                else:
                                    link_probability = float(entity['cntlinkdoc']) / entity['cnttextdoc']
                                    sense_probability = float(sense_data['cntlinkdoc']) / entity['cnttextdoc']
                                if sense_probability > sense_probability_threshold:
                                    title = unicode(self.wpm.get_item_title(sense_str))
                                    url = self.wikipedia_url_template \
                                          % (self.language_code,
                                             urllib.quote(title.encode('utf-8')))
                                    if entity['cntlinkocc'] == 0:
                                        prior_probability = 0
                                    else:
                                        prior_probability = float(sense_data['cntlinkocc']) / entity['cntlinkocc']
                                    link = {
                                        "label": anchor,
                                        "text": ngram,
                                        "title": title,
                                        "id": sense,
                                        "url": url,
                                        "linkProbability": link_probability,
                                        "senseProbability": sense_probability,
                                        "priorProbability": prior_probability
                                    }
                                    if translations:
                                        link["translations"] = {self.language_code:
                                                                {"title": title,
                                                                 "url": url}}
                                        if self.wpm.sense_has_trnsl(sense_str):
                                            for lang in self.wpm.get_trnsl_langs(sense_str):
                                                trnsl = self.wpm.get_sense_trnsl(sense_str, lang)
                                                link["translations"][lang] = {
                                                    'title': unicode(trnsl),
                                                    'url': self.wikipedia_url_template % (lang, urllib.quote(unicode(trnsl).encode('utf-8')))
                                                }
                                    if counts:
                                        link["occCount"] = entity['cnttextocc']
                                        link["docCount"] = entity['cnttextdoc']
                                        link["linkOccCount"] = entity['cntlinkocc']
                                        link["linkDocCount"] = entity['cntlinkdoc']
                                        link["senseOccCount"] = int(sense_data['cntlinkocc'])
                                        link["senseDocCount"] = int(sense_data['cntlinkdoc'])
                                        link['fromTitle'] = sense_data['from_title']
                                        link['fromRedirect'] = sense_data['from_redir']
                                    result["links"].append(link)

        return result
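The snippet above is Python 2 code (unicode, urllib.quote). Below is a Python 3 sketch of just the candidate n-gram step, with a plain whitespace split standing in for tokenize; max_ngram_length mirrors the attribute used above.

from nltk.util import ngrams as nltk_ngrams

def candidate_ngrams(sentence, max_ngram_length=None, tokenize=str.split):
    token_lists = [tokenize(sentence),
                   tokenize(sentence.replace('-', ' ')),
                   tokenize(sentence.replace('.', ' ')),
                   tokenize(sentence.replace('.', ''))]
    out = set()
    for tokens in token_lists:
        max_len = len(tokens) + 1
        if max_ngram_length is not None:
            max_len = min(max_len, max_ngram_length)
        for n in range(1, max_len):
            for gram in nltk_ngrams(tokens, n):
                out.add(' '.join(gram))
    return out

print(sorted(candidate_ngrams("New York-based", max_ngram_length=3)))
# ['New', 'New York', 'New York-based', 'York', 'York based', 'York-based', 'based']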
Example #12
def get_all_ngrams(n, words):
    """ Generate all possible engrams from a text and enumerate them. """
    return enumerate(nltk_ngrams(words, n))  # not necessarily a list
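Usage sketch: the return value is a lazy enumerate object, so it must be materialized (or iterated) by the caller.

from nltk.util import ngrams as nltk_ngrams   # same alias as assumed above

words = "a b c d".split()
print(list(get_all_ngrams(2, words)))
# [(0, ('a', 'b')), (1, ('b', 'c')), (2, ('c', 'd'))]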
Example #13
def get_ngrams(unigrams, orders=[1, 2]):
    all_ngrams = itertools.chain(*map(lambda n: list(nltk_ngrams(unigrams, n)), orders))
    return set(all_ngrams)
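Usage sketch (assumes itertools and the nltk_ngrams alias are imported as in the snippet above):

print(sorted(get_ngrams("a b a b".split(), orders=[1, 2])))
# [('a',), ('a', 'b'), ('b',), ('b', 'a')]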
Example #15
else:
    print("Loading effect list for words")
    f = open("data/effect_list.pkl", "rb")
    effect_list = pickle.load(f)
    f.close()

if not path.exists("data/{}grams.pkl".format(NGRAM)) or args.rebuild:
    print("Building {}-grams".format(NGRAM))

    ngrams = []
    for s in sentences:
        g = list(
            nltk_ngrams(s.split(),
                        NGRAM,
                        pad_left=True,
                        left_pad_symbol="<s>",
                        pad_right=True,
                        right_pad_symbol="</s>"))

        ngrams.extend(g)

    # TODO: Filter out the least frequent words
    ngrams_count = dict(Counter(ngrams).viewitems())
    total_count = sum(ngrams_count.values())
    ngrams = dict([(g, float(ngrams_count[g]) / total_count)
                   for g in ngrams_count.keys()])

    f = open("./data/{}grams.pkl".format(NGRAM), "wb")
    pickle.dump(ngrams, f)
    f.close()
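A Python 3 sketch of the padding and relative-frequency step above (the original uses Python 2's dict.viewitems); sentences and NGRAM here are toy stand-ins for the script's own variables:

from collections import Counter
from nltk.util import ngrams as nltk_ngrams

NGRAM = 2
sentences = ["the cat sat", "the cat ran"]

all_grams = []
for s in sentences:
    all_grams.extend(nltk_ngrams(s.split(), NGRAM,
                                 pad_left=True, left_pad_symbol="<s>",
                                 pad_right=True, right_pad_symbol="</s>"))

counts = Counter(all_grams)
total = sum(counts.values())
probs = {g: count / total for g, count in counts.items()}
print(probs[("<s>", "the")])   # 2 of 8 padded bigrams -> 0.25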