def create_tri_model(span_eng_dict):
    trigram_span_dict = collections.defaultdict(lambda: 0)
    trigram_eng_dict = collections.defaultdict(lambda: 0)
    trigram_span_eng_dict = collections.defaultdict(tuple)
    text = codecs.open('SpanishText.txt', encoding='utf-8')
    for sentence in text.readlines():
        line = [re.sub('[.?!",]', '', word) for word in sentence.split()]
        for word1, word2, word3 in trigrams(line):
            trigram_span_dict[(word1.lower(), word2.lower(),
                               word3.lower())] += 1
        for word1, word2, word3 in trigrams(line):
            #print(word1, span_eng_dict[word1])
            trigram_span_eng_dict[(
                word1.lower(), word2.lower(),
                word3.lower())] = (span_eng_dict[word1.lower()],
                                   span_eng_dict[word2.lower()],
                                   span_eng_dict[word3.lower()])
    eng_text = open('DMT_output.txt')
    for sentence in eng_text.readlines():
        line = [re.sub('[.?!",]', '', word) for word in sentence.split()]
        for word1, word2, word3 in trigrams(line):
            trigram_eng_dict[(word1.lower(), word2.lower(),
                              word3.lower())] += 1
    text = ''
    for k, v in trigram_span_eng_dict.items():
        try:
            if (trigram_span_dict.get(k) == trigram_eng_dict.get(v)) and (
                    trigram_span_dict.get(k) >= 1):
                #print(k, v)
                text += k[0] + k[1]
        except TypeError:
            # .get() returns None for trigrams unseen in the other corpus
            pass
    return
Example No. 2
def score_by_topic(pkg, scores):
    '''Examines the pkg and adds scores according to topics in it.'''
    themes = Themes.instance()
    for level in range(3):
        pkg_text = package_text(pkg, level)
        words, words_without_stopwords = normalize_text(pkg_text)
        for num_words in (1, 2, 3):
            if num_words == 1:
                ngrams = words_without_stopwords
                topic_ngrams = themes.topic_words
                topic_ngrams_set = themes.topic_words_set
            elif num_words == 2:
                ngrams = list(bigrams(words))  # materialise; NLTK 3 bigrams() returns a generator
                topic_ngrams = themes.topic_bigrams
                topic_ngrams_set = themes.topic_bigrams_set
            elif num_words == 3:
                ngrams = list(trigrams(words))  # materialise; NLTK 3 trigrams() returns a generator
                topic_ngrams = themes.topic_trigrams
                topic_ngrams_set = themes.topic_trigrams_set
            matching_ngrams = set(ngrams) & topic_ngrams_set
            if matching_ngrams:
                for ngram in matching_ngrams:
                    occurrences = ngrams.count(ngram)
                    score = (3-level) * occurrences * num_words
                    theme = topic_ngrams[ngram]
                    ngram_printable = ' '.join(ngram) if isinstance(ngram, tuple) else ngram
                    reason = '"%s" matched %s' % (ngram_printable, LEVELS[level])
                    if occurrences > 1:
                        reason += ' (%s times)' % occurrences
                    scores[theme].append((score, reason))
                    log.debug(' %s %s %s', theme, score, reason)
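A quick arithmetic sketch of the score formula above (illustrative numbers, not taken from any real package):

# score = (3 - level) * occurrences * num_words, so a topic trigram (num_words=3)
# matched twice (occurrences=2) in the level-0 text contributes (3 - 0) * 2 * 3 = 18,
# while a single unigram match (num_words=1) at level 2 contributes (3 - 2) * 1 * 1 = 1.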
Example No. 3
    def act(self):
        """
        Add words in the last observation to the dictionary.

        This checks any fields in the message present in the --dict-textfields
        argument (e.g. "text,labels").
        """
        for textfield in self.textfields:
            source = self.observation.get(textfield)
            if source is None:
                continue
            # fields may be singleton strings or lists of strings.
            # wrap the singleton strings in a list to iterate over them
            if type(source) is str:
                source = [source]
            for text in source:
                if text:
                    tokens = self.tokenize(text)
                    self.add_to_dict(tokens)
                    unigram_ = nltk.ngrams(tokens, 1)
                    bigrams_ = bigrams(tokens)
                    trigrams_ = trigrams(tokens)
                    self.unigram_freq.update(unigram_)
                    self.bigram_freq.update(bigrams_)
                    self.trigram_freq.update(trigrams_)
        return {'id': 'Dictionary'}
Example No. 4
def trigram_format( test_corpus ):
    """
    >>> trigram_format(["the dog runs STOP", "the cat walks STOP", "the dog runs STOP"])
    [[('the', 'dog', 'runs'), ('dog', 'runs', 'STOP')], [('the', 'cat', 'walks'), ('cat', 'walks', 'STOP')], [('the', 'dog', 'runs'), ('dog', 'runs', 'STOP')]]
    """
    wl = [ [word for word in sentence.split()] for sentence in test_corpus] 
    return [ util.trigrams( l ) for l in wl ]
Example No. 5
def generate_unibitrigrams(key_score_file):
    with open(key_score_file, 'rb') as infile:
        infile.readline()
        key_list = list()
        for line in infile:
            row = list(line.split(','))
            key_list.append(row[0])
    uni_bi_trigrams = []
    for phrase in key_list:
        words = []
        unigrams_ls = []
        bigrams_ls = []
        trigrams_ls = []
        for word in nltk.word_tokenize(phrase):
            word = re.sub('[!"#$%&\'\(\)*+,-./:;<=>?@[\]\^_`{|}~]', '', word)
            words.append(word)
        unigrams_ls = words
        #bigrams_ls=list(bigrams(words))

        for x in list(bigrams(words)):
            bigrams_ls.append(x[0] + ' ' + x[1])

        for x in list(trigrams(words)):
            trigrams_ls.append(x[0] + ' ' + x[1] + ' ' + x[2])
        #trigrams_ls=list(trigrams(words))
        uni_bi_trigrams = uni_bi_trigrams + unigrams_ls + bigrams_ls + trigrams_ls
    return uni_bi_trigrams
Example No. 6
    def ngrams(self, gram_size=3):
        """Gives ngrams.

        Returns a list of ngrams, each ngram represented as a tuple.

        Args:
            gram_size (:obj:`int`, optional) Size of the ngrams to generate

        Returns:
            :obj:`list` of :obj:`tuple` Words of each ngram

        Example:
            >>> text = EnglishText('They hated to think of sample sentences.')
            >>> basic_ngrams = text.ngrams()
            >>> print(basic_ngrams)
            [('They', 'hated', 'to'), ('hated', 'to', 'think'), ('to', 'think', 'of'), ('think', 'of', 'sample'), ('of', 'sample', 'sentences'), ('sample', 'sentences', '.')]
        """ # noqa
        tokens = self.tokenize()
        if gram_size < 2:  # pragma: no cover
            gram_size = 2
        if gram_size == 2:  # pragma: no cover
            return list(bigrams(tokens))
        if gram_size == 3:
            return list(trigrams(tokens))
        else:  # pragma: no cover
            return list(ngrams(tokens, gram_size))
Example No. 7
def pos_tags(vocab_hash, sentence):
    sentence = sentence.split()
    unigram_hash = get_pos(vocab_hash, sentence)
    bigram_hash = get_pos(vocab_hash, bigrams(sentence))
    trigram_hash = get_pos(vocab_hash, trigrams(sentence))
    pos_tags = []
    ngram_to_tag = {}
    ngram_ordering = []
    for i in xrange(len(sentence)):
        word = sentence[i]
        if unigram_hash.has_key(word):
            tag = unigram_hash[word]
            pos_tags.append(tag)
            ngram_to_tag[word] = tag
            ngram_ordering.append(word)
        elif i < len(sentence) - 1:
            bigram = sentence[i] + " " + sentence[i + 1]
            if bigram_hash.has_key(bigram):
                tag = bigram_hash[bigram]
                pos_tags.append(tag)
                ngram_to_tag[bigram] = tag
                ngram_ordering.append(bigram)
                i += 1
            elif (i < len(sentence) - 2):
                trigram = " ".join(sentence[i:i + 2])
                if trigram_hash.has_key(trigram):
                    tag = trigram_hash[trigram]
                    pos_tags.append(tag)
                    ngram_to_tag[trigram] = tag
                    ngram_ordering.append(trigram)
                    i += 2
    return pos_tags, ngram_to_tag, ngram_ordering
Example No. 8
 def __init__(self, index: int, sent: str, start: int, end: int):
     self.index = index
     self.sent = sent
     self.words = self.sentToWords()
     self.nGrams = list(trigrams(self.words))
     self.start = start
     self.end = end
Example No. 9
def text():
    testo = [x for x in request.form.values()]
    input_text = testo[0]
    input_text = re.sub(r'\n', '' , input_text)
    input_text = re.sub(r'\t', '' , input_text)
    
    tok_test = word_tokenize(input_text.lower())
    trig = list(trigrams(tok_test))
    
    vocab_file = pd.read_csv('DIZIONARIO.csv', sep=';')
    vocab = list(vocab_file['TOKEN'])
    
    model = pickle.load(open('langmod.pickle', 'rb'))
    
    err = 0 
    
    for t in trig:
        
        if(t[2] not in vocab):
            err +=1
            continue
        if(model.score(t[2],[t[0],t[1]])==0):
            err +=1
    
    if(err<=2):
        # roughly: "It seems there are no spelling errors typical of dysorthography :-)"
        punteggio = 'Sembra che non ci siano errori ortografici tipici della disortografia :-)'
    elif(err>2 and err<=5):
        # roughly: "There may be some spelling errors suggestive of dysorthography. Check double letters, swapped/inserted/transposed letters or syllables, the H in verbs, and the accents!"
        punteggio = 'Forse c\'è qualche errore ortografico che potrebbe far pensare alla presenza di disortografia. Controlla le doppie, lo scambio, l\'inserimento o la traslazione di lettere/sillabe, l\'H nei verbi e gli accenti!'
        
    elif(err>5):
        # roughly: "The spelling errors seem rather numerous and typical of dysorthography! Don't worry, breathe, and - after checking other texts - consult a specialist :-)"
        punteggio = 'Gli errori ortografici sembrano essere un po\' tantini e tipici della disortografia! Non preoccuparti, respira e - dopo aver controllato altri testi - consulta uno specialista :-)'
    
    return render_template('elements.html',score=punteggio)
Example No. 10
def generate_unibitrigrams(key_score_file):
    with open(key_score_file,'rb') as infile:
        infile.readline()
        key_list=list()
        for line in infile:
            row=list(line.split(','))
            key_list.append(row[0])
    uni_bi_trigrams=[]
    for phrase in key_list:
        words=[]
        unigrams_ls=[]
        bigrams_ls=[]
        trigrams_ls=[]
        for word in nltk.word_tokenize(phrase):
            word=re.sub('[!"#$%&\'\(\)*+,-./:;<=>?@[\]\^_`{|}~]','',word)
            words.append(word)
        unigrams_ls=words
        #bigrams_ls=list(bigrams(words))

        for x in list(bigrams(words)):
            bigrams_ls.append(x[0]+' '+x[1] )


        for x in list(trigrams(words)):
            trigrams_ls.append(x[0]+' '+x[1]+' '+x[2] )
        #trigrams_ls=list(trigrams(words))
        uni_bi_trigrams=uni_bi_trigrams+unigrams_ls+bigrams_ls+trigrams_ls
    return uni_bi_trigrams
Example No. 11
 def __init__(self, index: int, sent: str, start: int, end: int, lang: int):
     self.lang = LangDiff(lang)
     self.index = index
     self.sent = sent
     self.words = self.remove_puncts(self.lang.word_tokenize(sent))
     self.nGrams = Counter(trigrams(self.sent_to_words()))
     self.start = start
     self.end = end
Example No. 12
def pre_trigram(texto):
    lista = []
    for x in texto:
        if len(x) < 4:
            lista.append((x))
        else:
            lista.append(tuple(trigrams(x)))
    return lista
Example No. 13
    def wordsToTrigramsWithIndices(self, dictionary):
        def getIndexedTuple(word: str):
            index = -1
            if word in dictionary.wordsToIndices:
                index = dictionary.wordsToIndices[word]
            return (index, word)

        return list(trigrams(list(map(getIndexedTuple, self.words))))
Example No. 14
def brown_trigrams(category):
    """Takes as input the name of a brown category, and returns a list of all of the trigrams in the category."""
    words = ["<s>"]
    words += [
        word.lower() for word in brown.words(categories=category)
        if word.isalnum()
    ]
    words.append("</s>")
    return list(trigrams(words))
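A minimal usage sketch; the output shown is indicative and assumes the Brown corpus is available locally:

# sample = brown_trigrams('news')
# sample[:2]   # e.g. [('<s>', 'the', 'fulton'), ('the', 'fulton', 'county')]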
Example No. 15
def clean_up_sentence(sentence):
    # tokenize the pattern
    sentence_words = []
    w = list(trigrams(sentence))

    for x in w:
        sentence_words.append(x[0] + x[1] + x[2])

    # stem each word
    return sentence_words
Example No. 16
 def getNgrams(self):
     """Get ngrams from the question. Right now only bigrams and trigrams are supported"""
     bigram_str = [
         bigram[0] + ' ' + bigram[1] for bigram in bigrams(self.tokens)
     ]
     trigram_str = [
         trigram[0] + ' ' + trigram[1] + ' ' + trigram[2]
         for trigram in trigrams(self.tokens)
     ]
     return (bigram_str, trigram_str)
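Illustrative output, assuming self.tokens had been set to ['how', 'are', 'you', 'today']:

# bigram_str  -> ['how are', 'are you', 'you today']
# trigram_str -> ['how are you', 'are you today']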
Example No. 17
 def ngrams(self, gram_size=3):
     tokens = self.tokenize()
     if gram_size < 2:  # pragma: no cover
         gram_size = 2
     if gram_size == 2:  # pragma: no cover
         return list(bigrams(tokens))
     if gram_size == 3:
         return list(trigrams(tokens))
     else:  # pragma: no cover
         return list(ngrams(tokens, gram_size))
Example No. 18
def main():
    save_data_from_webpage()
    
    text = get_data_from_file()
  
    
    # creates a list of the tokenized words
    tt = word_tokenize(text)
    pprint(tt)

    # creates new lists of stemmed words using each of the stemmers
    psteam = PorterStemmer()
    psteam_list = []
    for word in tt:
        psteam_list.append(psteam.stem(word))
    pprint(psteam_list)

    lsteam = LancasterStemmer()
    lsteam_list = []
    for word in tt:
       lsteam_list.append(lsteam.stem(word))
    pprint(lsteam_list)

    ssteam = SnowballStemmer('english')  # SnowballStemmer requires a language argument
    ssteam_list = []
    for word in tt:
        ssteam_list.append(ssteam.stem(word))
    pprint(ssteam_list)

    p = set(psteam_list)
    l = set(lsteam_list)
    s = set(ssteam_list)
    # displays the differences between the stem sets
    pprint(s.difference(l.difference(p)))

    # POS tagging (pos_tag expects a list of tokens, not a raw string)
    pos_list = pos_tag(tt)
    pprint(pos_list)

    # creates a new list of the lemmatized words
    lemmatizer = WordNetLemmatizer()
    lem = []
    for word in tt:
        lem.append(lemmatizer.lemmatize(word)) 
    #pprint(lem)
    
    # returns a generator of trigrams using the tokenized list tt
    trig = trigrams(tt)
    # displays the results
    print(list(trig))
    
    # ne_chunk finds non-overlapping named-entity groups
    # pos_tag identifies how each token is used in the sentence
    NamedEntity = ne_chunk(pos_tag(wordpunct_tokenize(text)))
    print(NamedEntity)
Example No. 19
 def _get_filtered_trigrams(self, words):
     # Allow stopword in the middle of trigram
     filtered_trigrams = []
     for tri in trigrams(words):
         leave = True
         for i, w in enumerate(tri):
             if w in stopwords and i != 1:
                 leave = False
                 break
         if leave and tri[0] != tri[1] and tri[1] != tri[2]:
             filtered_trigrams.append(tri)
     return filtered_trigrams
Example No. 20
    def __init__(self, geo_locations):
        '''Initializes the language model by creating the ConditionalFreqDist
        and ConditionalProbDist'''

        words_count = 0

        # will contain all names in a list which preserves their frequencies as
        # they appear in the gazetteer. The frequencies are going to be used in
        # the language model.
        gaz_n_grams = list()

        self.unigrams = defaultdict(int)

        for ln in geo_locations:

            number_of_mentions = len(geo_locations[ln])

            n_gram = ln.split()

            new_list = [n_gram] * number_of_mentions

            gaz_n_grams.extend(new_list)

            for token in n_gram:
                words_count += 1
                self.unigrams[token] += 1

        self.unigrams = {"words": self.unigrams, "words_count": words_count}

        # bigrams +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
        train_bigrams = list(chain(*[bigrams(i) for i in gaz_n_grams]))

        cfd_bigrams = ConditionalFreqDist()

        for bg in train_bigrams:
            cfd_bigrams[bg[0]][bg[1]] += 1

        # bigrams MLE probabilities
        self.cpd_bigrams = ConditionalProbDist(cfd_bigrams, nltk.MLEProbDist)

        # trigrams ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
        train_trigrams = list(chain(*[trigrams(i) for i in gaz_n_grams]))

        cfd_trigrams = ConditionalFreqDist()

        for bg in train_trigrams:

            bi_gr = " ".join(bg[:-1])

            cfd_trigrams[bi_gr][bg[2]] += 1

        # trigrams MLE probabilities
        self.cpd_trigrams = ConditionalProbDist(cfd_trigrams, nltk.MLEProbDist)
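Once built, the distributions can be queried through NLTK's ConditionalProbDist interface; a sketch assuming a hypothetical instance called model and a gazetteer that contained the name "new york city":

# model.cpd_bigrams['new'].prob('york')         # MLE estimate of P(york | new)
# model.cpd_trigrams['new york'].prob('city')   # MLE estimate of P(city | "new york")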
Example No. 21
    def _eval_sent_entropy(self, sents, human=''):
        for i in range(len(sents)):
            sent = sents[i]
            sent_tokens = sent.split()
            unigrams_ = nltk.ngrams(sent_tokens, 1)
            bigrams_ = bigrams(sent_tokens)
            trigrams_ = trigrams(sent_tokens)
            prob_unigrams = [
                self.dict.unigram_freq[uni_tok] / self.total_unigrams
                for uni_tok in unigrams_
            ]
            prob_bigrams = [
                self.dict.bigram_freq[bi_tok] / self.total_bigrams
                for bi_tok in bigrams_
            ]
            prob_trigrams = [
                self.dict.trigram_freq[tri_tok] / self.total_trigrams
                for tri_tok in trigrams_
            ]

            # smoothing zero values
            prob_unigrams = np.asarray(
                [p if p > 0 else 1 for p in prob_unigrams])
            prob_bigrams = np.asarray(
                [p if p > 0 else 1 for p in prob_bigrams])
            prob_trigrams = np.asarray(
                [p if p > 0 else 1 for p in prob_trigrams])

            sent_entropy_uni = -np.sum(np.log2(prob_unigrams))
            sent_entropy_bi = -np.sum(np.log2(prob_bigrams))
            sent_entropy_tri = -np.sum(np.log2(prob_trigrams))

            word_entropy_uni = sent_entropy_uni / (len(prob_unigrams) +
                                                   sys.float_info.epsilon)
            word_entropy_bi = sent_entropy_bi / (len(prob_bigrams) +
                                                 sys.float_info.epsilon)
            word_entropy_tri = sent_entropy_tri / (len(prob_trigrams) +
                                                   sys.float_info.epsilon)

            self.metrics[human + 'sent_entropy_uni_cnt'] += 1
            self.metrics[human + 'sent_entropy_uni'] += sent_entropy_uni
            self.metrics[human + 'sent_entropy_bi_cnt'] += 1
            self.metrics[human + 'sent_entropy_bi'] += sent_entropy_bi
            self.metrics[human + 'sent_entropy_tri_cnt'] += 1
            self.metrics[human + 'sent_entropy_tri'] += sent_entropy_tri

            self.metrics[human + 'word_entropy_uni_cnt'] += 1
            self.metrics[human + 'word_entropy_uni'] += word_entropy_uni
            self.metrics[human + 'word_entropy_bi_cnt'] += 1
            self.metrics[human + 'word_entropy_bi'] += word_entropy_bi
            self.metrics[human + 'word_entropy_tri_cnt'] += 1
            self.metrics[human + 'word_entropy_tri'] += word_entropy_tri
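A small worked example of the per-sentence quantities accumulated above, using illustrative probabilities:

# a 3-token sentence with unigram probabilities [0.5, 0.25, 0.25] gives
# sent_entropy_uni = -(log2 0.5 + log2 0.25 + log2 0.25) = 5.0
# word_entropy_uni = 5.0 / 3 ≈ 1.67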
Example No. 22
 def _create_objects_interface(self):
     formated_objects_interface = []
     specification = []
     for sent in self._exchange_states():
          for chunk in sent:
              if chunk[0] == 'specification':
                  specification.append(trigrams([chunk[1], chunk[-2], \
                                     self._convert_to_yakindu_type(type(chunk[-1]).__name__)]))
     default_specification = list(OrderedSet(chain(*specification)))
     objects_specification = modified_groupby(default_specification, key=lambda obj: obj[0])
     for obj, specification_chunks in objects_specification.items():
         formated_objects_interface.append('\n\ninterface ' + obj + ':')
         for chunk in specification_chunks:
              formated_objects_interface.append('\nvar ' + chunk[-2] + ':' + chunk[-1])               
     return ''.join(formated_objects_interface)
Example No. 23
    def __call__(self, t):
        t = self.reduce_lengthening(t)
        tokens = t.split(' ')

        cleaned_tokens = []
        for token in tokens:
            token = self.replace_username(token)
            token = self.replace_link(token)
            cleaned_tokens.append(token)

        rebuild_str = ' '.join(cleaned_tokens)

        negated_tokens = mark_negation(list(self.tknzr.tokenize(rebuild_str)))
        list_of_trigrams = list([' '.join(s) for s in trigrams(negated_tokens)])
        return list_of_trigrams
Example No. 24
def trigram_plot(l):
    list_trigrams = list(trigrams(l))
    dictionary_trigram = {}
    for i in range(len(list_trigrams)):
        dictionary_trigram[list_trigrams[i]] = 0
    for i in range(len(list_trigrams)):
        dictionary_trigram[list_trigrams[i]] += 1
    plus = 0
    for i in dictionary_trigram.values():
        plus += i
    print("total unique trigram are:", end="")
    print(plus)

    dictionary_trigram = dict(
        sorted(dictionary_trigram.items(), key=lambda x: x[1], reverse=True))
    count = 0
    pdf = 0.0
    for key, i in dictionary_trigram.items():
        pdf += i / plus
        count += 1
        if (pdf > 0.7):
            break
    print(
        "total trigrams are required to cover the 70% of the complete corpus:",
        end="")
    print(count)
    print(pdf)

    threshold = 45
    for key in dictionary_trigram.copy():
        if (dictionary_trigram[key] < threshold):
            dictionary_trigram.pop(key)
    keys_trigram, values_trigram = dictionary_trigram.keys(
    ), dictionary_trigram.values()
    keys_trigram = list(keys_trigram)
    ls = []
    for i in keys_trigram:
        t = ' '.join(i)
        ls.append(t)
    print("total trigrams taken for plotting purpose:", end="")
    print(len(ls))
    plt.loglog(tuple(ls), tuple(values_trigram), color='g')
    plt.xticks(range(len(ls)), ls, rotation=90)
    plt.xlabel('trigram')
    plt.ylabel('trigram count')
    plt.xscale('log')
    plt.savefig('trigram')  # save before show(); the figure is cleared once show() returns
    plt.show()
Example No. 25
    def __call__(self, t):
        t = self.reduce_lengthening(t)
        tokens = t.split(' ')

        cleaned_tokens = []
        for token in tokens:
            token = self.replace_username(token)
            token = self.replace_link(token)
            cleaned_tokens.append(token)

        rebuild_str = ' '.join(cleaned_tokens)

        negated_tokens = mark_negation(list(self.tknzr.tokenize(rebuild_str)))
        list_of_trigrams = list(
            [' '.join(s) for s in trigrams(negated_tokens)])
        return list_of_trigrams
Example No. 26
def getTrigramsDistributionFromText(txt):
    trigrm = list(trigrams(txt.split()))
    # print(bigrm)
    trigramWords = ', '.join(' '.join((a, b, c)) for a, b, c in trigrm)

    dictResTri = {}
    for tri in trigramWords.split(","):
        tri = tri.lstrip()
        tri = tri.rstrip()
        if tri in dictTrigrams:
            # print(bi,dictBigrams[bi])
            dictResTri[tri] = dictTrigrams[tri]
        # else:
        #     # print("NA")
        #     dictResBi[bi]= 0

    return (sorted(dictResTri.items(), key=lambda x: x[1], reverse=True))
Example No. 27
def write_trigrams(words, name, minlength, count):
    print("Finding " + name)
    stopfilter = lambda w: len(
        w) < minlength or w in stop_numbers + stop_common + ['e', 'i']

    collocs = Counter(trigrams(words))
    collocs = Counter({
        key: val
        for key, val in collocs.items() if not stopfilter(key[0])
        and not stopfilter(key[1]) and not stopfilter(key[2])
    })
    collocs = collocs.most_common(count)

    f = open(name + '.csv', 'w', encoding="utf-8")
    for word, val in collocs:
        f.write(u'{},{},{},{}\n'.format(word[0], word[1], word[2], val))
    f.close()
Example No. 28
    def tuple_to_ngrams(tuple_with_words: tuple[str],
                        n=2) -> tuple[tuple[str]]:
        """
        Take a tuple with words and convert this tuple to a tuple with n grams (bi-/trigrams)
        :param tuple_with_words:
        :param n:
        :return:
        """

        # make bigrams and trigrams and store them in dictionary
        nGrams: dict[int, tuple[tuple]] = {
            2: tuple(bigrams(tuple_with_words)),
            3: tuple(trigrams(tuple_with_words))
        }

        # return a tuple that contains tuples of size n
        return nGrams[n]
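Usage sketch:

# tuple_to_ngrams(('natural', 'language', 'tool', 'kit'), n=3)
#   -> (('natural', 'language', 'tool'), ('language', 'tool', 'kit'))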
Example No. 29
 def processReview_trigram(self,review):
     review_text = self.stage2.removePunctuations(review["review"])
     if self.const.GENERATE_TRIGRAMS_WITH_STOP_WORDS:
         pass
     else:
         review_text = self.removeStopWordsFromReview(review_text)
     review_text = review_text.lower()
     tokens = review_text.split(" ")
     trigram_list = trigrams(tokens)
     lst =[]
     for trigram in trigram_list:# if self.string_found(bigram)]
         first_word = trigram[0].strip()
         second_word = trigram[1].strip()
         third_word = trigram[2].strip()
         if ""==first_word or ""==second_word or ""==third_word:
             pass
         else:
             lst.append({"word":first_word+" "+second_word+" "+third_word})
     return lst
Example No. 30
    def profile(self, text):
        ''' Create FreqDist of trigrams within text '''
        from nltk import word_tokenize, FreqDist

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)

        fingerprint = FreqDist()
        for t in tokens:
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1

        return fingerprint
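Illustrative behaviour, assuming _START_CHAR and _END_CHAR (defined elsewhere in the class) are single padding characters such as '<' and '>':

# the token "cat" is padded to "<cat>" and contributes the character trigrams
# '<ca', 'cat', 'at>' to the fingerprint FreqDist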
Example No. 31
    def profile(self, text):
        ''' Create FreqDist of trigrams within text '''
        from nltk import word_tokenize, FreqDist

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)

        fingerprint = FreqDist()
        for t in tokens:
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1

        return fingerprint
Example No. 32
def get_trigram_word_dict(emotion_line):
    words = nltk.word_tokenize(emotion_line)
    word_bigrams = list(bigrams(words))
    word_trigrams = list(trigrams(words))

    word_feats = {}
    for w in words:
        if w not in word_feats:
            word_feats[w] = "feature_word"

    for w in word_bigrams:
        if w not in word_feats:
            word_feats[w] = "feature_word"

    for w in word_trigrams:
        if w not in word_feats:
            word_feats[w] = "feature_word"

    return word_feats
Example No. 33
    def buildGraphSentence(self, sentence):
        nodes = list()
        #unograms
        possible_unograms = sentence
        possible_unograms = [
            uno for uno in possible_unograms
            if uno.lower() not in self.stoplist
        ]
        nodes = nodes + possible_unograms
        #bigrams
        possible_bigrams = list(bigrams(sentence))
        possible_bigrams = [
            bi for bi in possible_bigrams
            if (bi[0].lower() not in self.stoplist
                and bi[1].lower() not in self.stoplist)
        ]
        possible_bigrams = [' '.join(bi) for bi in possible_bigrams]
        nodes = nodes + possible_bigrams
        #trigrams
        possible_trigrams = list(trigrams(sentence))
        possible_trigrams = [
            tri for tri in possible_trigrams
            if (tri[0].lower() not in self.stoplist and tri[1].lower() not in
                self.stoplist and tri[2].lower() not in self.stoplist)
        ]
        possible_trigrams = [' '.join(tri) for tri in possible_trigrams]
        nodes = nodes + possible_trigrams
        #print(nodes)

        #add nodes
        for node in nodes:
            self.graph.add_node(node)
        #print(self.graph.nodes)
        #add edges
        for node in nodes:
            for node2 in nodes:
                if node != node2:
                    if self.graph.has_edge(node, node2):
                        if self.NofCooc:
                            self.graph[node][node2]['weight'] += 1
                    else:
                        self.graph.add_edge(node, node2)
                        self.graph[node][node2]['weight'] = 1
Example No. 34
    def __init__(self, work_dir):

        tokens = []
        sentences = []
        paragraphs = []
        paragraph_sentence_length = []
        first_word = []

        for path_to_text, _, text_file_names in os.walk(work_dir):
            for text_file_name in text_file_names:
                text_file = file(os.path.join(path_to_text, text_file_name))

                if not os.path.isfile(os.path.join(path_to_text, text_file_name)):
                    continue
                file_content = text_file.read().decode("utf8")
                print text_file_name

                text_paragraphs = nltk.blankline_tokenize(file_content)
                paragraphs += text_paragraphs

                self._sent_tokenizer = nltk.tokenize.PunktSentenceTokenizer(file_content)

                for paragraph in text_paragraphs:
                    paragraph_sentence = self._sent_tokenizer.tokenize(paragraph)
                    paragraph_sentence_length.append(len(paragraph_sentence))
                    sentences += paragraph_sentence
                    for sentence in paragraph_sentence:
                        sentence_tokens = nltk.word_tokenize(sentence)
                        tokens += sentence_tokens
                        first_word.append(sentence_tokens[0])

        self._trigram_pd = nltk.ConditionalProbDist(
            nltk.ConditionalFreqDist([(t[:2], t[2]) for t in trigrams(tokens)]), nltk.probability.ELEProbDist
        )

        self._bigram_pd = nltk.ConditionalProbDist(
            nltk.ConditionalFreqDist([(t[:1], t[1]) for t in bigrams(tokens)]), nltk.probability.ELEProbDist
        )

        self._sent_begin_pd = nltk.ELEProbDist(nltk.FreqDist(first_word))

        self._paragraph_length_pd = nltk.ELEProbDist(nltk.FreqDist(paragraph_sentence_length))
Example No. 35
def get_letters():

    data_rinat = open('letters_rinat.json')
    data_vladimir = open('letters_vladimir.json')

    root_data = json.loads(data_rinat.readlines()[0]) + json.loads(
        data_vladimir.readlines()[0])
    # print (data)
    root_letters = list()
    root_trigrams = dict()
    for i in root_data:
        # current_index = i.index(root_data)
        get_clean_text = (re.split('\W+',
                                   ((i['coverLetter']))))  # clean the text
        # temp.append(list(trigrams(get_clean_text)))                # build trigrams and append them to the list
        # print(temp)

        for grams in list(trigrams(get_clean_text)):
            root_letters.append(' '.join(grams))

    return collections.Counter(root_letters)
Example No. 36
 def _create_states_specification(self):
     states = []
     specification = []
     states_specification_content = []
     for sent in self._exchange_states():
         specification.append([list(chain(*trigrams([chunk[1] + '{0}', chunk[-2] + \
                             ' {1} ', str(chunk[-1]).lower() + '{2}{3}']))) \
                             for chunk in sent if chunk[0] == 'specification'])
     while [] in specification:
         specification.remove([])
     for spec in specification:
         spec[0].insert(0, 'entry/\n')
         spec[-1][-1] = spec[-1][-1].rstrip('{2}{3}')
     states_specification = dict(izip(self._get_states_content(), specification))
     for state, specification in states_specification.items():
         states_specification[state] = list(chain(*specification))
         states_specification[state].insert(0, '"')
         states_specification[state].append('"')
     for state, specification in states_specification.items():
         states_specification[state] = ''.join(specification).format('.', '=', ';', '\n')
     return states_specification
Example No. 37
    def findBM25Terms(self):
        allterms = set()
        for document in self.dswa:
            for sentence in document:
                nodes = list()
                #unograms
                possible_unograms = sentence
                possible_unograms = [
                    uno for uno in possible_unograms
                    if uno.lower() not in self.stoplist
                ]
                nodes = nodes + possible_unograms
                #bigrams
                possible_bigrams = list(bigrams(sentence))
                possible_bigrams = [
                    bi for bi in possible_bigrams
                    if (bi[0].lower() not in self.stoplist
                        and bi[1].lower() not in self.stoplist)
                ]
                possible_bigrams = [' '.join(bi) for bi in possible_bigrams]
                nodes = nodes + possible_bigrams
                #trigrams
                possible_trigrams = list(trigrams(sentence))
                possible_trigrams = [
                    tri for tri in possible_trigrams
                    if (tri[0].lower() not in self.stoplist
                        and tri[1].lower() not in self.stoplist
                        and tri[2].lower() not in self.stoplist)
                ]
                possible_trigrams = [
                    ' '.join(tri) for tri in possible_trigrams
                ]
                nodes = nodes + possible_trigrams
                #print(nodes)

                #add nodes
                for node in nodes:
                    allterms.add(node)

        return BM25Calculator(self.dswa, allterms)
Example No. 38
def get_keywords(sentence, allowed_tags):
    sentence = _remove_by_regex(_replace_punct(sentence))
    tokens = nltk.word_tokenize(sentence)
    tokens = [token.strip("'") for token in tokens]
    tagged_tokens = nltk.pos_tag(tokens)
    stop_words = get_stop_words('en')
    stop_words = {word.decode('utf-8') for word in stop_words}
    stop_words |= {'read'}
    keywords = []
    for word, tag in tagged_tokens:
        word = word.lower()
        if is_proper_keyword(word, tag, allowed_tags, stop_words):
            keywords.append(word)
    bigrams_keywords = list(bigrams(keywords))
    trigrams_keywords = list(trigrams(keywords))

    for k in bigrams_keywords:
        keywords.append(' '.join(k))

    for k in trigrams_keywords:
        keywords.append(' '.join(k))
    return keywords
Example No. 39
    def value_for_text(self, t, rp=default_rp):
        syntax_trees = rp.parse_trees(t)

        sentence_indices = []
        for tree in syntax_trees:
            if tree.label() == 'ROOT':
                tree = tree[0]

            leaves = tree.leaves()

            word_indices = [0] * len(leaves)
            for i in range(len(leaves)):
                ref_vector = tree.leaf_treeposition(i)

                j = -2
                while j >= -len(ref_vector) and ref_vector[j] == 0:
                    parent_index = len(ref_vector) + j
                    parent_node = tree[ref_vector[:parent_index]]

                    if rp.parser().tagset.is_sentence_node(parent_node):
                        word_indices[i] += 1.5
                    else:
                        word_indices[i] += 1

                    j -= 1

            if len(leaves) < 3:
                sentence_index = sum(word_indices)
            else:
                max_trigrams = 0
                for trigram in trigrams(word_indices):
                    if sum(trigram) > max_trigrams:
                        max_trigrams = sum(trigram)
                sentence_index = max_trigrams

            sentence_indices.append(sentence_index)

        return sum(sentence_indices) / len(sentence_indices) \
                if sentence_indices else 0
Example No. 40
 def filterPhrases(self):
     self.filteredPhrases = []
     self.sentences = sent_tokenize(self.text)
     for sentence in self.sentences:
         bigramList = list(set(bigrams(wordpunct_tokenize(sentence.lower()))))
         for bigram in bigramList:
             (word1, word2) = bigram
             if word1 == "'":
                 term = word1 + word2
             elif re.match(r'\W+', word1) == None and re.match(r'\W+', word2) == None:
                 term = word1 + ' ' + word2
             else:
                 continue
             if self.dbc.execute("SELECT 1 FROM vocabulary WHERE term = %s LIMIT 1;", term) == 0L:
                 if self.udq.recorded(term):
                     if len(term) <= 140:
                         self.dbc.execute("INSERT INTO vocabulary (term) VALUE (%s)", term)
                 else:
                     self.filteredPhrases.append(term)
         trigramList = list(set(trigrams(wordpunct_tokenize(sentence.lower()))))
         for trigram in trigramList:
             (word1, word2, word3) = trigram
             if word3 == "'":
                 continue
             elif word2 == "'":
                 term = word1 + word2 + word3
             elif re.match(r'\W+', word1) == None and re.match(r'\W+', word2) == None and re.match(r'\W+', word3) == None:
                 term = word1 + ' ' + word2 + ' ' + word3
             else:
                 continue
             if self.dbc.execute("SELECT 1 FROM vocabulary WHERE term = %s LIMIT 1;", term) == 0L:
                 if self.udq.recorded(term):
                     if len(term) <= 140:
                         self.dbc.execute("INSERT INTO vocabulary (term) VALUE (%s)", term)
                 else:
                     self.filteredPhrases.append(term)
      return self.filteredPhrases
Example No. 41
 def ngrams(self, value):
     for trigram in trigrams(self.tokenize(value)):
         yield trigram
Example No. 42
def getRecipeInfo(myURL):

	### Here the webpage with the recipe is opened ###
	driver = webdriver.Chrome('./chromedriver')
	# driver = webdriver.Firefox()
	# myURL = sys.argv[1]	#'http://allrecipes.com/Recipe/Beef-Brisket-My-Way/'
	#print myURL

	try:
		driver.get(myURL)

		### Here the recipe name is extracted ###

		recipeNameXPath = '//div[@class="detail-right fl-right"]/h1[@id="itemTitle"]'
		recipeNameObject = driver.find_elements_by_xpath(recipeNameXPath)

		for value in recipeNameObject:
			recipeName = value.get_attribute("innerHTML")
		#print recipeName

		ingredients = []
		singleIngredient = {}
		ingredientSet1NamesXPath = '//div[@class="ingred-left"]/ul[@class="ingredient-wrap"]/li[@id="liIngredient"]/label/p[@class="fl-ing"]/span[@id="lblIngName"]'
		ingredientSet1NamesObjects = driver.find_elements_by_xpath(ingredientSet1NamesXPath)

		ingredientSet1AmountsXPath = '//div[@class="ingred-left"]/ul[@class="ingredient-wrap"]/li[@id="liIngredient"]/label/p[@class="fl-ing"]/span[@id="lblIngAmount"]'
		ingredientSet1AmountsObjects = driver.find_elements_by_xpath(ingredientSet1AmountsXPath)

	except:
		driver.quit()
		raise

	for value in ingredientSet1NamesObjects:
		fullSingleIngredient = str(value.get_attribute("innerHTML"))
		if string.find(fullSingleIngredient, ', ') > -1:
			singleIngredientParts = string.split(fullSingleIngredient, ', ')
			singleIngredient['name'] = singleIngredientParts[0]
			singleIngredient['descriptor'] = singleIngredientParts[1]
		else:
			singleIngredient['name'] = fullSingleIngredient
			singleIngredient['descriptor'] = ''
		singleIngredient['preparation'] = ''
		ingredients.append(singleIngredient)
		singleIngredient = {}
	i = 0
	for value in ingredientSet1AmountsObjects:
		amount = str(value.get_attribute("innerHTML"))
		if string.find(amount, '(') > -1:
			actualAmount = string.split(amount, '(')
			amount = string.split(actualAmount[1], ')')
			amount = amount[0]
			#print actualAmount
		qty = re.search(r"[a-z]+", amount)
		if qty != None:
			#print qty.group(0)
			ingredients[i]['measurement'] = qty.group(0)
			myQty = string.replace(amount, str(qty.group(0)), '')
			myQty = myQty.strip()
			if string.find(myQty, '/') > -1:
				qtyNum = string.split(myQty, '/')
				if string.find(qtyNum[0], ' ') > -1:
					numerator = string.split(qtyNum[0], ' ')
					ingredients[i]['quantity'] = (float(numerator[0])*float(qtyNum[1])+float(numerator[1]))/float(qtyNum[1])
				else:
					ingredients[i]['quantity'] = float(qtyNum[0])/float(qtyNum[1])
			else:
				ingredients[i]['quantity'] = myQty
		else:
			ingredients[i]['measurement'] = 'unit'
			ingredients[i]['quantity'] = str(value.get_attribute("innerHTML"))
		i += 1

	ingredientSet2NamesXPath = '//div[@class="ingred-left"]/ul[@class="ingredient-wrap secondColumn"]/li[@id="liIngredient"]/label/p[@class="fl-ing"]/span[@id="lblIngName"]'
	ingredientSet2NamesObjects = driver.find_elements_by_xpath(ingredientSet2NamesXPath)

	ingredientSet2AmountsXPath = '//div[@class="ingred-left"]/ul[@class="ingredient-wrap secondColumn"]/li[@id="liIngredient"]/label/p[@class="fl-ing"]/span[@id="lblIngAmount"]'
	ingredientSet2AmountsObjects = driver.find_elements_by_xpath(ingredientSet2AmountsXPath)

	for value in ingredientSet2NamesObjects:
		fullSingleIngredient = str(value.get_attribute("innerHTML"))
		if string.find(fullSingleIngredient, ', ') > -1:
			singleIngredientParts = string.split(fullSingleIngredient, ', ')
			singleIngredient['name'] = singleIngredientParts[0]
			singleIngredient['descriptor'] = singleIngredientParts[1]
		else:
			singleIngredient['name'] = fullSingleIngredient
			singleIngredient['descriptor'] = ''
		singleIngredient['preparation'] = ''
		ingredients.append(singleIngredient)
		singleIngredient = {}

	for value in ingredientSet2AmountsObjects:
		amount = str(value.get_attribute("innerHTML"))
		if string.find(amount, '(') > -1:
			actualAmount = string.split(amount, '(')
			amount = string.split(actualAmount[1], ')')
			amount = amount[0]
			#print amount
		qty = re.search(r"[a-z]+", amount)
		if qty != None:
			#print qty.group(0)
			ingredients[i]['measurement'] = qty.group(0)
			myQty = string.replace(amount, str(qty.group(0)), '')
			myQty = myQty.strip()
			if string.find(myQty, '/') > -1:
				qtyNum = string.split(myQty, '/')
				if string.find(qtyNum[0], ' ') > -1:
					numerator = string.split(qtyNum[0], ' ')
					ingredients[i]['quantity'] = (float(numerator[0])*float(qtyNum[1])+float(numerator[1]))/float(qtyNum[1])
				else:
					ingredients[i]['quantity'] = float(qtyNum[0])/float(qtyNum[1])
			else:
				ingredients[i]['quantity'] = myQty
		else:
			ingredients[i]['measurement'] = 'unit'
			ingredients[i]['quantity'] = str(value.get_attribute("innerHTML"))
		i += 1

	#pprint(ingredients)

	directions = []
	i = 0

	directionsXPath = '//div[@class="directLeft"]/ol/li/span'
	directionsObjects = driver.find_elements_by_xpath(directionsXPath)

	for value in directionsObjects:
		directions.append(str(value.get_attribute("innerHTML")))
		i += 1
	#print directions

	driver.quit()

	cookingMethods = {}
	with open('./text_files/cookingMethods.txt', 'r') as f:
		for line in f:
			cookingMethods[string.replace(line, '\n', '').strip()] = True
	#pprint(cookingMethods)
	cookingUtensils = {}
	with open('./text_files/cookingUtensils.txt', 'r') as f:
		for line in f:
			cookingUtensils[string.replace(line, '\n', '').strip()] = True
	#pprint(cookingUtensils)
	recipeCookingMethods = []
	recipeCookingUtensils = []

	localPhrase = ''
	for step in directions:
		for phrase in ngrams(string.split(step), 4):
			for word in phrase:
				localPhrase += word
				localPhrase += ' '
			localPhrase = localPhrase.strip()
			localPhrase = localPhrase.replace(',', '')
			localPhrase = localPhrase.replace('.', '')
			for tool in cookingUtensils.keys():
				if tool.lower() == localPhrase.lower():
					#print localToolPhrase, 'utensil ->', tool
					recipeCookingUtensils.append(tool)
			localPhrase = ''
		#print '4-grams done'
		for phrase in trigrams(string.split(step)):
			for word in phrase:
				localPhrase += word
				localPhrase += ' '
			localPhrase = localPhrase.strip()
			localPhrase = localPhrase.replace(',', '')
			localPhrase = localPhrase.replace('.', '')
			for tool in cookingUtensils.keys():
				if tool.lower() == localPhrase.lower():
					#print localToolPhrase, 'utensil ->', tool
					#flag = 1
					#for myTool in recipeCookingUtensils:
					#	if string.find(myTool, localPhrase) > -1:
					#		flag = 0
					#if flag == 1:
					recipeCookingUtensils.append(tool)
			for method in cookingMethods.keys():
				if method.lower() == localPhrase.lower():
					recipeCookingMethods.append(method)
			localPhrase = ''
		#print '3-grams done'
		for phrase in bigrams(string.split(step)):
			for word in phrase:
				localPhrase += word
				localPhrase += ' '
			localPhrase = localPhrase.strip()
			localPhrase = localPhrase.replace(',', '')
			localPhrase = localPhrase.replace('.', '')
			for tool in cookingUtensils.keys():
				if tool.lower() == localPhrase.lower():
					#print localPhrase, 'utensil ->', tool
					#flag = 1
					#for myTool in recipeCookingUtensils:
					#	if string.find(myTool, localPhrase) > -1:
					#		flag = 0
					#if flag == 1:
					recipeCookingUtensils.append(tool)
			for method in cookingMethods.keys():
				if method.lower() == localPhrase.lower():
					#flag = 1
					#for myMethod in recipeCookingMethods:
					#	if string.find(myMethod, localPhrase) > -1:
					#		flag = 0
					#if flag == 1:
					recipeCookingMethods.append(method)
			localPhrase = ''
		#print '2-grams done'
		for word in string.split(step, ' '):
			#print word
			if len(word) > 2:
				word = word.replace(',', '')
				word = word.replace('.', '')
				for method in cookingMethods.keys():
					#if string.find(method, word) > -1:
					if method.lower() == word.lower():
						#flag = 1
						#for myMethod in recipeCookingMethods:
						#	if string.find(myMethod, localPhrase) > -1:
						#		flag = 0
						#if flag == 1:
						recipeCookingMethods.append(method)
				for tool in cookingUtensils.keys():
					#if string.find(tool, word) > -1:
					if tool.lower() == word.lower():
						#print localPhrase, 'utensil ->', tool
						#flag = 1
						#for myTool in recipeCookingUtensils:
						#	if string.find(myTool, localPhrase) > -1:
						#		flag = 0
						#if flag == 1:
						recipeCookingUtensils.append(tool)
		#print '1-grams done'

	utensilsSet = set(recipeCookingUtensils)
	recipeCookingUtensils = list(utensilsSet)
	cookingMethodsSet = set(recipeCookingMethods)
	recipeCookingMethods = list(cookingMethodsSet)

	recipe = {}
	recipe['ingredients'] = ingredients
	recipe['cooking method'] = random.choice(recipeCookingMethods)
	recipe['cooking tools'] = recipeCookingUtensils

	#pprint(recipe)

	myInternalRecipe = {}
	myInternalRecipe['name'] = str(recipeName)
	myInternalRecipe['ingredients'] = []
	for item in ingredients:
		myInternalRecipe['ingredients'].append(item['name'])

	#print myInternalRecipe

	f = open('recipeJson.json', 'w')
	jobj = json.dumps(recipe)
	f.write(jobj)
	f.close()
	with open('recipeJson.json', 'r') as f:
		myJobj = map(json.loads, f)

	return myInternalRecipe, recipe
Example No. 43
text = gutenberg.raw('austen-emma.txt');
nltk_sents = sent_tokenize(text)						# contains the list of sentences detected from the tool
nltk_words = word_tokenize(text)
#print len(nltk_words)

#fnltk_words = [wordnet_lemmatizer.lemmatize(nltk_word) for nltk_word in nltk_words]
#print len(fnltk_words)
dictn=list(set(nltk_words))
tokens = nltk_words

tokens = [token.lower() for token in tokens if len(token) > 1] 		# same as unigrams
lemma_tokens = [wordnet_lemmatizer.lemmatize(token, wordnet.VERB) for token in tokens]
#print len(lemma_tokens)
tokens = lemma_tokens
bi_tokens = list(bigrams(tokens))					# getting the bigrams
tri_tokens = list(trigrams(tokens))


uni_fdist = nltk.FreqDist(tokens)

bi_fdist = nltk.FreqDist(bi_tokens)
#print len(bi_fdist)
tri_fdist = nltk.FreqDist(tri_tokens)

tri_freq = 0
bi_freq = 0
uni_freq = 0



print "top 15 unigrams with lemma\n\n"
Example No. 44
				cw=unig[w]
			except:
				cw=0
			cw+=1
			unig[w]=cw
			
		bg=bigrams(words)
		for b in bg:
			try:
				cb=big[b]
			except:
				cb=0
			cb+=1
			big[b]=cb
			
		tg=trigrams(words)
		for t in tg:
			try:
				ct=trig[t]
			except:
				ct=0
			ct+=1
			trig[t]=ct
f.close()

msim=[]
slens={}

for k in models.keys():
	m=models[k]
	uv=compare_histogram(m["unigrams"], unig)
Example No. 45
def getStats2(training_set, test_set, data_type):
    #List of ngrams for training list
    training_unigram = [unigram for sent in training_set for unigram in sent]
    training_bigram = [bigram for sent in training_set for bigram in list(bigrams(sent))]
    training_trigram = [trigram for sent in training_set for trigram in list(trigrams(sent))]
    #List of ngrams for test list
    test_unigram = [unigram for sent in test_set for unigram in sent]
    test_bigram = [bigram for sent in test_set for bigram in list(bigrams(sent))]
    test_trigram = [trigram for sent in test_set for trigram in list(trigrams(sent))]
    #FreqDist for each ngram for training list
    fdist_training_unigram = FreqDist(training_unigram)
    fdist_training_bigram =FreqDist(training_bigram)
    fdist_training_trigram = FreqDist(training_trigram)
    #freqDistfor each ngram for  test list 
    fdist_test_unigram = FreqDist(test_unigram)
    fdist_test_bigram =FreqDist(test_bigram)
    fdist_test_trigram = FreqDist(test_trigram)
    #Type freq for ngrams in training list
    training_unigram_freq = fdist_training_unigram.N()
    training_bigram_freq = fdist_training_bigram.N()
    training_trigram_freq = fdist_training_trigram.N()
    #Type freq for ngrams in test list
    test_unigram_freq = fdist_test_unigram.N()
    test_bigram_freq = fdist_test_bigram.N()
    test_trigram_freq = fdist_test_trigram.N()
    #Types for ngrams in training list
    types_training_unigram = set(training_unigram)
    types_training_bigram = set(training_bigram)
    types_training_trigram = set(training_trigram)
    #Types for ngrams in test list
    types_test_unigram = set(test_unigram)
    types_test_bigram = set(test_bigram)
    types_test_trigram = set(test_trigram)
    # types of ngrams in the test set that are not in the training set
    unigrams_not_in_tr = types_test_unigram - types_training_unigram
    bigrams_not_in_tr = types_test_bigram - types_training_bigram
    trigrams_not_in_tr = types_test_trigram - types_training_trigram
    perecent_unigrams_not_in_tr  = 100 * len(unigrams_not_in_tr)/float(len(types_test_unigram)) 
    perecent_bigrams_not_in_tr  = 100 *len(bigrams_not_in_tr)/float(len(types_test_bigram)) 
    perecent_trigrams_not_in_tr  = 100 *len(trigrams_not_in_tr)/float(len(types_test_trigram)) 

    overall_percent_unigrams = 100 *len(unigrams_not_in_tr)/ float(test_unigram_freq)
    overall_percent_bigrams = 100 * len(bigrams_not_in_tr)/ float(test_bigram_freq )
    overall_percent_trigrams = 100* len(trigrams_not_in_tr)/ float(test_trigram_freq)

    print("""%s\nnum types in unigram training set:%s\n
num types in bigram training set:%s\nnum types in trigram training set: %s """ % (data_type,len(types_training_unigram), \
                                                   len(types_training_bigram), len(types_training_trigram) ))
    print ("""\n%s\nnum types in unigram test set: %s\n
num types in bigram test set:%s\nnum types in trigram test set: %s """ % (data_type,len(types_test_unigram),\
                                                                          len(types_test_bigram), len(types_test_trigram) ))

    print ("""\n%s\nunigrams not in training set: %s\n
bigram not in tr set%s\ntrigrams not in training set: %s """ % (data_type,len(unigrams_not_in_tr),\
                                                                          len(bigrams_not_in_tr), len(trigrams_not_in_tr) ))
    print ("""\n%s\percent unigrams: %s\n
percent bigrams:%s\n percent trigrams: %s """ % (data_type, perecent_unigrams_not_in_tr ,\
                                                                          perecent_bigrams_not_in_tr , perecent_trigrams_not_in_tr))
    print ("""\n%s\noverall percent unigrams: %s\n
overall percent bigrams:%s\noverall percent trigrams: %s """ % (data_type,overall_percent_unigrams,\
                                                                          overall_percent_bigrams, overall_percent_trigrams))
Example No. 46
def brown_trigrams(category):
    """Takes as input the name of a brown category, and returns a list of all of the trigrams in the category."""
    words = ["<s>"]
    words += [word.lower() for word in brown.words(categories=category) if word.isalnum()]
    words.append("</s>")
    return list(trigrams(words))
Example No. 47
def extract(featureList, dir, fileout, n):
    tokenizer = RegexpTokenizer(r'\w+')
    docPos = {}
    docNeg = {}
    docFeatures = {}

    sentiment = "pos"
    for file in os.listdir(dir+sentiment):
        if file.endswith(".txt"):
            features = {}
            sentiment = "pos"
            fp = open(dir+sentiment+"/"+file, 'rb')
            doc = fp.read()
            tokens = [t for t in trigrams(tokenizer.tokenize(doc))]
            for word in featureList:
                if word in tokens:
                    features[word] = 1.0
                else:
                    features[word] = 0.0
            docPos[file] = ""
            docFeatures[file] = features

    sentiment = "neg"
    for file in os.listdir(dir+sentiment):
        if file.endswith(".txt"):
            features = {}
            sentiment = "neg"
            fp = open(dir+sentiment+"/"+file, 'rb')
            doc = fp.read()
            tokens = [t for t in trigrams(tokenizer.tokenize(doc))]
            for word in featureList:
                if word in tokens:
                    features[word] = 1.0
                else:
                    features[word] = 0.0
            docNeg[file] = ""
            docFeatures[file] = features

    f = FreqDist(featureList)
    featureList = [x for (x,f) in f.items()[:n]]
    allData = []
    
    for doc in docFeatures.keys():
        data = []
        count = 1
        if doc in docNeg.keys():
            val =['-1']
        if doc in docPos.keys():
            val =['1']
        for key in featureList:
            data.append("%s:%s" %(count, docFeatures[doc][key]))
            count +=1
        val.extend(data)
        allData.append(" ".join(val))
    # for doc in docFeaturesPos.keys():
    #     data =['+1']
    #     for key in featureList:
    #         data.append("%s:%s" %(count, docFeaturesPos[doc][key]))
    #         count +=1
    #     count = 1
    #     allData.append(" ".join(data))
    fVectorWriter = csv.writer(open(dir+fileout+".txt", 'wb'))
    for d in allData:
        print d
        fVectorWriter.writerow([d])
Example No. 48
            fp.close()

    sentiment = "neg"

    for file in os.listdir(dir+sentiment):
        if file.endswith(".txt"):
            fp = open(dir+sentiment+"/"+file, 'r')
            doc = fp.read()
            #tokens.extend(word_tokenize(doc))
            tokens.extend(tokenizer.tokenize(doc))
            fp.close()
    return tokens


dir = "/home/jch550/dev/JJboost/data/txt_sentoken/"
print "extracting features..."
featuresRaw = extractFeatures(dir)
# print "cleaning features..."
featuresClean = removeStopwords(featuresRaw)
featuresTrigrams = trigrams(featuresClean)
# print "writing to file..."
# fListWriter = csv.writer(open(dir+"featureTrigramsList.txt", 'w'))
# for f in featuresTrigrams:
#     fListWriter.writerow([f])

# features = open(dir+"featureTrigramsList.txt", 'rb')
# featuresList = features.read().split('\r\n')
featuresList = [t for t in featuresTrigrams]
print "extracting features from documents..."
extract(featuresList, dir, "docs_train_trigrams", 500)
print "DONE."
Example No. 49
		domain_words['HCF LCM']=['The','and','The','a' .......] 
		'''	
		domain_words[i] = temp
		for s in file_dumps[i].splitlines():
			question_collection.append((s,i))


print("++++++++++++++++++++++++++++++++++++++++++++++")
#for random_question in question_collection:
#	print(random_question[0])
#	print("-----")

tokensPerQuestion = [nltk.word_tokenize(random_question[0]) for random_question in question_collection]
tokensPerQuestion = [[token.lower() for token in t if token.lower() not in symbols and token.lower() not in stoplist] for t in tokensPerQuestion]
print(tokensPerQuestion[:5])

b = list(chain(*[(list(bigrams(tokens))) for tokens in tokensPerQuestion]))
t = list(chain(*[(list(trigrams(tokens))) for tokens in tokensPerQuestion]))

print(b)
fdist = nltk.FreqDist(b)
plt.figure(figsize=(20, 8))
# plot the top 30 bigrams
fdist.plot(30)

fdist = nltk.FreqDist(t)
plt.figure(figsize=(20, 8))
# plot the top 30 trigrams
fdist.plot(30)

		
Example No. 50
total_word_sents = word_training_set + word_test_set 
total_pos_sents = pos_training_set + pos_test_set
total_words =[]
total_pos = []

# This flattens the total word and total POS sentence lists
for (w, p) in zip(total_word_sents, total_pos_sents):
    for (word, pos) in zip(w, p):
        total_words.append(word)
        total_pos.append(pos)


# Calculates total ngram freq for words
total_word_unigrams = len(total_words)
total_word_bigrams = sum([len(list(bigrams(ngrams))) for ngrams in total_word_sents])
total_word_trigrams = sum([len(list(trigrams(ngrams))) for ngrams in total_word_sents])    
# Calculates total ngram freq for POS
total_pos_unigrams = len(total_pos)
total_pos_bigrams = sum([len(list(bigrams(ngrams))) for ngrams in total_pos_sents])
total_pos_trigrams = sum([len(list(trigrams(ngrams))) for ngrams in total_pos_sents])
#Prints the info calculated above
print("Words Total:\nUnigrams:%s\nBigrams:%s\nTrigrams:%s\n" % \
      (total_word_unigrams, total_word_bigrams, total_word_trigrams))
print("POS Total:\nUnigrams:%s\nBigrams:%s\nTrigrams:%s\n" % \
      (total_pos_unigrams, total_pos_bigrams, total_pos_trigrams))

# Calculates all the relevant stats given the training set, testing set,
# and data type
def getStats2(training_set, test_set, data_type):
    #List of ngrams for training list
    training_unigram = [unigram for sent in training_set for unigram in sent]