Example #1
 def ngrams(self, ns=[2, 3, 5]):
     _p = ["/".join(t) for t in zip(self.SUF, self.POS)]
     for n in ns:
         ngf = {"Ngram(N={})_{}".format(n, "_".join(t)): 1 for t in ngrams(self.SUF, n)}
         ngfp = {"NgramP(N={})_{}".format(n, "_".join(t)): 1 for t in ngrams(_p, n)}
         # update inside the loop so the features for every n are kept, not only the last
         self.features.update(ngf)
         self.features.update(ngfp)
Example #2
 def update_freqs(self, doc_text, id_str):
     for bigram in list(ngrams(doc_text, 2)):
         k = bigram[0] + u"_" + bigram[1]
         self.bicount.update([k])
         self.bigram_to_ids[k] = self.bigram_to_ids.get(k, []) + [id_str]
     for trigram in list(ngrams(doc_text, 3)):
         k = trigram[0] + u"_" + trigram[1] + u"_" + trigram[2]
         self.tricount.update([k])
         self.trigram_to_ids[k] = self.trigram_to_ids.get(k, []) + [id_str]
def get_gram_ratio(w2v, text1, text2, n_grams_1=1, n_grams_2=1, n_jobs=1):
    t1 = list(ngrams(text1.split(), n_grams_1))
    t2 = list(ngrams(text2.split(), n_grams_2))
    pairs = list(iter_product(t1, t2, repeat=1))
    res = list(map(lambda x: similarity(w2v, x), pairs))
    if len(res) == 0:
        return 0
    else:
        return np.mean(res)
Example #4
def ngrams_extract(string):
    if random.random() < SAMPLE_RATE:
        print '[*]',string
    l = list
    grams = l(ngrams(string,2)) + l(ngrams(string,3)) + l(ngrams(string,4)) + l(ngrams(string,5))
    SIZE = 1024
    vec = zeros((SIZE,))
    for t in grams:
        vec[hash(t)%SIZE]+=1
    return log(vec+1.0)
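
The example above hashes character n-grams (n = 2..5) of a string into a fixed 1024-bucket count vector and returns the element-wise log, i.e. the feature-hashing trick. Below is a minimal Python 3 sketch of the same idea, not the original project's code; the helper name is hypothetical, and it swaps the built-in hash() for zlib.crc32 so bucket assignment stays stable across interpreter runs.

# Hedged sketch of hashing character n-grams into a fixed-size vector (hypothetical
# helper, assumed behaviour). zlib.crc32 keeps the buckets reproducible.
import zlib
import numpy as np
from nltk import ngrams

def ngram_hash_vector(text, sizes=(2, 3, 4, 5), dim=1024):
    vec = np.zeros(dim)
    for n in sizes:
        for gram in ngrams(text, n):  # character n-grams of the string
            bucket = zlib.crc32("".join(gram).encode("utf-8")) % dim
            vec[bucket] += 1          # count into 'dim' hash buckets
    return np.log(vec + 1.0)          # log-scaled counts, as in the example above

print(ngram_hash_vector("example.com").shape)  # (1024,)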
Example #5
 def build_ngram(source):
     ngram_set = {}
     for key, value in source.items():
         ngram = []
         for line in value:
             if IS_PAD:
                 ngram.extend(nltk.ngrams(line.strip(), NGRAM_LEVEL, pad_left=True, pad_right=True, pad_symbol='SSS'))
             else:
                 ngram.extend(nltk.ngrams(line.strip(), NGRAM_LEVEL))
         ngram_set[key] = ngram
     return ngram_set
def read_data(type):
    datapath = '../data/' + type + '/'
    data = {}
    maxindex = 500
    count = 0
    unigrams = []
    bigrams = []
    dependecies = []
    for c in string.ascii_uppercase:
        data[c] = {}
        for i in range(1, maxindex):
            filename = datapath + c + str(i)
            txtpath = filename + '.data'
            metapath = filename + '.meta'
            text = read_file(txtpath)

            meta = read_file(metapath)
            if text is not None:
                count += 1
                # print (count)
                data[c][i] = {'text': text[0], 'meta': parse_meta(meta)}
                tokens = nltk.word_tokenize(text[0])

                data[c][i]['tokens'] = tokens
                data[c][i]['length'] = len(tokens)
                s = remove_punct(text[0])
                tokens = nltk.word_tokenize(remove_punct(s.lower()))

                data[c][i]['unigrams'] = list(nltk.ngrams(tokens, 1))
                data[c][i]['bigrams'] = list(nltk.ngrams(tokens, 2))

                # data[c][i]['dependencies'] = dependency_parse(text[0])
                # deppath = filename + '.dep'
                # with open (deppath, 'w') as f:
                #     json.dump(data[c][i]['dependencies'],f)
                # with open (deppath, 'r') as f:
                #     data[c][i]['dependencies'] = json.load(f)


                unigrams.extend(data[c][i]['unigrams'])
                bigrams.extend(data[c][i]['bigrams'])
                # dependecies.extend(data[c][i]['dependencies'])

        data[c]['sequences'] = gen_sequences(data[c])
        data['unigram_model'] = create_model(unigrams, maxfeat=5000, minfreq=3)
        data['bigram_model'] = create_model(bigrams, maxfeat=5000, minfreq=3)
        # data['dependencies'] = create_model(dependecies, maxfeat=5000, minfreq=3)

    # pprint.pprint (data['unigram_model'])
    # pprint.pprint (data['bigram_model'])
    # pprint.pprint (data['dependencies'])

    # print(type, count)
    return data
Example #7
    def extract_ngrams(self, memes):
        for meme_type in memes:
            for meme in memes[meme_type]:
                top_unigrams = meme[0]
                bottom_unigrams = meme[1]
                all_unigrams = top_unigrams + bottom_unigrams

                # materialize the bigram generators so they can be concatenated and reused
                top_bigrams = list(ngrams(meme[0], 2))
                bottom_bigrams = list(ngrams(meme[1], 2))
                all_bigrams = top_bigrams + bottom_bigrams

                # `key` is assumed to be defined in the enclosing scope (e.g. the meme identifier)
                self.add_ngrams(key, top_unigrams, bottom_unigrams, all_unigrams, top_bigrams, bottom_bigrams, all_bigrams)
Example #8
def get_gram_ratio(text1, text2, w2v, n_grams_1=1, n_grams_2=1, w=30, h=2000):
    arr = np.ndarray((w, h), np.float32)
    arr.fill(0)
    t1 = list(ngrams(text1.split(), n_grams_1))
    t2 = list(ngrams(text2.split(), n_grams_2))
    for i in range(len(t1)):
        for j in range(len(t2)):
            try:
                arr[i, j] = w2v.n_similarity(t1[i], t2[j])
            except:
                pass
    return arr
Example #9
    def generate_location_vector(self, branch, index):
        if branch.text is not None:
            branch.text = branch.text.encode('ascii', 'ignore')

            if not branch.getchildren():
                sentences = branch.text.split('. ')
                for sentence in range(0, len(sentences)):
                    #sentence_location = (("{0}[{1}]".format(index, sentence)), sentences[sentence])
                    words = sentences[sentence].split()

                    for doc_word in range(0, len(words)):
                        word_location = (("{0}[{1}][{2}]".format(index, sentence, doc_word)), words[doc_word])
                        # any change in line below should be replicated in corpus.py also
                        symbols = ".,[]();:<>+=&+%!@#~?{}|"
                        whitespace = "                       "
                        replace = maketrans(symbols, whitespace)
                        doc_word = word_location[1].translate(replace)
                        doc_word = doc_word.lstrip()
                        doc_word = doc_word.rstrip()
                        if len(doc_word) > 1 and not len(doc_word) > 16:
                            self.doc_words.append(doc_word)

                    doc_bigrams = bigrams(words)
                    if not len(doc_bigrams) < 1:
                        doc_bigrams = self.n_gram_cleaner(doc_bigrams)
                        for bi_gram in doc_bigrams:
                            bi_gram = ' '.join(bi_gram)
                            self.bi_grams.append(bi_gram)

                    doc_trigrams = trigrams(words)
                    if not len(doc_trigrams) < 1:
                        doc_trigrams = self.n_gram_cleaner(doc_trigrams)
                        for tri_gram in doc_trigrams:
                            tri_gram = ' '.join(tri_gram)
                            self.tri_grams.append(tri_gram)

                    doc_fourgrams = ngrams(words, 4)
                    if not len(doc_fourgrams) < 1:
                        doc_fourgrams = self.n_gram_cleaner(doc_fourgrams)
                        for four_gram in doc_fourgrams:
                            four_gram = ' '.join(four_gram)
                            self.four_grams.append(four_gram)

                    doc_fivegrams = ngrams(words, 5)
                    if not len(doc_fivegrams) < 1:
                        doc_fivegrams = self.n_gram_cleaner(doc_fivegrams)
                        for five_gram in doc_fivegrams:
                            five_gram = ' '.join(five_gram)
                            self.five_grams.append(five_gram)

            else:
                for subtree in range(0, len(branch)):
                    LocationVector.generate_location_vector(self, branch[subtree], ("{0}[{1}]".format(index, subtree)))
Example #10
def lookup_phrases(sentence, noun_types, ignore_case=False):
    phrases = ngrams(sentence, 3) + ngrams(sentence, 2) + ngrams(sentence, 1)
    matches = []
    for phrase in phrases:
        if contains_noun(phrase):
            phrase_str = u' '.join(w.form for w in phrase)
            if ignore_case:
                phrase_str = phrase_str.lower()
            types = noun_types.get(phrase_str)
            if types:
                matches.append((phrase, types))
    return sorted(matches)
Example #11
def get_top_ngrams_tfidf(text, collection, NGRAM=2, cutoff=100, docs=None):
    bigs = nltk.ngrams(text, NGRAM)
    print 'totally', len(bigs), 'bigrams'
    bigs = remove_website_stopwords(bigs)
    freqdist = nltk.FreqDist(bigs)
    topwords = freqdist.keys()[:cutoff]
    # print len(topwords),'topwords:',topwords[:30],freqdist[topwords[0]],freqdist[topwords[1]]
    from math import log
    if True:  # do_tfidf
        df = {}
        df_les = {}
        df_time = {}
        tfidf = {}
        for doc_id, text in docs.items():
            words = [w for w in nltk.ngrams(text, NGRAM)]
            les_id, time_id = doc_id.split(':')
            time_id = time_id.replace('.csv', '')
            time_id = time_id[0:8]
            for w in words:
                df.setdefault(w, set())
                df[w].add(doc_id)
                df_les.setdefault(w, set())
                df_les[w].add(les_id)
                df_time.setdefault(w, set())
                df_time[w].add(time_id)
        _cutoff = 10000
        _topwords = freqdist.keys()[:_cutoff]
        df0, df1, df2 = {}, {}, {}
        for w in _topwords:
            # print w
            try: df0[w] = len(df[w])
            except: df0[w] = 0
            try: df1[w] = len(df_les[w])
            except: df1[w] = 0
            try: df2[w] = len(df_time[w])
            except: df2[w] = 0
            tfidf[w] = freqdist[w] / (1 + df0[w])
        # print df0
        # get sorted words in decreasing order of tfidf values
        sortedwords = sorted(tfidf.items(), key=itemgetter(1), reverse=True)
        sortedwords = sortedwords[:cutoff]
        topwords = [w for w, s in sortedwords]
        sortedwords0 = sorted(df0.items(), key=itemgetter(1), reverse=True)
        sortedwords1 = sorted(df1.items(), key=itemgetter(1), reverse=True)
        sortedwords2 = sorted(df2.items(), key=itemgetter(1), reverse=True)
        print 'TF-IDF topwords:'
        print len(topwords), 'topwords:', sortedwords[:50], freqdist[topwords[0]], freqdist[topwords[1]]
        print sortedwords0[:30]
        print sortedwords1[:30]
        print sortedwords2[:30]
        return topwords, freqdist, df0, df1, df2
    return topwords, freqdist
Example #12
 def __call__(self, words):
     grams = list(ngrams(words, 2)) + list(ngrams(words, 3))
     positives = [
         (i, len(gram), gram) for i, gram in enumerate(grams)
         if self.colls[len(gram)][gram]
     ]
     if not positives:
         return words
     positives.sort(key=lambda x: (x[1], len(words) - x[0]), reverse=True)
     matches, covered = self.__non_overlapping(positives)
     unigrams = [(i, w) for i, w in enumerate(words) if i not in covered]
     catted = sorted(matches + unigrams)
     return zip(*catted)[1]
Example #13
    def generateLocationVector(self, branch, index):
        if branch.text is not None:
            branch.text = branch.text.encode('ascii', 'ignore')

            if not branch.getchildren():
                sentences = branch.text.split('. ')

                for sentence in range(0, len(sentences)):
                    #sentence_location = (("{0}[{1}]".format(index, sentence)), sentences[sentence])
                    words = sentences[sentence].split()

                    for word in range(0, len(words)):
                        word_location = (("{0}[{1}][{2}]".format(index, sentence, word)), words[word])
                        symbols = ",[]();:<>+=&+%!@#~?{}|"
                        whitespace = "                      "
                        replace = maketrans(symbols, whitespace)
                        spec_word = word_location[1].translate(replace)
                        spec_word = spec_word.lstrip()
                        spec_word = spec_word.rstrip()

                        if len(spec_word) > 1 and not len(spec_word) > 16:
                            self.spec_words.append(spec_word)

                    bi_grams = bigrams(words)
                    if not len(bi_grams) < 1:
                        for bi_gram in bi_grams:
                            bi_gram = ' '.join(bi_gram)
                            self.bi_grams.append(bi_gram)

                    tri_grams = trigrams(words)
                    if not len(tri_grams) < 1:
                        for tri_gram in tri_grams:
                            tri_gram = ' '.join(tri_gram)
                            self.tri_grams.append(tri_gram)

                    four_grams = ngrams(words, 4)
                    if not len(four_grams) < 1:
                        for four_gram in four_grams:
                            four_gram = ' '.join(four_gram)
                            self.four_grams.append(four_gram)

                    five_grams = ngrams(words, 5)
                    if not len(five_grams) < 1:
                        for five_gram in five_grams:
                            five_gram = ' '.join(five_gram)
                            self.five_grams.append(five_gram)                    

            else:
                for subtree in range(0, len(branch)):
                    Corpus.generateLocationVector(self, branch[subtree], ("{0}[{1}]".format(index, subtree)))
def jacquard_fivegram(query):
    final=[]
    n=4
    for a in file('enwiktionary.a.list'):
        a=a.rstrip()
        fivegram=set(nltk.ngrams(a,5))
        q_fivegram=set(nltk.ngrams(query,5))
        intersect=q_fivegram.intersection(fivegram)
        union=q_fivegram.union(fivegram)
        sim=float(len(intersect))/len(union)
        
        final.append([a,sim])
    final_sorted= sorted(final,key=lambda sim:sim[1], reverse=True)
    print final_sorted[:10]
Example #15
    def __init__(self, text, random_seed=5, shingle_length=5, minhash_size=200):
        split_text = text.split()
        if len(split_text) < shingle_length:
            raise ValueError(u'input text is too short for specified shingle length of {}'.format(shingle_length))

        self.minhash = []
        self.shingles = ngrams(split_text, shingle_length)

        for hash_seed in generate_random_seeds(minhash_size, random_seed):
            min_value = float('inf')
            for shingle in ngrams(split_text, shingle_length):
                value = mmh3.hash(' '.join(shingle), hash_seed)
                min_value = min(min_value, value)
            self.minhash.append(min_value)
Example #16
 def train(self, words, tagged=False):
     if tagged is True:
         tags = []
         for i in range(len(words)):
             tags.append(words[i][1])
         self.ngrams = list(nltk.ngrams(tags, self.n))
     else:
         # text = nltk.word_tokenize(words)
         tagged_words = nltk.pos_tag(words)
         universal_tags = [nltk.map_tag('en-ptb', 'universal', tag) for word, tag in tagged_words]
         self.ngrams = list(nltk.ngrams(universal_tags, self.n))
     self.frequencies = nltk.FreqDist(self.ngrams)
     self.probs_ng = nltk.MLEProbDist(self.frequencies)
     print self.probs_ng
def count_alliteration(tokens):
    allit_instances = []
    #ignore stopwords
    tokens = [token for token in tokens if not(is_punctuation(token) or is_stopword(token))]
    
    bigrams = nltk.ngrams(tokens,2)
    for one,two in bigrams:
        if has_alliteration(one,two):
            allit_instances.append((one,two))
    trigrams = nltk.ngrams(tokens,3)
    for one,two,three in trigrams:
        #the not avoids double counting
        if has_alliteration(one,three) and not has_alliteration(one,two):
            allit_instances.append((one,two,three))
    return len(allit_instances)
Example #18
def calc_precision(n, translation, reference):
    total = 0
    correct = 0
    for i in range(min(len(translation), len(reference))):
        # materialize the generators so len() and repeated membership tests work
        tra_ngrams = list(nltk.ngrams(translation[i].split(), n))
        ref_ngrams = list(nltk.ngrams(reference[i].split(), n))
        total += min(len(ref_ngrams), len(tra_ngrams))
        for ng in tra_ngrams:
            if ng in ref_ngrams:
                correct += 1
    print("total: " + str(total) + ", correct: " + str(correct))
    if total == 0:
        return 0
    precision = float(correct) / total
    return precision
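
Since calc_precision only depends on nltk, a short usage sketch (hypothetical inputs, assuming the function above and import nltk) shows the per-sentence n-gram precision it computes:

# Hypothetical candidate/reference pair; unigram precision is 5 matches out of 6 tokens.
translation = ["the cat sat on the mat"]
reference = ["the cat is on the mat"]
print(calc_precision(1, translation, reference))  # prints the totals, then ~0.833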
Example #19
def get_date_from_utterance(tokenized_utterance: List[Token],
                            year: int = 1993) -> List[datetime]:
    """
    When the year is not explicitly mentioned in the utterance, the query assumes that
    it is 1993 so we do the same here. If there is no mention of the month or day then
    we do not return any dates from the utterance.
    """

    dates = []

    utterance = ' '.join([token.text for token in tokenized_utterance])
    year_result = re.findall(r'199[0-4]', utterance)
    if year_result:
        year = int(year_result[0])
    trigrams = ngrams([token.text for token in tokenized_utterance], 3)
    for month, tens, digit in trigrams:
        # This will match something like ``september twenty first``.
        day = ' '.join([tens, digit])
        if month in MONTH_NUMBERS and day in DAY_NUMBERS:
            try:
                dates.append(datetime(year, MONTH_NUMBERS[month], DAY_NUMBERS[day]))
            except ValueError:
                print('invalid month day')

    bigrams = ngrams([token.text for token in tokenized_utterance], 2)
    for month, day in bigrams:
        if month in MONTH_NUMBERS and day in DAY_NUMBERS:
            # This will match something like ``september first``.
            try:
                dates.append(datetime(year, MONTH_NUMBERS[month], DAY_NUMBERS[day]))
            except ValueError:
                print('invalid month day')

    fivegrams = ngrams([token.text for token in tokenized_utterance], 5)
    for tens, digit, _, year_match, month in fivegrams:
        # This will match something like ``twenty first of 1993 july``.
        day = ' '.join([tens, digit])
        if month in MONTH_NUMBERS and day in DAY_NUMBERS and year_match.isdigit():
            try:
                dates.append(datetime(int(year_match), MONTH_NUMBERS[month], DAY_NUMBERS[day]))
            except ValueError:
                print('invalid month day')
        if month in MONTH_NUMBERS and digit in DAY_NUMBERS and year_match.isdigit():
            try:
                dates.append(datetime(int(year_match), MONTH_NUMBERS[month], DAY_NUMBERS[digit]))
            except ValueError:
                print('invalid month day')
    return dates
Example #20
def generate(starting_point='i', crp=nltk.corpus.brown, ngram=2):
    words = nltk.corpus.brown.words(categories='news')

    ngrams = nltk.ngrams([w.lower() for w in words], ngram)

    cdf = nltk.ConditionalFreqDist(ngrams)
    # print cdf.viewitems(
    # for item in cdf.viewitems():
        # print item
    word = starting_point.lower()
    result = [word]

    while word not in [".", "?", "!", "'", ";", "`", "``"]:
        prev_word = result[-1]

        for new_word in cdf[word]:
            if new_word not in result[-len(result) / 2:]:
                prev_phrase = [prev_word, new_word]
                if not ' '.join(prev_phrase) in ' '.join(result):
                    word = new_word

        if word == result[-1]:
            break
        result.append(word)

    result = ' '.join(result)
    return result
Example #21
def main():

    text = []
    with open("development.set", 'r') as filedata:
        for line in filedata:
            l = line.split()
            if len(l) >=6:
                text.append([l[0], l[1], l[2], l[3], l[4]])

    posTagger(text)
    entityTagger()

    # print(wiki_lookup("Barack Obama", "PERSON"))
    # class3 = NERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
    #                    'stanford-ner/stanford-ner.jar')
    # print(class3.tag(["Barack Obama"]))
    # print(wordNettTagger("Barack Obama"))

    words = []
    with open("pos.tagged", 'r') as filedata:
        for line in filedata:
            l = line.split()
            if l[5] == "NN" or l[5] == "NNP":
                words.append(l[4])
        bigram_list = nltk.ngrams(words, 2)

    tagged_bigrams = ngramTagger(bigram_list)
    tagChecker(tagged_bigrams)
    locationCheck()
    wikification()
Example #22
def POS_Ngram(N, example_set, i):
    N_grams = dict()
    count = 0
    for para in example_set:
        if i == 0: # get first sentence
            tokens = word_tokenize(para.first)
        else: # get ith sentence
            para.order_sentence()
            tokens = word_tokenize(para.ordered_sentences[i-1])
            #tokens = word_tokenize(para.scrambled_sentences[int(para.correct_order[i-1])-1])
        tagset = None
        #print(tokens)
        tokens = _pos_tag(tokens, tagset, tagger)

        tags = [x[1] for x in tokens] # take POS tags only

        n_tags = list(ngrams(tags, N))

        for tag_set in n_tags:
            count += 1
            if tag_set in N_grams:
                N_grams[tag_set] += 1
            else:
                N_grams[tag_set] = 1  # first occurrence of this tag set
    # Normalize N_gram counts by total number of N grams for this set of sentences
    for ngram, num in N_grams.items():
        N_grams[ngram] = num/count
    return N_grams
Example #23
    def clean_up_txt(self, txt):
        # strip EOL, apostrophes, numbers, HTML, all other punctuation, and then
        # break into sentences
        ptn1 = re.compile(r"""\ba\b|\ban\b|\bthe\b|\band\b|\bthat\b|\bthis\b|
        \bto\b|\bas\b|\bfor\b|\bof\b|\bin\b|\byou\b|\byour\b|\bbut\b|
        \bwith\b|\bon\b|\bis\b|\bby\b|\bfrom\b|\btheir\b|\bit\b|\bits\b|
        \btheir\b|\bor\b|\bat\b|\bwhich\b|\bcan\b|\binc\b|\bhas\b|\bhave\b|
        \balso\b|\bthan\b|\ball\b|\bbe\b|\bthey\b|\bwas\b|\bsuch\b|
        \binto\b""", re.X)
        ptn2 = re.compile(r'\&#[0-9A-F]{4};')
        # words beginning with digits--get rid of digits
        ptn3 = re.compile(r'\b[0-9]+')
        # end of clause or sentence to make into periods ,;:!?
        ptn4 = re.compile(r'[!\?:;]')
        # other punctuation: get rid of
        ptn5 = re.compile(r'[\"$\(\)&\/,]')
        # Break into sentences
        ptn6 = re.compile(r'\.[ ]+(?=[A-Z])')

        TAG_RE = re.compile(r'<[^>]+>')
        txt = TAG_RE.sub("", txt.replace("\n", " ").encode('ascii', 'ignore').\
                         replace('\\/', '/').replace("'", ""))
        txt = ptn5.sub(" ", ptn4.sub(".", ptn3.sub(" ", ptn2.sub("", txt))))
        sents = ptn6.split(txt)

        grams = set([])
        for sent in sents:
            new_sent = ptn1.sub("", sent.lower().replace(".", " ")).split()
            # generate n-grams
            for n in range(2, self.max_ngrams + 1):
                grams.update(set(ngrams(new_sent, n)))

        return grams
Example #24
 def __fromcursor__(self):
     self.data = []
     for document in c['Body'][self.source].find({
         'term' : self.term,
         'date' : {'$gt' : self.start_date, '$lt' : self.stop_date},
         'str_type' : self.str_type.__name__,
         'n' : self.n
     }, {
     'documents' : 1
     }, no_cursor_timeout=True):
         for _id in document['documents']:
             comment = get_comment(_id, self.source)
             gram_list = []
             for ngram in ngrams(comment[self.str_type.__name__], self.n):
                 gram_list.append(Gram(ngram).term)
             if self.position:
                 # offset from the configured position; bare `position` would be a NameError
                 loc = gram_list.index(self.term) + self.position
                 self[gram_list[loc]] += 1
             else:
                 gram_list.remove(self.term)
                 for gram in gram_list:
                     self[gram] += 1
     try:
         self * (sum(self) ** -1)
     except ZeroDivisionError:
         raise ValueError("No comments with term {} found".format(self.term))
     self.__tocollection__()
def generate_ngrams(line):
    result = []
    line = line.strip()
    for sentence in line_filter(' '.join(default_tokenize_func(line))):
        tokens_plain = []
        sentence = sentence.split()
        i = 0
        while i < len(sentence):
            for j in range(min(len(sentence), i+20), i, -1):
                token = ' '.join(sentence[i:j])
                if i+1 == j and i == 0:
                    # if first word in sentence -> do not attempt to link, could be wrong (Apple)
                    tokens_plain.append(token.lower())
                elif token in unambiguous_labels:
                    # TODO: check it doesn't span titles
                    uri = unambiguous_labels[token]
                    # get types
                    tokens_plain.append('<dbpedia:'+uri+'>')
                    i = j-1
                    break
            i += 1
        for n in range(1, N+1):
            for ngram in nltk.ngrams(tokens_plain, n):
                result.append((' '.join(ngram), 1))
    return result
Example #26
    def mapper(self, _, line):
        
        N = 4

        filename = mrjob.compat.jobconf_from_env('map.input.file')
        filename = ntpath.basename(filename)
        # currently, the file name is like "595F_1852_01_01_0102.txt"
        # I just get rid of the last page number part "_0102.txt" 
        # to obtain the filename "595F_1852_01_01"
        fname = filename[:filename.rfind("_")]

        text = self.progPunctuation.sub(' ', line)
        tokens = text.split()
        toks = [w for w in tokens if self.progContainsALetterOrNumber.search(w)]

        #d = {}
        #for n in range(1, N+1):
        #    d[n] = {}

        for n in range(1, N+1):
            d = {}
            for ng in nltk.ngrams(toks, n):
                ngram = " ".join(ng)
                if ngram in d:
                    d[ngram] += 1
                else:
                    d[ngram] = 1
            # pickle
            for w, freq in d.items():
                yield (fname, n), (w, freq)
Example #27
def ngrams_sentences(sentences, n):
    ngram_sentences = []
    for sentence in sentences:
        for i in range(n - 1):
            sentence = ['start{}'.format(i)] + sentence + ['end{}'.format(i)]
        ngram_sentences.append([gram for gram in ngrams(sentence, n)])
    return ngram_sentences
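
A quick usage sketch (hypothetical input, assuming the function above and from nltk import ngrams) makes the start/end padding visible:

# One tokenized sentence; for n=2 a single start0/end0 pad is added on each side.
sentences = [["the", "cat", "sat"]]
print(ngrams_sentences(sentences, 2))
# [[('start0', 'the'), ('the', 'cat'), ('cat', 'sat'), ('sat', 'end0')]]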
Example #28
def build_LM(in_file):
    """
    build language models for each label
    each line in in_file contains a label and an URL separated by a tab(\t)
    """
    print 'building language models...'
    # This is an empty method
    # Pls implement your code in below
    LM = {"malaysian" : Counter(),
          "indonesian": Counter(),
          "tamil"     : Counter()
          }

    #total_count before smoothing
    total_count = {"malaysian" : 0,
                   "indonesian": 0,
                   "tamil"     : 0
                  }
    num_4gram = {"malaysian" : 0,
                 "indonesian": 0,
                 "tamil"     : 0
                }

    with open(in_file, 'r') as f:
        #scan line by line to collect 4grams counts
        for line in f:
            label = line.split()[0]                                     #get the label
            string = rm_nonalphabet_char(line.split(' ', 1)[1].lower()) #get the actual text after the label

            fourgram = nltk.ngrams(string, 4)      #4grams
            fourgramDist = nltk.FreqDist(fourgram)
            LM[label] += fourgramDist
            # LM[label] += Counter([string[i:i+ngramL].lower() for i in range(0, len(string) - ngramL) if string[i:i+ngramL]])

        #add unseen 4grams from other languages to a specific language
        for label in LM.keys():
            other_labels  = [k for k in LM.keys() if k != label]

            for other in other_labels:
                #take the 4-grams counted in LM[label] but missing from LM[other], with value 0
                add_dict = {k: 0 for k in LM[label] if k not in LM[other]}
                #add these unseen 4-grams to LM[other]
                LM[other].update(add_dict)

        #total_count before smoothing
        total_count["malaysian"] = sum(LM["malaysian"].values())
        total_count["indonesian"] = sum(LM["indonesian"].values())
        total_count["tamil"] = sum(LM["tamil"].values())

        #number of unique 4grams in each label before smoothing
        num_4gram["malaysian"] = len(LM["malaysian"])
        num_4gram["indonesian"] = len(LM["indonesian"])
        num_4gram["tamil"] = len(LM["tamil"])

        #smoothing + 1 and convert to log base10 scale
        for label in LM.keys():
            for k in LM[label].keys():
                LM[label][k] = log(LM[label][k] + 1, 10) - log(total_count[label] + num_4gram[label], 10)

    return LM
Example #29
def computeProbability(n, tags, sluice, table):
	result = 0.0
	count = 0

	# get ngrams and iterate over them to
	# see how many match in the table
	ngrams = list(nltk.ngrams(tags, n))
	for ngram in ngrams:
		count += 1
		if sluice not in table[str(n)]:
			sluiceKey = random.choice(table[str(n)].keys())
			while sluiceKey == "key":
				sluiceKey = random.choice(table[str(n)].keys())

			result += table[str(n)][sluiceKey].get(" ".join(list(ngram)), 0.0)
		else:
			result += table[str(n)][sluice].get(" ".join(list(ngram)), 0.0)

	# return a pseudoprobability which
	# is the average probability of 
	# all ngrams in this sentence; a number
	# which is always between 0 and 1
	if count == 0:
		return 0.0
	else:
		return result/count
Example #30
    def __call__(self, feat_string): # generate features!
        base_feats = feat_string.split(TOKENSEP)
        
        # make ngrams from lemmatized words
        for n in xrange(self.min_ngram, self.max_ngram+1):
            for ng in nltk.ngrams([bf for bf in base_feats if bf.startswith("LEMM:") and bf[5:].lower() not in self.stop_words], n):
                # drop the "LEMM:" prefix by position; strip("LEMM:") would also eat matching letters
                s = ' '.join(w[5:] for w in ng)
                if len(ng) > 1:
                    yield s
                else: # only strip stop words for 1-grams
                    if s not in self.stop_words:
                        yield s
        
        # use the original words, but not stop words
        #for bf in [bf for bf in base_feats if bf.startswith("ORIG:") and bf[5:].lower() not in self.stop_words]:
        #    yield bf
        
        # this will overlap with ORIG:, but if we don't use ORIG, then it'll work
        for bf in [bf for bf in base_feats if bf.startswith("NNP:")]:
            yield bf

        #for bf in [bf for bf in base_feats if bf.startswith("TENSE:")]:
        #    yield bf

        ##for bf in [bf for bf in base_feats if bf.startswith("SYN:")]:
        ##    yield bf
        
        # wiki categories
        for bf in [bf for bf in base_feats if bf.startswith("WIKICAT:")]:
            yield bf
Example #31
def getFreqDist(text, n):
    ngramsObject = nk.ngrams(text, n)
    freqDist = nk.FreqDist(ngramsObject)
    return freqDist
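
A minimal usage sketch (hypothetical input, assuming the function above and import nltk as nk); note that passing a plain string yields character n-grams rather than word n-grams:

# Character bigram frequencies of a short string.
print(getFreqDist("banana", 2).most_common(2))
# [(('a', 'n'), 2), (('n', 'a'), 2)]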
Example #32
#-	Using Lemmatization, apply lemmatization on the remaining words
lemmatizer = WordNetLemmatizer()
frm_lemma = []
for word in frm_word:
    fr_lema = lemmatizer.lemmatize(word.lower())
    frm_lemma.append(fr_lema)

print("\n -----------lemmetaizion----------  ")
print(frm_lemma)
frm_pos = pos_tag(frm_lemma)

print("--------------BIGRAM-------------")

n = 2
gram = []
bigrams = ngrams(frm_lemma, n)
for grams in bigrams:
    gram.append(grams)
print(gram)
str1 = " ".join(str(x) for x, y in frm_pos)
str1_word = word_tokenize(str1)
print("--------Bi-Grams with word  frequency----------")
fdist1 = nltk.FreqDist(gram)
top_fiv = fdist1.most_common()
top_five = fdist1.most_common(5)

top = sorted(top_fiv, key=itemgetter(0))
print(top)
print('---------Top 5 bi-grams word freq with count--------')
print(top_five)
sent1 = sent_tokenize(frm)
Example #33
    def addTurn(self, turn):
        """
        Adds a turn to this tracker
        :param turn: The turn to process and add
        :return: A hypothesis of the current state of the dialog
        """

        hyps = copy.deepcopy(self.hyps)

        goal_stats = defaultdict(lambda: defaultdict(float))

        # Obtaining the best hypothesis from the ASR module
        best_asr_hyp = turn['input']["live"]['asr-hyps'][0]["asr-hyp"]

        # English stopwords set with punctuation
        stop = stopwords.words('english') + list(string.punctuation)

        # Tokenize the best hypothesis on the whitespaces
        tkns = word_tokenize(best_asr_hyp)

        # Remove stop words and also shingle the tokens
        processed_hyp = [word for word in tkns if word not in stop] + [tup[0] + " " + tup[1] for tup in ngrams(tkns, 2)]

        # Manually change from "moderately"/"affordable" to "moderate" and "cheaper" to "cheap"
        for idx, word in enumerate(processed_hyp):
            if word == "moderately" or word == "affordable":
                processed_hyp[idx] = "moderate"
            if word == "cheaper":
                processed_hyp[idx] = "cheap"

        if processed_hyp:

            # Obtain the ontology information
            pricerange_options = self.ontology["informable"]["pricerange"]
            food_options = self.ontology["informable"]["food"]
            area_options = self.ontology["informable"]["area"]

            state_updated = False

            # SIMPLE Matching
            # Iterate through all the words in the best asr hypothesis
            # If the word is present in the ontology update that slot with the word

            for hyp_word in processed_hyp:

                if hyp_word in food_options:
                    goal_stats["food"][hyp_word] += 1.0
                    state_updated = True

                if hyp_word in area_options:
                    goal_stats["area"][hyp_word] += 1.0
                    state_updated = True

                if hyp_word in pricerange_options:
                    goal_stats["pricerange"][hyp_word] += 1.0
                    state_updated = True

            # If this simple matching was not able to match anything then we will use BERT w/ cosine-similarity
            if not state_updated:

                # Use BERT to encode all the words in the sentence
                encoded_hyp = np.array(self.bc.encode(processed_hyp))

                # Use the cosine sim between the previous encoding and the encoded knowledge base
                cosine_sim = cosine_similarity(encoded_hyp, self.encoded_kb)

                for idx, sub_arr in enumerate(cosine_sim):

                    # For every word in the sentence obtain the word in the KB that maximizes the cosine sim
                    argmax_index = np.argmax(sub_arr)

                    # assuming that if it's lower than 0.97 then it's probably a mistake
                    # (Not many cases have 0.97 cosine sim, maybe none actually)
                    if sub_arr[argmax_index] >= 0.97:

                        kb_word = self.knowledge_base[argmax_index]
                        print(f"BERT: Word in query: {processed_hyp[idx]} \t matched with {kb_word}")

                        if kb_word in food_options:
                            goal_stats["food"][kb_word] += 1.0

                        if kb_word in area_options:
                            goal_stats["area"][kb_word] += 1.0

                        if kb_word in pricerange_options:
                            goal_stats["pricerange"][kb_word] += 1.0

            super(BertTracker, self).fill_goal_labels(goal_stats, hyps)
            super(BertTracker, self).fill_joint_goals(hyps)

        self.hyps = hyps
        return self.hyps
Example #34
from nltk.book import *
from nltk import FreqDist
from nltk import bigrams
from nltk import ngrams

print(len(text6) / len(set(text6)))

fdist = FreqDist(text6)
result = fdist.most_common(20)
print(result)

bigrams = bigrams(text6)
bigramsDist = FreqDist(bigrams)
print(bigramsDist[('Sir', 'Robin')])

fourgrams = ngrams(text6, 4)
for fourgram in fourgrams:
    if fourgram[0] == 'coconut':
        print(fourgram)
Example #35
for i, book in enumerate(books):
    parts = [file for file in filenames if book in file]
    parts.sort()

    for part in parts:
        with open(f"./outputs/sentences_extractor/{part}", "r") as file:

            for j, line in enumerate(file):
                line = re.sub(r",|;|:", "", line)
                line = line.lower()
                line = line.strip()

                if "ngrams" in args.case:
                    n_gram = int(args.case.replace("ngrams",""))
                    line_parts = [reduce(lambda acc, x: acc + " " + x, group) for group in nltk.ngrams(line.split(), n_gram)]
                else:
                    line_parts = line.split(" ")

                for expression in expressions.index:
                    count = line_parts.count(expression.lower())

                    if count > 0:
                        compilator_lines[expression][i] += [total_lines + j+1 for i in range(count)]

            total_lines += j+1
        print(f"{part} | {total_lines}")
    sentences_break.append(total_lines)


with open(f'data/proust-sentences_{args.case}.json', 'w') as fp:
Example #36
text = soup.p.contents[0]

text_1 = text.lower()

text_2 = re.sub('\W', ' ', text_1)

from nltk import word_tokenize
from nltk import bigrams
from nltk import trigrams
from nltk import ngrams

text_3 = word_tokenize(text_2)

text_3_bi = bigrams(text_3)
text_3_tri = trigrams(text_3)
text_3_n = ngrams(text_3, 4)

stop_words = urlopen(
    'http://jmlr.org/papers/volume5/lewis04a/a11-smart-stop-list/english.stop'
).read().split('\n')

##we can then identify the stop words and then eliminate them from the list

##this is code that executes a very simple for loop to check the list
text_4 = [x for x in text_3 if x not in stop_words]

##you can check what was removed with:

text_rem = [x for x in text_3 if x not in text_4]

##we're going to use a similar format to apply various stemming/lemmatizing/synonyms algorithms
Example #37
def cleaner(line):
    strippedList = re.sub(r'[^a-zA-Z ]+', ' ',
                          line.replace("'", "")).lower().replace(
                              "advertisement",
                              "").replace("\t",
                                          " ").strip().replace("\n", " ")
    strippedList = ' '.join([
        word for word in strippedList.split()
        if word not in stopwords.words('english')
    ])
    return strippedList


# input comes from STDIN (standard input)
for line in sys.stdin:
    # remove leading and trailing whitespace
    line = cleaner(line).strip()

    # split the line into words

    bigrams = ngrams(line.split(), N)
    words = [" ".join(grams) for grams in bigrams]

    # increase counters
    for word in words:
        # write the results to STDOUT (standard output);
        # what we output here will be the input for the
        # Reduce step, i.e. the input for reducer.py
        # tab-delimited; the trivial word count is 1
        print '%s\t%s' % (word, 1)
Example #38
def getBigrams(tokens):
    LOGGER.debug("Bigrams...")
    return [g for g in ngrams(tokens, 2)]
Example #39
    except Exception as e:
        print(str(e))


process_content()

# lemmatizing the data
print("lemmatizing:")

lem = []
for w in wordtokens:
    lem.append(lemmatizer.lemmatize(w))

print(lem)

# n-grams of the data (note: ngrams(text, 5) below, despite the "trigram" name)
print("trigram")
print("")
sent = " i am studying in umkc which is a good university"
text = word_tokenize(sent)
trigram = ngrams(text, 5)
for t in trigram:
    print(t)

#Named Entity Recognition of text
print("Named Entity Recognition:")

NER = []
NER.append(ne_chunk(pos_tag(fileread)))
print(NER)
Example #40
vocab = list(set(flatten(corpus)))
print(len(vocab))

word2index = {'<UNK>': 0}
for i, v in enumerate(vocab):
    if word2index.get(v) is None:
        word2index[v] = i + 1

index2word = {v: k for k, v in word2index.items()}

## Context and centers
WINDOW_SIZE = 5
win_pairs = flatten([
    list(
        nltk.ngrams(['<DUMMY>'] * WINDOW_SIZE + c + ['<DUMMY>'] * WINDOW_SIZE,
                    WINDOW_SIZE * 2 + 1)) for c in corpus
])

train_data = []
for word_pair in win_pairs:
    for i in range(WINDOW_SIZE * 2 + 1):
        if i == WINDOW_SIZE and word_pair[i] == '<DUMMY>':
            continue
        train_data.append((word_pair[WINDOW_SIZE], word_pair[i]))

X_tensor = []
y_tensor = []

for data in train_data:
    X_tensor.append(prepare_word(data[0], word2index).view(1, -1))
    y_tensor.append(prepare_word(data[1], word2index).view(1, -1))
Example #41
from nltk import FreqDist
from nltk import Text
from nltk.book import text6

from nltk import ngrams

# frequency of the ten most common tokens
# fdist = FreqDist(text6)
# print(fdist.most_common(10))


# number of times 'Sir', 'Robin' occurs
fourgrms = ngrams(text6, 2)
fourgrmsDist = FreqDist(fourgrms)
print(fourgrmsDist['Sir', 'Robin'])
Example #42
# lemmas
[x.lemma_ for x in doc]

# POS tags
[x.tag_ for x in doc]

# N-grams
from nltk import ngrams
from collections import Counter

# get n-gram counts for 10 documents
grams = []
for i, row in df.iterrows():
    tokens = row['text'].lower().split()  # get tokens
    for n in range(2, 4):
        grams += list(ngrams(tokens,
                             n))  # get bigrams, trigrams, and quadgrams
    if i > 50:
        break
Counter(grams).most_common()[:8]  # most frequent n-grams

# Tokenizers
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(
    min_df=0.001,  # at min 0.1% of docs
    max_df=.8,
    max_features=1000,
    stop_words='english',
    ngram_range=(1, 3))
X = vec.fit_transform(df['text'])

# save the vectors
Example #43
    def transform(self,
                  texts: List[str],
                  annotations: List[str] = []) -> np.ndarray:
        stopwords_enabled = 'stopwords' in self.features
        bigrams_enabled = 'bigrams' in self.features
        trigrams_enabled = 'trigrams' in self.features
        rare_pos_enabled = 'rare_pos_tags' in self.features
        annotation_enabled = 'annotation' in self.features
        vector_length = (stopwords_enabled * len(self.stopwords)) + \
                        (trigrams_enabled * len(self.trigrams_by_frequency)) + \
                        (bigrams_enabled * len(self.bigrams_by_frequency)) + \
                        (rare_pos_enabled * len(self.rare_pos_tags_by_frequency)) + \
                        (annotation_enabled * len(self.annotations))

        matrix = np.empty([len(texts), vector_length])
        for index, text in enumerate(texts):
            tokens = word_tokenize(text)
            vector = np.zeros(vector_length)

            if stopwords_enabled:
                for j, stopword in enumerate(self.stopwords['text']):
                    c = tokens.count(stopword)
                    vector[j] += c

            if bigrams_enabled:
                offset = (stopwords_enabled * len(self.stopwords))
                bigrams = list(ngrams(text, 2))
                for k, bigram in enumerate(self.bigrams_by_frequency):
                    c = bigrams.count(bigram)
                    vector[offset + k] += c

            if trigrams_enabled:
                offset = (stopwords_enabled * len(self.stopwords)) + (
                    bigrams_enabled * len(self.bigrams_by_frequency))
                trigrams = list(ngrams(text, 3))
                for l, trigram in enumerate(self.trigrams_by_frequency):
                    c = trigrams.count(trigram)
                    vector[offset + l] += c

            if rare_pos_enabled:
                offset = (stopwords_enabled * len(self.stopwords)) \
                         + (bigrams_enabled * len(self.bigrams_by_frequency)) \
                         + (trigrams_enabled * len(self.trigrams_by_frequency))
                pos_bigrams_per_text = list(ngrams(pos_tag(tokens), 2))
                for m, pos in enumerate(self.rare_pos_tags_by_frequency):
                    c = pos_bigrams_per_text.count(pos)
                    vector[offset + m] += c

            if annotation_enabled:
                offset = (stopwords_enabled * len(self.stopwords)) \
                         + (bigrams_enabled * len(self.bigrams_by_frequency)) \
                         + (trigrams_enabled * len(self.trigrams_by_frequency)) \
                         + (rare_pos_enabled * len(self.rare_pos_tags_by_frequency))

                matches = get_matches(annotations[index])
                for n, annotation in enumerate(self.annotations['text']):
                    c = matches.count(annotation)
                    vector[offset + n] += c

            matrix[index] = vector
        return matrix
Example #44
def extract_bigrams(texts: List[str]) -> List[Tuple[str]]:
    bigrams = chain(*[ngrams(text, 2) for text in texts])
    bigram_frequency = Counter(bigrams)
    return [freq[0] for freq in bigram_frequency.most_common(100)]
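
Because ngrams iterates whatever sequence it receives, passing raw strings here produces character bigrams. If word bigrams are what the caller intends, a hedged variant (an assumption, not the original author's code) would tokenize first:

# Hypothetical word-level variant of extract_bigrams: split each text into tokens,
# take bigrams, then keep the 100 most frequent.
from collections import Counter
from itertools import chain
from typing import List, Tuple
from nltk import ngrams

def extract_word_bigrams(texts: List[str]) -> List[Tuple[str, str]]:
    bigrams = chain.from_iterable(ngrams(text.split(), 2) for text in texts)
    return [gram for gram, _ in Counter(bigrams).most_common(100)]

print(extract_word_bigrams(["the cat sat on the mat", "the cat ran"])[:3])
# [('the', 'cat'), ('cat', 'sat'), ('sat', 'on')]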
Example #45
def extract_features(document):
    n_gram = 3
    ngram_vocab = nltk.ngrams(document, n_gram)
    features = dict([(ng, True) for ng in ngram_vocab])
    return features
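
A small usage sketch (hypothetical token list, assuming the function above and import nltk): every trigram of the document becomes a True-valued presence feature, the feature-dict format NLTK classifiers consume.

# Hypothetical document: a list of tokens.
tokens = "the quick brown fox jumps".split()
print(extract_features(tokens))
# {('the', 'quick', 'brown'): True, ('quick', 'brown', 'fox'): True, ('brown', 'fox', 'jumps'): True}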
Example #46
    b_values = []
    
    for item in data:
        
        text = functions.clean_text(item['full_text'])
        text = functions.give_emoji_free_text(text)
        text = functions.additional_remove(text)
        tokens = WhitespaceTokenizer().tokenize(text)
        tokens = [x.lower() for x in tokens]

        for t in tokens:
            if t in B:
                b_values.append((item['id_str'], item['created_at']))
                res.append(item['user']['screen_name']+'\t'+item['id_str']+'\t'+text+'\t'+item['created_at'])
    
        for i in ngrams(tokens, 2):
            if ' '.join([j for j in i]) in A:
                a_values.append((item['id'], item['created_at']))
                res.append(item['user']['screen_name']+'\t'+item['id_str']+'\t'+text+'\t'+item['created_at'])
    
    a_values_counter+=len(a_values)
    b_values_counter+=len(b_values)
    
    if len(b_values)>0 or len(a_values)>0:
        temp['B_values'] = b_values
        temp['A_values'] = a_values
        file_name = file_name.split('.',-1)[0]
        result[file_name] = temp
    sys.stdout.write('\r%d/%d'%(counter,num_of_files))

print("\nnumber of tweets;", tweets_counter)
            f['article'] = text  # update list json, add article content
    return files


result = get_aritcle('data')

texts = [t['article'] for t in result]
len(texts)

clean_ts = clean_text(result, 'data')

from nltk import ngrams, tokenize

token = tokenize.word_tokenize(' '.join(clean_ts))

ngm = ngrams(token, 2)

grams = list(set(token)) + [' '.join(list(n)) for n in list(ngm)]

import tensorflow as tf
import tensorflow_hub as hub

model_url = "https://tfhub.dev/google/universal-sentence-encoder/2"
embed = hub.Module(model_url)

tf.logging.set_verbosity(tf.logging.ERROR)

with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    embeding = session.run(embed(grams))
Example #48
import nltk
from nltk import ngrams, ne_chunk, wordpunct_tokenize, pos_tag
from nltk.stem import LancasterStemmer, WordNetLemmatizer

with open('output.txt', 'r', encoding='utf-8') as f:
    raw = f.read()
#Tokenization
wtokens = nltk.word_tokenize(raw)
words = [word.lower() for word in wtokens if word.isalpha()]
print(words)
#Adding tag
print(nltk.pos_tag(words))
lStem = LancasterStemmer()
print(
    "Lancaster Stemming :----------------------------------------------------- \n"
)
for tok in words:
    print(lStem.stem(str(tok)))
lemmatizer = WordNetLemmatizer()
print(
    "Lemmatization ------------------------------------------------------------:\n"
)
for tok in words:
    print(lemmatizer.lemmatize(str(tok)))
print("Trigrams --------------------------------------------:\n")
trigram = []
x = 0

trigram.append(list(ngrams(words, 3)))
print(trigram)
print("NER-------------------------------------\n")
print("NER : \n", ne_chunk(pos_tag(wordpunct_tokenize(str(words)))))
def get_ngrams(text, n):
    n_grams = nltk.ngrams(word_tokenize(text), n)
    return [' '.join(grams) for grams in n_grams]
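
A quick usage sketch for get_ngrams above (hypothetical input, assuming import nltk, from nltk.tokenize import word_tokenize, and that the punkt tokenizer data is installed):

# Word-level bigrams of a short sentence, joined back into strings.
print(get_ngrams("the quick brown fox", 2))
# ['the quick', 'quick brown', 'brown fox']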
with open('logos.txt', 'r', encoding="latin1") as myfile:
    my_string = myfile.read().replace('\n', '')

exclude = set(string.punctuation)
string = ''.join(ch for ch in my_string if ch not in exclude)

tokens = word_tokenize(string)
text = nltk.Text(tokens)

#array is the tuple of ngrams, array2 is the count of appearances, array1 is joined tuples, array 1 & 2 can be zipped into a dataframe
array = []
array2 = []

### length of n-grams is second argument to nltk.ngrams(tokens,length_goes_here)
bgs = nltk.ngrams(tokens, 2)
fdist = nltk.FreqDist(bgs)
for k, v in fdist.items():
    if v > 10:
        array.append(k)
        array2.append(v)

array1 = []
for i in range(len(array)):
    x = ' '.join(map(str, array[i]))
    array1.append(x)

df = pd.DataFrame({
    'phrase': array1,
    'count': array2
}).sort_values(by="count", ascending=False)
Example #51
def makeMainList():
    """AUDIO_FILE = "/Users/kushamaharshi/Desktop/TERM PROJECT!/tp1/lastSavedFile.wav"
    r = sr.Recognizer()
    with sr.AudioFile(AUDIO_FILE) as source:
        audio = r.record(source)  # read the entire audio file
    
    
    # Speech recognition using Google Speech Recognition
    try:
        # for testing purposes, we're just using the default API key
        # to use another API key, use `r.recognize_google(audio, key="GOOGLE_SPEECH_RECOGNITION_API_KEY")`
        # instead of `r.recognize_google(audio)`
        testString = r.recognize_google(audio)
    
        #testString = r.recognize_google(audio)
        print("You said: " + testString)
    except sr.UnknownValueError:
        print("Google Speech Recognition could not understand audio")
    except sr.RequestError as e:
        print("Could not request results from Google Speech Recognition service; {0}".format(e))"""
        
        #above code from uberi, github
    """testString = "Species are different varieties of animals. They are formed over a long time. They can interbreed. Just need some words here. This happens only if you act like a bad person. If you have more money, due to increased sense of security, then you automatically have more happiness. They have differences in color, size, strength, gender roles, etcetera. There are three ways to do this. first, we can charge. Still talking about first here. still going bleh. Second, we can dance. Droning about second here. still going bleh. Hello people of earth. Third, we can sing. still going bleh. Singing like shit. Going on about the same old thing. Okay, moving on. Life is good. This can be done in four ways: swimming, charging, dancing, liking, making and bathing, if for whatever reason. This is a nice thing to do. Although there is a lot of controversy about this issue, it is still not acted upon by the government. Whenever I go on a walk, I like to get some food, milk and pizza.
    
    testString = "there are four types of biological diversity first is species diversity every ecosystem contains a unique collection of species all interacting with each other secondly genetic diversity describes how closely related the members of one species are in a given ecosystem third consider ecosystem diversity a region may have several ecosystems, or it may have one wide expanses of oceans or deserts would be examples of regions with low ecological diversity fourth is functional diversity understanding an ecosystem’s functional diversity can be useful to ecologists trying to conserve or restore damaged it okay let’s move on by examining the similarities and differences of different lineages that are related, scientists can determine most likely when the species diverged and evolved compared to when the common ancestor was around since biological species concept is dependent upon reproductive isolation of reproducing species it cannot necessarily be applied to a species that reproduces asexually the lineage species concept does not have that restraint and therefore can be used to explain simpler species that do not need a partner to reproduce the five types of species interactions are predation competition parasitism mutualism and commensalism to conclude showing a bit of math here five hundred seventy six is twenty four times twenty four"""
    
    testString = "Biological diversity in an environment is indicated by numbers of different species of plants and animals. Essentially you could say that there are four types of biological diversity. First is species diversity. Every ecosystem contains a unique collection of species, all interacting with each other. Secondly, genetic diversity describes how closely related the members of one species are in a given ecosystem. Third consider ecosystem diversity. A region may have several ecosystems, or it may have one. Wide expanses of oceans or deserts would be examples of regions with low ecological diversity. Fourth is functional diversity. Understanding an ecosystem’s functional diversity can be useful to ecologists trying to conserve or restore damaged it. Okay, let’s move on. By examining the similarities, likes and differences of different lineages that are related, scientists, researchers and explorers can determine most likely when the species diverged and evolved compared to when the common ancestor was around. Since biological species concept is dependent upon reproductive isolation of reproducing species, it cannot necessarily be applied to a species that reproduces asexually. The lineage species concept does not have that restraint and therefore can be used to explain simpler species that do not need a partner to reproduce. Species have five types: predation, competition, parasitism, mutualism and commensalism. To conclude,showing a bit of math here. Also note that two thousand seventy four added to sixty four is not equal to twenty four times twenty four right. I would also like to say this project was made possible thanks to the wonderful support of my TP mentor and all the faculty and staff of 15-112. Subsequently, I had super fun doing this!"
    
    testString1 = "biological diversity in an environment as indicated by numbers of different species of plants and animals essentially you could say that there are four types of biological diversity first is species diversity every ecosystem contains a unique collection of species all interacting with each other secondly genetic diversity describes how closely related the members of one species are in a given ecosystem third consider ecosystem diversity a region may have several ecosystems, or it may have one wide expanses of oceans or deserts would be examples of regions with low ecological diversity fourth is functional diversity understanding an ecosystem’s functional diversity can be useful to ecologists trying to conserve or restore damaged it okay let’s move on by examining the similarities and differences of different lineages that are related, scientists can determine most likely when the species diverged and evolved compared to when the common ancestor was around since biological species concept is dependent upon reproductive isolation of reproducing species it cannot necessarily be applied to a species that reproduces asexually the lineage species concept does not have that restraint and therefore can be used to explain simpler species that do not need a partner to reproduce the five types of species interactions are: predation, competition parasitism mutualism and commensalism to conclude showing a bit of math here. five hundred seventy six is twenty four times twenty four" 
    
    testString = convertToSymbols(testString)
    
    """data = {'text': testString}
    req = requests.post('http://bark.phon.ioc.ee/punctuator', data=data)
    punctuatedString = req.text
    print(punctuatedString)"""
    
    punctuatedString = testString
    
    tstart = sent_tokenize(punctuatedString)
    sstart = []
    toRemove = []
    for sent in tstart:
        #print("sent start")
        tg = nltk.word_tokenize(sent)
        for j in range(6, 3, -1):
            #print(str(j)+"grams")
            aGram = ngrams(tg, j)
            for i in aGram:
                #print(i)
                boolNFU = newFindUseless(i)
                #print()
                if(boolNFU==True):
                    toRemove.append(i)
                    
    #print("toREMOVE: ", toRemove)
    
    for remPhrase in toRemove:
        stringCon = ' '.join(remPhrase) + " "
        punctuatedString = punctuatedString.replace(stringCon, "")
        
    #punctuatedString = convertToSymbols(punctuatedString)
        
    finalPOSArray = tagPreprocess(punctuatedString)
    #print("fpA: ", finalPOSArray)
    ##
    global bifBubbles, bifIndices, bifJumps, bifPrecSents
    
    bifBubbles, bifIndices, bifJumps, bifPrecSents = checkBubBifA(finalPOSArray)
    
    ##
    
    #print('********************8bifBubbles!!!!!!!!!!: ', bifBubbles)
        
    testStrsentp = sent_tokenize(punctuatedString)
    
    testStrsentp = [nltk.word_tokenize(sent) for sent in testStrsentp]
    
    
    for aSentence in testStrsentp:
        finalLabelledArray.append(labeler(aSentence))
    
    
    indexi = 0
    
    while(indexi <= len(testStrsentp)-1):
        sentence = testStrsentp[indexi]
        
        if(indexi in bifIndices):
            #print("BIFHERE")
            bifIndex = bifIndices.index(indexi)
            #print("bifindex: ", bifIndex)
            bifYChange = getBifYChange(bifBubbles[bifIndex])
            bifYActual = getBifYMax(bifBubbles[bifIndex])
            #print("bify: ", bifYChange)
            curBubble = Bubble("bif", sentence, getColorProper(indexi), bifIndex, bifYChange, bifYActual)
            indexi+=1
            #print("bj", bifJumps)
            indexi+=bifJumps[bifIndex]
        else:
            chunkedSent = chunker(labeler(sentence))
            for chunk in chunkedSent:
                chunkType, chunkSent = chunk[0], chunk[1]
                
                curBubble = Bubble(chunkType, chunkSent, getColorProper(indexi))
            indexi+=1
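
For reference, the block above sentence-tokenizes the text, collects every 6-, 5-, and 4-gram, flags spans that newFindUseless marks as filler, and then deletes those spans from the string. A minimal self-contained sketch of that pattern, with a hypothetical is_filler predicate standing in for the project's newFindUseless:

import nltk
from nltk import ngrams
from nltk.tokenize import sent_tokenize

def is_filler(gram):
    # Hypothetical stand-in for newFindUseless: flag spans made almost entirely of hedging words.
    hedges = {"essentially", "you", "could", "say", "that", "okay", "let", "us", "move", "on"}
    return sum(w.lower() in hedges for w in gram) >= len(gram) - 1

def strip_filler(text):
    to_remove = []
    for sent in sent_tokenize(text):
        tokens = nltk.word_tokenize(sent)
        for n in range(6, 3, -1):  # 6-, 5-, then 4-grams, as in the code above
            for gram in ngrams(tokens, n):
                if is_filler(gram):
                    to_remove.append(gram)
    for gram in to_remove:
        text = text.replace(" ".join(gram) + " ", "")
    return text
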
Пример #52
0
# from nltk.tokenize import blankline_tokenize
# AI_Blank= blankline_tokenize(AI)
# print(len(AI_Blank))

import nltk
from nltk.util import bigrams, trigrams, ngrams
string = "Topic sentences are similar to mini thesis statements. Like a thesis statement, a topic sentence has a specific main point. Whereas the thesis is the main point of the essay, the topic sentence is the main point of the paragraph. Like the thesis statement, a topic sentence has a unifying function. But a thesis statement or topic sentence alone doesn’t guarantee unity. An essay is unified if all the paragraphs relate to the thesis, whereas a paragraph is unified if all the sentences relate to the topic sentence. Note: Not all paragraphs need topic sentences. In particular, opening and closing paragraphs, which serve different functions from body paragraphs, generally don’t have topic sentences."
quote_token = nltk.word_tokenize(string)

quotes_bigram = list(nltk.bigrams(quote_token))
# print(quotes_bigram)

quotes_trigram = list(nltk.trigrams(quote_token))
# print(quotes_trigram)

quotes_ngram = list(nltk.ngrams(quote_token, 4))
# print(quotes_ngram)
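
If the point is to see which of these n-grams recur, nltk.FreqDist can count them directly; a small sketch reusing quote_token from above:

# Count the 4-grams from quote_token and show the three most frequent ones.
freq_4grams = nltk.FreqDist(nltk.ngrams(quote_token, 4))
print(freq_4grams.most_common(3))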

from nltk.stem import PorterStemmer
pst = PorterStemmer()
# print(pst.stem("Having"))

# words_to_stem=["give","giving","given","gave"]
# for words in words_to_stem:
#     print( words ,":", pst.stem(words))

from nltk.stem import LancasterStemmer
lst = LancasterStemmer()
print(lst.stem("Having"))

# words_to_stem=["give","giving","given","gave"]
    for t in training:
        length_of_training = len(training)
        print("\t", countert_training + 1, "/", length_of_training)
        countert_training = countert_training + 1
        filename = str(n) + "gramsfor" + t
        fw = open(filename, "a")
        print("		working on file", t, "...", end="")
        fr = open(t, "r")
        text = fr.read()
        fr.close()
        print("done")
        data = []
        count = []
        print("		finding", n, "grams for", t, "...", end="")
        #time1 = time()
        all_grams = ngrams(text.split(), n)
        #print (time() - time1)
        print("done")
        #print (time() - time1)

        print("		counting the frequency of", n, "grams in", t, "...", end="")
        s_counter = 0
        data = []
        point_int = {}
        count = []
        index_count = 0
        #for grams in all_grams:

        #	if grams in data:			#increasing count
        #		ind=data.index(grams);
        #		count[ind]=count[ind]+1;
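
The counting loop above is left as commented pseudocode; assuming text and n as defined in this fragment, collections.Counter over the same n-gram generator gives the frequencies in one step:

from collections import Counter
from nltk import ngrams

# Equivalent counting step: maps each n-gram tuple to its frequency in the file's text.
gram_counts = Counter(ngrams(text.split(), n))
print(gram_counts.most_common(10))
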
Пример #54
0
def Ex_gram(_data, num):
    data = " ".join(_data)
    n_grams = ngrams(nltk.word_tokenize(data), num)
    return [" ".join(grams) for grams in n_grams]
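
A quick usage sketch of Ex_gram as defined above (assuming nltk and ngrams are imported): the argument is a list of strings, which the function joins into one text before tokenizing, so the n-grams run across the joined sequence.

print(Ex_gram(["the quick brown fox", "jumps over the lazy dog"], 2))
# ['the quick', 'quick brown', 'brown fox', 'fox jumps',
#  'jumps over', 'over the', 'the lazy', 'lazy dog']
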
    def parse_snips_intent(self):
        """
        Parse original data.json into Snips NLU Engine Training Data in yaml format.
        Convert into yaml file through command line prompt :
        'snips-nlu generate-dataset en input-yaml-file > output-json-file'
        """

        # Get original data.json in DataFrame
        data_df = DataProcessing(
            f"{getcwd()}/data_lake/{self.json_arg}").retrieve_process_json()
        # Get list of Unique Intents
        intent_list = list(set(data_df["Intent"]))
        # Load SpaCy NLP Large Corpus
        spacy_nlp_engine = load('en_core_web_lg')
        # Init yaml object
        yaml = ruamel.yaml.YAML()
        # Set explicit start to True
        yaml.explicit_start = True
        # Parse by Intents
        for intent_name in intent_list:
            # yes and no are reserved values for yaml file.
            # To avoid parsing error, "_" is added before the intent name.
            if intent_name == "yes" or intent_name == "no":
                intent_dict = {"type": "intent", "name": f"{intent_name}s"}
            else:
                intent_dict = {"type": "intent", "name": intent_name}
            # Init Lists for Slots + Utterances
            slots_value_list = []
            utt_value_list = []
            # Subset current Intent Data
            subset_data = data_df[data_df["Intent"] ==
                                  intent_name].reset_index(drop=True)
            # Get current Intent Queries
            intent_query_words = list(subset_data["Query"])
            # Get the 4 grams and convert into a list
            word_ngrams = (pd.Series(ngrams(intent_query_words, 4))).to_list()
            # Random sample 80% of each Intent as training phrases for NLU Engine
            sample_ngrams = sample(word_ngrams, int(len(subset_data) * 0.8))
            # Start parsing each queries
            for phrases in sample_ngrams:
                # Join phrases back to one single sentence
                full_text = " ".join(phrases)
                # Parse Entity of the text through Spacy NLP Engine
                parse_phrases = spacy_nlp_engine(full_text)
                # Set slots
                if len(parse_phrases.ents) > 0:
                    # Get Entity Label and Text, if any
                    for nlp_entity in parse_phrases.ents:
                        entity_label = nlp_entity.label_
                        entity_text = nlp_entity.text
                        # Construct "slot" for name and entity
                        slot_entities = {
                            "name": entity_label,
                            "entity": entity_label
                        }
                        # Replace text with entity label
                        full_text = full_text.replace(
                            entity_text, f"[{entity_label}]({entity_text})")
                        # Store "utterances" from the ngram
                        utt_value_list.append(full_text)
                        # Store unique "slots"
                        if slot_entities not in slots_value_list:
                            slots_value_list.append(slot_entities)
            # Set slots in intent dictionary
            if len(slots_value_list) > 0:
                intent_dict["slots"] = slots_value_list
            # Set utterances in intent dictionary
            if len(utt_value_list) > 0:
                intent_dict["utterances"] = utt_value_list
            # If there's no utterances found, use the original ngrams
            else:
                intent_dict["utterances"] = [
                    " ".join(gram) for gram in sample_ngrams
                ]
            # Append into output yaml
            with open(f"{getcwd()}/data_lake/intent_ngram.yaml", "a") as file:
                yaml.dump(intent_dict, file)
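
A hedged usage sketch: only the parse_snips_intent method is shown above, so the enclosing class is an assumption; here it is hypothetically called IntentParser and its constructor is assumed to store json_arg.

# Hypothetical class name; the fragment above only shows the parse_snips_intent method.
parser = IntentParser(json_arg="data.json")
parser.parse_snips_intent()
# As the docstring notes, the appended yaml is then converted on the command line:
#   snips-nlu generate-dataset en data_lake/intent_ngram.yaml > intent_dataset.json
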
import nltk
from time import time
from nltk import ngrams

timestamp = time()
a = [1, 2, 3, 4, 5, 6, 7, 8, 1, 2]
single_grams = ngrams(a, 3)
data = []
point_int = {}
count = []
index_count = 0
for grams in single_grams:
    try:
        if point_int[grams] >= 0:
            passing = point_int[grams]
    except:
        passing = -1
    if passing >= 0:
        ind = passing
        count[ind] = count[ind] + 1
    else:
        data.append(grams)
        count.append(1)
        point_int[grams] = index_count
        index_count = index_count + 1
print(point_int)
print(count)
print(time() - timestamp)
print(data)
#print(data)
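
The dict-plus-two-lists bookkeeping above builds a frequency table by hand; nltk.FreqDist (or collections.Counter) yields the same trigram counts for the list a with far less code:

import nltk

# Same trigram frequencies as the manual point_int/count bookkeeping above.
freq = nltk.FreqDist(nltk.ngrams(a, 3))
print(freq.most_common())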
Пример #57
0
    # Generate and filter the tokens of each file
    data[file]['tokens'] = tokenize.word_tokenize(text)
    data[file]['tokens'] = [
        t.lower() for t in data[file]['tokens'] if t.lower() not in stopwords
    ]

    # Generate token frequency data
    data[file]['freq_tokens'] = nltk.FreqDist(data[file]['tokens'])

    # Generate data for the 15 most frequent tokens
    top15 = data[file]['freq_tokens'].most_common(15)
    data[file]['freq_tokens_top15'] = top15

    # Generate bigram frequency data
    bigram = ngrams(data[file]['tokens'], 2)
    data[file]['freq_bigrams'] = nltk.FreqDist(bigram)

    # Generate data for the 15 most frequent bigrams
    top15 = data[file]['freq_bigrams'].most_common(15)
    data[file]['freq_bigrams_top15'] = top15

    # Generate frequency data for the 4-grams containing the word "life"
    quadrigram = [ng for ng in ngrams(data[file]['tokens'], 4) if 'life' in ng]
    data[file]['freq_quadrigrams_life'] = nltk.FreqDist(quadrigram)

    # Print the frequency of the words 'the' and 'that'
    print('\n{:20s} {:35s} {}'.format('File', 'Token', 'Frequency'))
    for word in ['the', 'that']:
        freq = data[file]['freq_tokens'][word]
        print('{:20s} {:35s} {:03}'.format(file, word, freq))
Пример #58
0
#SnowBallStemmer
sStem = SnowballStemmer('english')
print("SnowBall Stemming : \n")
for i in tokens[0:50]:
    print(sStem.stem(str(i)))

#PorterStemmer
pStem = PorterStemmer()
print("Porter Stemming : \n")
for i in tokens[0:50]:
    print(pStem.stem(str(i)))

# POS-tagging
print("Part of Speech Tagging :\n", pos_tag(word_tokenize(text)))

# Lemmatization
lemmatizer = WordNetLemmatizer()
print("Lemmatization :\n")
for tok in tokens[0:50]:
    print(lemmatizer.lemmatize(str(tok)))

# Trigram
print("Trigrams :\n")
trigram = list(ngrams(tokens[0:20], 3))  # word-level trigrams over the first 20 tokens
print(trigram)

# Named Entity Recognition
print("NER : \n", ne_chunk(pos_tag(wordpunct_tokenize(str(tokens)))))
Пример #59
0
def getUnigrams(tokens):
    LOGGER.debug("Unigrams...")
    return [g for g in ngrams(tokens, 1)]
Пример #60
0
 def compute(sent: str, k: int) -> 'DistMetric':
     token_set = set()
     for token in ngrams(sent.split(), k):
         token_set.add(token)
     return DistMetric(len(token_set))
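
A minimal standalone sketch of the same distinct-k-gram count, assuming DistMetric simply wraps the integer it is handed (the class itself is not shown in this fragment):

from nltk import ngrams

def distinct_kgrams(sent: str, k: int) -> int:
    # Number of unique k-grams over whitespace tokens, as in compute() above.
    return len(set(ngrams(sent.split(), k)))

print(distinct_kgrams("the cat sat on the mat the cat sat", 2))  # -> 6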