def features(message):
    singlegrams = [i for i in message.split() if i not in stop]  # Removing stop words (assumes a stop-word list named "stop" defined elsewhere)
    
    singlegramsrefined = []
    #Stemming the single words
    for k in singlegrams:
        r = stem(k, stemmer=LEMMA)
        singlegramsrefined.append(r)
    newmessage = " ".join(singlegramsrefined) 
    newmessage = re.sub("[^A-Za-z]", " ", newmessage)  # Keep only alphabetic characters (drops numbers)
    newmessage = re.sub(r'[^\w]', ' ', newmessage)  # Remove any remaining non-alphanumeric characters
    singlegrams = newmessage.split()
    singlegramsrefined2 = list(singlegrams)  # copy of the cleaned unigrams
        
    bigrams = ngrams(newmessage, n=2)#bigrams
    trigrams = ngrams(newmessage, n=3)#trigrams
    
    totalgrams = singlegramsrefined2 + bigrams + trigrams
    
    totalgrams = tuple(totalgrams)#tuple having single words, bigrams and trigrams
    return totalgrams
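This helper is not self-contained: it relies on re, pattern's ngrams and stem/LEMMA, and a stop-word list named stop. A minimal usage sketch of the setup it appears to assume (the stop list below is a placeholder):

import re
from pattern.en import ngrams
from pattern.vector import stem, LEMMA

# Placeholder stop list; the second variant of features() below hard-codes a fuller one.
stop = ['i', 'me', 'my', 'am', 'is', 'a', 'the', 'and', 'to']

print(features("i am taking 500 mg twice a day and feel dizzy"))
# -> a tuple of lemmatized unigrams plus (word, word) bigram and trigram tuples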
def features(message):
    #List of nltk stopwords
    stop = [u'i','diabetes','diabetic','type 2 diabetes','type 2', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves',
            u'you', u'your', u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers',
            u'herself', u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', u'who',
            u'whom', u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have',
            u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if', u'or',
            u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between',
            u'into', u'through', u'during', u'before', u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in',
            u'out', u'on', u'off', u'over', u'under', u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where',
            u'why', u'how', u'all', u'any', u'both', u'each', u'few', u'more', u'most', u'other', u'some',
            u'such', u'no', u'nor', u'not', u'only', u'own', u'same', u'so', u'than', u'too', u'very', u's', u't', u'can',
            u'will', u'just', u'don', u'should', u'now','m']
    singlegrams = [i for i in message.split() if i not in stop]  # Removing stop words
    
    singlegramsrefined = []
    #Stemming the single words
    for k in singlegrams:
        r = stem(k, stemmer=LEMMA)
        if r not in stop:
            singlegramsrefined.append(r)
    newmessage = " ".join(singlegramsrefined) 
    newmessage = re.sub("[^A-Za-z]", " ", newmessage)  # Keep only alphabetic characters (drops numbers)
    newmessage = re.sub(r'[^\w]', ' ', newmessage)  # Remove any remaining non-alphanumeric characters
    singlegrams= [i for i in newmessage.split() if len(i) > 1]

    singlegramsrefined2 = list(singlegrams)  # copy of the cleaned unigrams
        
    bigrams = ngrams(newmessage, n=2)  # bigrams
    trigrams = ngrams(newmessage, n=3)  # trigrams (computed but not used below)
    v = parsetree(newmessage, lemmata=True)[0]  # first sentence of the cleaned text
    v = [w.lemma for w in v if w.tag.startswith('NN')]  # keep noun lemmata only
    singlewords = []
    for i in v:
        stopping = stop +[u'hour',u'husband',u'anything',u'thing',u'way',u'n',u'number',u'person',u'd',u'x',u'dose',u'drug',u'today',u'help',u'everyone',u'bed',u'mine',u'bed',u'issue',u'anyone',u'thank' ,u'test', u'eat',u'something',u'doc',u'time',u'c',u'luck',u'lb',u'dr',u'morning','t',u'pill',u'upset',u'take',u'couple',u'month',u'use',u'exercise',u'diet',u'lot',u'vision','taking',u've',u'time',u'month',u'level',u'body',u'diet',u'food',u'release', u'time', u'meal',u'glipizide',u'week',
                          'type','yr',u'symptom',u'cause',u'tablet',u'blood',u'feel',u'like',
                          u'made',u'bad',u'work',u'still',
                          u'got',u'twice',u'i',u'mg',u'm',u'day',
                          u'sugar',u'taking',u'doctor',u'get',u'year',
                          u'side',u'went',u'med',u'one',u'better',u'effect',u'problem',u'also']
        if i not in stopping:
            singlewords.append(i)
    bi = []
    for r in bigrams:
        if r not in [(u'year', u'now'),(u'also', u'take'),(u'doesn', u't') ,(u'take', u'food'),(u'taking', u'metformin'),(u'i', u'diagnosed'),(u'metformin', u'mg'),(u'empty', u'stomach'),(u'couldn', u't'),(u'blood', u'sugar'),(u'diet', u'exercise'),(u'mg', u'x'),(u'type', u'diabetes'),(u'side', u'effect'),(u'i', u'm'),(u'i', u've'),(u'twice', u'day'),
                     (u'a', u'c'),(u'don', u't'),(u'slow', u'release'),(u't', u'take'),(u't', u'take'),
                     (u'good', u'luck'),(u'didn', u't'),(u'mg', u'twice'),(u'take', u'metformin'),(u'time', u'day'),
                     (u'went', u'away'),(u'year', u'ago'),(u'much', u'better'),(u'extended', u'release'),(u'started', u'taking'),
                     (u'can', u't'),(u'anyone', u'else'),(u'month', u'ago'),(u'mg', u'day')]:
            bi.append(r)      
   
    
    totalgrams = singlewords + bi
    
    
    return totalgrams
def featureExtractor(textMessage,countgrams):
    textMessage = textMessage.lower()
    #List of stop words to remove
    stopWords = [u'i','m', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your', u'yours', u'yourself',
                 u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers', u'herself', u'it', u'its',
                 u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', u'who', u'whom',
                 u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have',
                 u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if', u'or',
                 u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between',
                 u'into', u'through', u'during', u'before', u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on',
                 u'off', u'over', u'under', u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how',
                 u'all', u'any', u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor', u'not', u'only',
                 u'own', u'same', u'so', u'than', u'too', u'very', u's', u't', u'can', u'will', u'just', u'don', u'should', u'now']
   
    
    avoidList1 = ['diabetes','type 2','diabetic']
    avoidList = stopWords + avoidList1
    #Removing these stop words and general cleaning
    singleGrams =  [i for i in textMessage.split() if i not in avoidList]
    singlegramsRefined = []

    #Stemming the words for normalization
    for k in singleGrams:
        r = stem(k, stemmer=LEMMA)
        singlegramsRefined.append(r)
    newMessage = " ".join(singlegramsRefined) 
    newMessage = re.sub("[^A-Za-z]", " ", newMessage)  # Keep only alphabetic characters (drops numbers)
    newMessage = re.sub(r'[^\w]', ' ', newMessage)  # Remove any remaining non-alphanumeric characters
    singleGrams= [i for i in newMessage.split()] #Again splitting to single grams


    singlegramsRefined2 = list(singleGrams)  # Copy of the cleaned unigrams
    biGrams = ngrams(newMessage, n=2)# Generating bigrams
    triGrams = ngrams(newMessage, n=3)#Generating trigrams

    totalGramsrefined = []
    if countgrams == 1:
        
        totalGrams = singlegramsRefined2
        
        totalGramsrefined = [i for i in totalGrams]  # No model-feature filter in this simpler variant (the fuller featureExtractor below filters against modelFeatures())

    elif countgrams == 2:
        totalGrams = singlegramsRefined2+biGrams
        
        totalGramsrefined = [i for i in totalGrams]

    elif countgrams == 3:
        totalGrams = singlegramsRefined2+biGrams + triGrams
        
        totalGramsrefined = [i for i in totalGrams]
        

    return totalGramsrefined
Example #4
def getGrams(self, results):
    grams = {}
    for text, weight in results:
        uni = set(ngrams(text, n=1))
        bi = set(ngrams(text, n=2))
        tri = set(ngrams(text, n=3))
        for gram in uni:
            grams[gram] = grams.get(gram, 0) + weight
        for gram in bi:
            grams[gram] = grams.get(gram, 0) + weight
        for gram in tri:
            grams[gram] = grams.get(gram, 0) + weight
    return grams
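A brief usage sketch (self is unused in the body, so None can stand in for the instance; ngrams is assumed to be imported from pattern.en): results is an iterable of (text, weight) pairs, and every distinct uni/bi/trigram of a text is credited with that text's weight.

results = [("the cat sat on the mat", 2.0),
           ("the dog sat", 1.0)]
grams = getGrams(None, results)   # self is not used by the method body
print(grams[("the", "cat")])      # 2.0 -- bigram occurs only in the first text
print(grams[("sat",)])            # 3.0 -- unigram occurs in both texts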
Example #6
def test_ngrams(self):
    # Assert n-grams with and without punctuation marks / sentence marks.
    s = "The cat is napping."
    v1 = en.ngrams(s, n=2)
    v2 = en.ngrams(s, n=3, punctuation=en.PUNCTUATION.strip("."))
    self.assertEqual(v1, [("The", "cat"), ("cat", "is"), ("is", "napping")])
    self.assertEqual(v2, [("The", "cat", "is"), ("cat", "is", "napping"), ("is", "napping", ".")])
    s = "The cat purrs. The dog barks."
    v1 = en.ngrams(s, n=2)
    v2 = en.ngrams(s, n=2, continuous=True)
    self.assertEqual(v1, [("The", "cat"), ("cat", "purrs"), ("The", "dog"), ("dog", "barks")])
    self.assertEqual(v2, [("The", "cat"), ("cat", "purrs"), ("purrs", "The"), ("The", "dog"), ("dog", "barks")])
    print "pattern.en.ngrams()"
Example #7
def test_ngrams(self):
    # Assert n-grams with and without punctuation marks / sentence marks.
    s = "The cat is napping."
    v1 = en.ngrams(s, n=2)
    v2 = en.ngrams(s, n=3, punctuation=en.PUNCTUATION.strip("."))
    self.assertEqual(v1, [("The", "cat"), ("cat", "is"), ("is", "napping")])
    self.assertEqual(v2, [("The", "cat", "is"), ("cat", "is", "napping"), ("is", "napping", ".")])
    s = "The cat purrs. The dog barks."
    v1 = en.ngrams(s, n=2)
    v2 = en.ngrams(s, n=2, continuous=True)
    self.assertEqual(v1, [("The", "cat"), ("cat", "purrs"), ("The", "dog"), ("dog", "barks")])
    self.assertEqual(v2, [("The", "cat"), ("cat", "purrs"), ("purrs", "The"), ("The", "dog"), ("dog", "barks")])
    print("pattern.en.ngrams()")
Example #8
def intertextuality(texts=[], n=5, continuous=False, weight=lambda ngram: 1):
    """ Returns a dictionary of (i, j) => float.
        For indices i and j in the given list of texts,
        the corresponding float is the percentage of text i that is also in text j.
        Overlap is measured by matching n-grams (by default, 5 successive words).
        An optional weight function can be used to supply the weight of each n-gram.
    """
    map = {} # n-gram => text id's
    sum = {} # text id => sum of weight(n-gram)
    for i, txt in enumerate(texts):
        for j, ngram in enumerate(ngrams(txt, n, continuous=continuous)):
            if ngram not in map:
                map[ngram] = set()
            map[ngram].add(i)
            sum[i] = sum.get(i, 0) + weight(ngram)
    w = defaultdict(Weight) # (id1, id2) => percentage of id1 that overlaps with id2
    for ngram in map:
        for i in map[ngram]:
            for j in map[ngram]:
                if i != j:
                    if (i,j) not in w:
                        w[i,j] = Weight(0.0)
                    w[i,j] += weight(ngram)
                    w[i,j].assessments.add(ngram)
    for i, j in w:
        w[i,j] /= float(sum[i])
        w[i,j]  = min(w[i,j], Weight(1.0))
    return w
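A usage sketch: this appears to be the intertextuality() helper from pattern.metrics, so the import below is an assumption. Keys of the result are (i, j) index pairs and values are the fraction of text i whose n-grams also occur in text j (n=3 here so the short sample texts overlap).

from pattern.metrics import intertextuality

texts = ["The quick brown fox jumps over the lazy dog.",
         "A quick brown fox jumps over a sleeping dog."]
overlap = intertextuality(texts, n=3)
for (i, j), score in overlap.items():
    print(i, j, float(score))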
Example #9
def intertextuality(texts=[], n=5, continuous=False, weight=lambda ngram: 1):
    """ Returns a dictionary of (i, j) => float.
        For indices i and j in the given list of texts,
        the corresponding float is the percentage of text i that is also in text j.
        Overlap is measured by matching n-grams (by default, 5 successive words).
        An optional weight function can be used to supply the weight of each n-gram.
    """
    map = {}  # n-gram => text id's
    sum = {}  # text id => sum of weight(n-gram)
    for i, txt in enumerate(texts):
        for j, ngram in enumerate(ngrams(txt, n, continuous=continuous)):
            if ngram not in map:
                map[ngram] = set()
            map[ngram].add(i)
            sum[i] = sum.get(i, 0) + weight(ngram)
    w = defaultdict(
        Weight)  # (id1, id2) => percentage of id1 that overlaps with id2
    for ngram in map:
        for i in map[ngram]:
            for j in map[ngram]:
                if i != j:
                    if (i, j) not in w:
                        w[i, j] = Weight(0.0)
                    w[i, j] += weight(ngram)
                    w[i, j].assessments.add(ngram)
    for i, j in w:
        w[i, j] /= float(sum[i])
        w[i, j] = min(w[i, j], Weight(1.0))
    return w
Example #10
def qTypeDetect(question):
    q_ext = extractHelper(question)
    bi_q = ngrams(question,n=2)
    Wh = ('what','where','who','whose','which','when','how')
    How = ('how many','how tall','how much','how old','how far','how long','how often')
    Binary = ('be','have','may','can')
    
    # check how:
    for a,b in bi_q:
        if (a+' '+b).lower() in How:
            return 'HOW'
    
    #find Root first:
    q = nlp(q_ext.decode('ascii'))
    head = findRoot(q[0])
    #print head
    for child in head.children:
        if child.lemma_ in Wh:
            return child
        for child2 in child.children:
            if child2.lemma_ in Wh:
                return child2
    if q[0] == head:
        return 'Binary'
    for child in head.children:
        tag = child.tag_
        pos = child.pos_
        #print child
        if child.lemma_ == "be" or child.lemma_ == "do" or tag == "VBZ" or tag == "VBP" or tag == "MD":
            return "Binary"
        if pos == "NOUN" or tag == "VB" or tag == "VBN":            
            return 'Complex'
Example #11
    def ngram_count(self):
        grams = ngrams(self.text, n=self.total)
        for gram in grams:
            if gram in self.ngramcount:
                self.ngramcount[gram] += 1
            else:
                self.ngramcount[gram] = 1
    def get_queries(self):
        text = self.text
        beg_quotes = re.findall(r'\"\S', text)
        for each in beg_quotes:
            text = text.replace(each, 'BEGQ' + each[-1])

        end_quotes = re.findall(r'\S\"', text)
        for each in end_quotes:
            text = text.replace(each, each[0] + 'ENDQ')

        text = re.sub('(ENDQ)+', 'ENDQ', text)
        text = re.sub('(BEGQ)+', 'BEGQ', text)
        text = text.replace('--', 'DOUBLEDASH')

        all_ngrams = ngrams(text, n=self.span, punctuation="", continuous=True)
        stop_words = stopwords.words('english')
        queries = []

        for ngram in all_ngrams:
            num_stop = len([w for w in ngram if w in stop_words])
            stop_score = float(num_stop) / len(ngram)

            chunked = ne_chunk(pos_tag(ngram))
            named_entities = [[w for w, t in elt] for elt in chunked
                              if isinstance(elt, nltk.Tree)]
            num_ent = sum([len(ent_list) for ent_list in named_entities])
            ent_score = float(num_ent) / len(ngram)

            if stop_score < self.threshold and ent_score < self.threshold:
                r_string = self.reconstruct_ngram(ngram)
                if r_string in self.text:
                    queries.append(r_string)

        reduction = len(queries) / self.max_queries
        return queries[0::reduction]
Example #13
def ngram_count(self):
    grams = ngrams(self.text, n=self.total)
    for gram in grams:
        if gram in self.ngramcount:
            self.ngramcount[gram] += 1
        else:
            self.ngramcount[gram] = 1
def test_OhanaBrendan_extract_pos_tagged_element():

	doc = Document('sample', document_sample_text, 0)

	tagged = parse(doc.raw_text, chunks=False)
	doc.unigrams = ngrams(tagged, n=1)
	classifier = OhanaBrendan([doc])
	jj_elements = classifier._extract_pos_tagged_element(doc, 'JJ')
	nns_elements = classifier._extract_pos_tagged_element(doc, 'NNS')
	assert len(jj_elements) == 1
	assert len(nns_elements) == 3
Example #15
    def get_queries(self):
        """Function to extract search queries from the text:
        breaks text into ngrams, filters ngrams that consist mostly of stopwords or named entities,
        selects an evenly spaced sample of the remaining ngrams"""

        text = self.text
        beg_quotes = re.findall(r'\"\S', text)
        for each in beg_quotes:
            text = text.replace(each, 'BEGQ' + each[-1])

        end_quotes = re.findall(r'\S\"', text)
        for each in end_quotes:
            text = text.replace(each, each[0] + 'ENDQ')

        text = re.sub('(ENDQ)+', 'ENDQ', text)
        text = re.sub('(BEGQ)+', 'BEGQ', text)
        text = text.replace('--', 'DOUBLEDASH')

        all_ngrams = ngrams(text, n=self.span, punctuation="", continuous=True)
        if self.language in stopwords.fileids():
            stop_words = stopwords.words(self.language)
        else:
            stop_words = []
        queries = []
        queries.append(self.text)
        for ngram in all_ngrams:
            num_stop = len([w for w in ngram if w in stop_words])
            stop_score = float(num_stop) / len(ngram)
            if self.language == 'english':
                chunked = ne_chunk(pos_tag(ngram))
                named_entities = [[w for w, t in elt] for elt in chunked
                                  if isinstance(elt, nltk.Tree)]
                num_ent = sum([len(ent_list) for ent_list in named_entities])
                ent_score = float(num_ent) / len(ngram)
            else:
                ent_score = 0

            if stop_score < self.threshold and ent_score < self.threshold:
                r_string = self.reconstruct_ngram(ngram)
                if r_string in self.text:
                    queries.append(r_string)

        reduction = len(queries) / self.max_queries
        if reduction == 0:
            reduction = 1
        return queries[0::reduction]
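A standalone sketch of the stop-word part of the filtering idea described in the docstring (hypothetical helper name, not the class's actual interface): split the text into continuous n-grams and keep those whose stop-word ratio stays under a threshold.

from pattern.en import ngrams
from nltk.corpus import stopwords

def candidate_queries(text, span=5, threshold=0.5):
    stop_words = set(stopwords.words('english'))
    out = []
    for gram in ngrams(text, n=span, punctuation="", continuous=True):
        # fraction of the n-gram made up of stop words
        stop_score = sum(1 for w in gram if w.lower() in stop_words) / float(len(gram))
        if stop_score < threshold:
            out.append(' '.join(gram))
    return out

print(candidate_queries("NASA announced a new mission to study the icy surface of Europa today"))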
Example #16
    def n_grams(self, s, n=2):
          '''
            Obtain n-grams

            In:
                (s:string, n:int) text string and size of the n-gram
            Out:
                (list) list of n-grams
          '''
          list = []
          ngrams_list = ngrams(s, n=n)
          for ngram in ngrams_list:
              ngram_joined = ''
              for word in ngram:
                  ngram_joined += word + ' '
              list.append(ngram_joined.rstrip())
          if len(list)>=1:
              return list
          else:
              return []
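The method above simply joins each n-gram tuple back into a space-separated string; an equivalent standalone one-liner (a sketch, not the original class code):

from pattern.en import ngrams
print([' '.join(g) for g in ngrams("the cat sat on the mat", n=2)])
# ['the cat', 'cat sat', 'sat on', 'on the', 'the mat']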
import sys
from pattern.en import ngrams
import videogrep
from collections import Counter

videofile = sys.argv[1]
subtitlefile = videofile.replace('.mp4', '.srt')

lines = open(subtitlefile).read()
grams = ngrams(lines, n=3)

most_common = Counter(grams).most_common(1)

phrase = most_common[0][0]
phrase = ' '.join(phrase)

print phrase

outputfile = videofile + '.most_common.mp4'
videogrep.videogrep([videofile], outputfile, phrase, 're')

Example #18
print(pluralize('leaf'))
print(singularize('thieves'))

# ### Converting Adjective to Comparative and Superlative Degrees

from pattern.en import comparative, superlative

print(comparative('good'))
print(superlative('good'))

# ### Finding N-Grams

from pattern.en import ngrams

print(ngrams("He goes to hospital", n=2))

# ### Finding Sentiments

from pattern.en import sentiment

print(sentiment("This is an excellent movie to watch. I really love it"))

# Explanation:
#
# - 0.75 is the sentiment (polarity) score, which means the sentence is highly positive
# - 0.8 is the subjectivity score, which means the sentence is largely the personal opinion of the user

# ### Checking if a Statement is a Fact

from pattern.en import parse, Sentence
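The example is cut off here; a minimal sketch of how modality() is typically combined with parse and Sentence to score factuality (the sentences below are illustrative, not from the original):

from pattern.en import parse, Sentence, modality

print(modality(Sentence(parse("Paris is the capital of France", lemmata=True))))
# close to 1.0: stated as a fact
print(modality(Sentence(parse("I think we can finish this by tomorrow", lemmata=True))))
# a much lower value: the statement expresses uncertainty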
Example #19
Possessives = [["i", "my", "me", "mine"], ["she", "her", "hers"],
               ["he", "him", "his"], ["you", "your", "yours"], ["it", "it's"],
               ["we", "us", "our", "ours"],
               ["they", "them", "their", "theirs"]]
Poss = [
    "i", "my", "me", "mine", "she", "her", "hers", "he", "him", "his", "you",
    "your", "yours", "it", "it's", "we", "us", "our", "ours", "they", "them",
    "their", "theirs"
]
Articles = ["a", "an", "the"]
Prepositions = ["in", "on", "at"]

instr = input()
var = instr.lower()
# token=nltk.word_tokenize(var)
trigrams = ngrams(var, 3)


def query(v, i):
    encoded_query = urllib.parse.quote(v)
    params = {
        'corpus': 'eng-us',
        'query': encoded_query,
        'topk': 10,
        'format': 'tsv'
    }
    params = '&'.join('{}={}'.format(name, value)
                      for name, value in params.items())

    response = requests.get('https://api.phrasefinder.io/search?' + params)
Example #20
        # Unigrams
        for words in train_tokens[0:]:
            for word in words:
                if word in unigrams:
                    unigrams[word] += 1
                else:
                    unigrams[word] = 1
        # unigrams prob

        for unigram in unigrams:
            unigramsProb[unigram] = float(unigrams[unigram])

        # Bigrams
        for word in train_tokens[0:]:
            x = ngrams(" ".join(word), 2)
            for bg in x:
                cfd[bg[0]][bg[1]] += 1
        cfd = ConditionalFreqDist(
            (bg[0], bg[1])
            for bg in list(chain(*[bigrams(i) for i in train_tokens])))
        if (idx >= 90):
            for bgidx, bg in enumerate(
                    list(chain(*[bigrams(i) for i in train_tokens]))):
                prob = cfd[bg[0]].freq(bg[1])
                prob = 0.0001 if not prob else prob
                bigramsProb[bg] = prob
    if (idx >= 90):
        test_tokens.append(tokens)
        if idx >= 190:
            for bgidx, bg in enumerate(
Example #21
	for tup in bigram:
		txt = " ".join(tup)
		#print txt
		m = search(txt, sent)
		if m:
			entity.append(txt)
		#print m

	return entity

if __name__ == "__main__":
	s = "watch simpsons"
	if len(sys.argv) > 1:
		s = sys.argv[1]

	bigram = ngrams(s, n=2)
	sres = gsearch.search_google(s)
	cmd = ["last channel", "previous channel","tune to", "turn to", "watch"]
	sres += cmd
	#print sres
	sent = ""
	for item in sres:
		sent += " - " + item

	res = nlp(bigram, sent)
	if len(res) > 0:
		print res
	else:
		onegram = ngrams(s, n=1)
		res = nlp(onegram, sent)
		print res
Example #22
def ngrams(text, max_n=1, min_n=1):
    for i in xrange(min_n-1,max_n):
        for n in en.ngrams(text, n=i+1):
            yield ' '.join(n)
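A small usage sketch for the wrapper above (Python 2, matching its use of xrange): with min_n=1 and max_n=2 it yields every unigram and then every bigram as a joined string.

for g in ngrams("the cat sat", max_n=2):
    print g
# the
# cat
# sat
# the cat
# cat sat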
def featureExtractor(textMessage,countgrams):
    textMessage = textMessage.lower()
    #List of stop words to remove
    stopWords = [u'i','m', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your', u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers', u'herself', u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', u'who', u'whom', u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under', u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how', u'all', u'any', u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor', u'not', u'only', u'own', u'same', u'so', u'than', u'too', u'very', u's', u't', u'can', u'will', u'just', u'don', u'should', u'now']
    avoidList1 = ["actos", "pioglitazone hydrochloride", "pioglitazone",  "glustin", "glizone", "pioz", "zactos"]

    avoidList2 = ["medformin","metfornin","metforin","glucophage", "metformin", "glucophage xr", "metformin hydrochloride","carbophage sr", "riomet", "fortamet", "glumetza", "obimet", "gluformin", "dianben", "diabex", "diaformin", "siofor","metfogamma", "riomet","diformin","metformi","metphormin","metaforming","metfirman","metoformin","metfomin"]

    avoidList3 = ["byetta", "bydureon", "exenatide","byetta"]

    avoidList4 = ["victosa","victoza", "liraglutide", "saxenda","victoza"]

    avoidList5 = ["invokana", "invokana","canagliflozin"]

    avoidList6 = ["avandia", "rosiglitazone"]

    avoidList7 = ["insu","humalog","levimir","novolog","insuline","insulin glargine","insulins","lantus", "toujeo", "abasaglar", "basaglar","insulin","insulins","levamir","levemir"]

    avoidList8 = ["sitagliptin", "janumet", "januvia", "juvisync","junuvia","januvia","sitaglipton"]

    avoidList9 = ["amaryl", "glimepiride", "gleam", "k-glim-1", "glucoryl",  "glimpid", "glimy","ameryl"]
    
    avoidList10 = ['diabetes','type 2','diabetic']
    avoidList = stopWords + avoidList1 + avoidList2 + avoidList3 + avoidList4 + avoidList5 + avoidList6 + avoidList7 + avoidList8 + avoidList9 + avoidList10
    #Removing these stop words and general cleaning
    singleGrams =  [i for i in textMessage.split() if i not in avoidList]
    singlegramsRefined = []

    #Stemming the words for normalization
    for k in singleGrams:
        r = stem(k, stemmer=LEMMA)
        singlegramsRefined.append(r)
    newMessage = " ".join(singlegramsRefined) 
    newMessage = re.sub("[^A-Za-z]", " ", newMessage)  # Keep only alphabetic characters (drops numbers)
    newMessage = re.sub(r'[^\w]', ' ', newMessage)  # Remove any remaining non-alphanumeric characters
    singleGrams= [i for i in newMessage.split()] #Again splitting to single grams


    singlegramsRefined2 = list(singleGrams)  # Copy of the cleaned unigrams
    biGrams = ngrams(newMessage, n=2)# Generating bigrams
    triGrams = ngrams(newMessage, n=3)#Generating trigrams
    listModelfeatures = modelFeatures()
    totalGramsrefined = []
    if countgrams == 1:
        
        totalGrams = singlegramsRefined2
        
        totalGramsrefined = [i for i in totalGrams if i in listModelfeatures]# We want only those features in the text data which is in the model

    elif countgrams == 2:
        totalGrams = singlegramsRefined2+biGrams
        
        totalGramsrefined = [i for i in totalGrams if i in listModelfeatures]

    elif countgrams == 3:
        totalGrams = singlegramsRefined2+biGrams + triGrams
        
        totalGramsrefined = [i for i in totalGrams if i in listModelfeatures]
        

    return totalGramsrefined
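A hypothetical usage sketch: modelFeatures() is not defined in this snippet, so a stub vocabulary stands in for the real model-feature lookup; featureExtractor() then keeps only the cleaned unigrams/bigrams that appear in it.

def modelFeatures():
    # Stub standing in for the real model vocabulary (a mix of unigram strings and bigram tuples)
    return ['nausea', ('stomach', 'pain')]

print(featureExtractor("Metformin gives me nausea and stomach pain", countgrams=2))
# -> roughly ['nausea', ('stomach', 'pain')]: drug names and stop words are stripped first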
Example #24
# In[47]:

# Open the choosen news articles and extract the main text
for selected_links in unique_links:
    results_url = selected_links
    #print results_url

    results = requests.get(results_url)
    results_text = BeautifulSoup(results.text, "lxml")
    #print(results.text)
    #break
    extract_text = results_text.find(class_='arti-flow')
    try:
        data = json.loads(
            results_text.find('script', type='application/ld+json').text)
    except:
        continue
    #print(data['description'])
    #print(extract_text)
    #print(data)
    #final_text = extract_text.get_text()
    final_text = data['description']
    #print(final_text)

    # Pre-processing the extracted text using ngrams function from the pattern package
    final_text1 = ngrams(final_text,
                         n=1,
                         punctuation=".,;:!?()[]{}`''\"@#$^&*+-|=~_",
                         continuous=False)
    print(final_text1)
Example #25
#pip install pattern
from pattern.en import parse
from pattern.en import pprint
parse('Hello Everyone and Welcome to Analytics India Magazine')
#The parse function tags each word in the sentence as a noun, verb, subject, or object. We can also use the ‘pprint’ function defined in the pattern library to display the parsed sentence in a clear manner.
pprint(
    parse('Hello Everyone and Welcome to Analytics India Magazine',
          relations=True,
          tokenize=True,
          lemmata=True))

#%% ngrams
# "n" combination of words in a sentence.
from pattern.en import ngrams
print(ngrams("Hello Everyone and Welcome to Analytics India Magazine", n=3))
print(ngrams("He goes to hospital", n=2))

#sentiment
#Sentiment refers to an opinion or feeling towards a certain thing. The sentiment object is used to find the polarity (positivity or negativity) of a text along with its subjectivity.

from pattern.en import sentiment
print(sentiment("He is a good boy but sometimes he behaves miserably"))

print(sentiment("he is has done extremely well"))
print(sentiment("This is an excellent movie to watch. I really love it"))
#The sentence "This is an excellent movie to watch. I really love it" has a sentiment of 0.75, which shows that it is highly positive. Similarly, the subjectivity of 0.8 refers to the fact that the sentence is a personal opinion of the user.

#%%%Modality
#Checking if a Statement is a Fact: the modality function returns a value between -1 and 1; for facts, it returns a value greater than 0.5
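A short sketch of the modality() check described above (the sentences are illustrative and the exact scores depend on the pattern version):

from pattern.en import parse, Sentence, modality

print(modality(Sentence(parse("The Earth revolves around the Sun", lemmata=True))))
# close to 1.0, i.e. above the 0.5 fact threshold
print(modality(Sentence(parse("It might rain later today", lemmata=True))))
# noticeably lower: the modal verb signals uncertainty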
Example #26
k=count(s, stemmer=LEMMA, exclude=['.', ','])
f3.write('\n'.join("{!s}={!r}".format(key,val) for (key,val) in k.items()))
exclude=[' ', '/', '.', ',', ';', ':', '!', '?', '(', ')', '[', ']', '{', '}', '\'', '`', '"', '@', '#', '$', '*', '+', '-', '|', '=', '~', '_', '...']
'''

dict = {}

from nltk.corpus import stopwords
cachedStopWords = stopwords.words("english")

for article in articles:
    text = TextBlob(article)
    for sentence in text.sentences:
        s = Sentence(parse(''.join(sentence), lemmata=True))
        for i in range (1, len(s.words)):
            k = ngrams(' '.join([word for word in s.lemmata if word not in cachedStopWords]), n=i, punctuation=".,;:!?()[]{}`'\"@#$^&*+-|=~_", continuous=False)
            if i not in dict:
                dict[i] = {}
            for kgram in k:
                if i > 1:
                    key = '#'.join(unkgram for unkgram in kgram)
                else:
                    key = kgram[0]
                if key in dict[i]:
                    dict[i][key] += 1
                else:
                    dict[i][key] = 1

for dgram in dict:
    html.make_graph(dict[dgram], dgram, 1000)
    for key in dict[dgram]:
Example #27
print(PAST, 1, PL) in tenses('purred')
# rule-based conjugation
print 'google' in verbs.infinitives
print 'googled' in verbs.inflections
print conjugate('googled', tense=PARTICIPLE, parse=False)
print conjugate('googled', tense=PARTICIPLE, parse=True)
# quantification
print number("seventy-five point two")  # "seventy-five point two" => 75.2
print numerals(2.245, round=2)  # 2.245 => "two point twenty-five"
print quantify(['goose', 'goose', 'duck', 'chicken', 'chicken', 'chicken'])
print quantify({'carrot': 100, 'parrot': 20})
print quantify('carrot', amount=1000)
# spelling
print suggest("parot")
# n-grams
print ngrams("I am eating pizza.", n=2)  # bigrams
print ngrams("I am eating pizza.",
             n=3,
             punctuation=".,;:!?()[]{}`''\"@#$^&*+-|=~_",
             continuous=False)
# parser
print parse(
    'I eat pizza with a fork.',
    tokenize=True,  # Split punctuation marks from words?
    tags=True,  # Parse part-of-speech tags? (NN, JJ, ...)
    chunks=True,  # Parse chunks? (NP, VP, PNP, ...)
    relations=False,  # Parse chunk relations? (-SBJ, -OBJ, ...)
    lemmata=False,  # Parse lemmata? (ate => eat)
    encoding='utf-8',  # Input string encoding.
    tagset=None)  # Penn Treebank II (default) or UNIVERSAL.
# parser tagger and tokenizer
Example #28
import sys
from pattern.vector import Document, count, words
from pattern.en import ngrams

text = sys.stdin.read()

total = int(sys.argv[1])
grams = ngrams(text, n=total)
ngramcount = {}
for gram in grams:
    if gram in ngramcount:
        ngramcount[gram] += 1
    else:
        ngramcount[gram] = 1

for gram in sorted(ngramcount, key=ngramcount.get, reverse=True):
    count = ngramcount[gram]
    if count > 10 and all(len(x) > 0 for x in gram):
        print str(count) + ': ' + ' '.join(gram)

Example #29
File: util.py Project: evie/sift
def ngrams(text, max_n=1, min_n=1, strip_punctuation=True):
    pattern_args = {} if strip_punctuation else {'punctuation': ''}
    for i in range(min_n - 1, max_n):
        for n in en.ngrams(text, n=i + 1, **pattern_args):
            yield ' '.join(n)
Example #30
		# Unigrams
		for words in train_tokens[0:]:
			for word in words:
				if word in unigrams:
					unigrams[word] += 1 
				else:
					unigrams[word] = 1
		# unigrams prob
		
		for unigram in unigrams:
			unigramsProb[unigram] = float(unigrams[unigram])

		# Bigrams
		for word in train_tokens[0:]:
			x = ngrams(" ".join(word), 2)
			for bg in x:
				cfd[bg[0]][bg[1]] += 1
		cfd = ConditionalFreqDist((bg[0],bg[1]) for bg in list(chain(*[bigrams(i) for i in train_tokens])))
		if (idx >= 90):
			for bgidx, bg in enumerate(list(chain(*[bigrams(i) for i in train_tokens]))):
			    prob = cfd[bg[0]].freq(bg[1])
			    prob = 0.0001 if not prob else prob
			    bigramsProb[bg] = prob
	if (idx >= 90):
		test_tokens.append(tokens)
		if idx >= 190:
			for bgidx, bg in enumerate(list(chain(*[bigrams(i) for i in test_tokens]))):
			    prob = cfd[bg[0]].freq(bg[1])
			    prob = 0.0001 if not prob else prob
			    bigramsProb[bg] = prob
Example #31
print(conjugate('googled', tense=PARTICIPLE, parse=True))

from pattern.en import quantify

print(quantify(['goose', 'goose', 'duck', 'chicken', 'chicken', 'chicken']))
print(quantify({'carrot': 100, 'parrot': 20}))
print(quantify('carrot', amount=1000))

from pattern.en import quantify

print(quantify(['goose', 'goose', 'duck', 'chicken', 'chicken', 'chicken']))
print(quantify({'carrot': 100, 'parrot': 20}))
print(quantify('carrot', amount=1000))

from pattern.en import ngrams
print(ngrams("I am eating pizza.", n=2))  # bigrams

from pattern.en import parse
print(parse('I ate pizza.').split())

from pattern.en import parsetree

s = parsetree('The cat sat on the mat.', relations=True, lemmata=True)
print(repr(s))

for sentence in s:
    for chunk in sentence.chunks:
        print(chunk.type, [(w.string, w.type) for w in chunk.words])

from pattern.en import sentiment
Example #32
#
print
print lexeme('run')
print lemma('running')
print conjugate('purred', '3sg')
print PAST in tenses('purred')  # 'p' in tenses() also works.
print(PAST, 1, PL) in tenses('purred')

print 'Quantification'

print quantify(['goose', 'goose', 'duck', 'chicken', 'chicken', 'chicken'])
print quantify('carrot', amount=90)
print quantify({'carrot': 100, 'parrot': 20})

print 'ngrams'
print ngrams("I am eating a pizza.", n=2)

#parse
s = parse('I eat pizza with a fork.')
pprint(s)

#tag
for word, t in tag('The cat felt happy.'):
    print word + ' is ' + t

s = "The movie attempts to be surreal by incorporating various time paradoxes, but it's presented in such a ridiculous way it's seriously boring."
print sentiment(s)
print polarity(s)
print subjectivity(s)

#The modality() function returns a value between -1.0 and +1.0, expressing the degree of certainty
Example #34
	return preprocess_fields_v2.isValidPhrase(tokens) and len(tokens) == otc

if __name__ == '__main__':
	reload(sys)  
	sys.setdefaultencoding('utf8')
	parser = optparse.OptionParser()
	parser.add_option("-c", "--count", dest = "doNgrams", action = "store_true", default = False,
					  help = "collect ngrams")
	parser.add_option("-a", "--acronyms", dest = "doAcronyms", action = "store_true", default = False,
					  help = "collect acronym/expansion pairs")
	parser.add_option("-k", "--nMin", dest = "nMin", type="int",
					  help = "min value of n (ngram or acronym size in tokens)")
	parser.add_option("-n", "--nMax", dest = "nMax", type="int",
					  help = "max value of n (ngram or acronym size in tokens)")
	parser.add_option("-s", "--src", dest = "srcFileName",
					  help = "source file")
	(options, args) = parser.parse_args()
	nMin = options.nMin if options.nMin else 1
	nMax = options.nMax if options.nMax else 3
	fileName = options.srcFileName
	for line in preprocess_fields_v2.fileToList(fileName):
		if options.doNgrams:
			for k in range(nMin, nMax + 1):
				for ngram in ngrams(line, k):
					phrase = ' '.join(ngram)
					nvp = preprocess_fields_v2.normalizeAndValidatePhrase(phrase, phraseValidator = partial(noStopWordValidator, otc = k))
					if nvp is not None: print phrase
		elif options.doAcronyms:
			for (acro, tokens) in acronymizePhrase(line, nMin, nMax): 
				print '|'.join([acro, ' '.join(tokens)])
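Hedged example invocations of the script above (the script and input file names are placeholders):

#   python collect_ngrams.py --count --nMin 1 --nMax 3 --src phrases.txt      # print validated 1- to 3-grams
#   python collect_ngrams.py --acronyms --nMin 2 --nMax 4 --src phrases.txt   # print acronym|expansion pairs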
Example #35
from pattern.en import ngrams
from collections import Counter
from videogrep import videogrep
from sys import argv

filename = argv[1]
srt_name = filename.replace('.mp4', '.srt')

lines = open(srt_name).read()
grams = ngrams(lines, n=3)
# print grams

most_common = Counter(grams).most_common(10)
search_phrase = most_common[0][0]
search_phrase = ' '.join(search_phrase)
# for phrase in most_common:
# print phrase
# print ' '.join(phrase[0])

videogrep([filename], filename + '.most_common.mp4', search_phrase, 're')
'''
Module that handles background work such as dividing
data into training, cross-validation, and test sets
'''

import time
import matplotlib.pyplot as plt
from pattern.en import ngrams
from pattern.en import lemma

start=time.clock()
#text='Hi my name is be Jason'
textFile=open('text.txt','r')
text=''
for line in textFile:
        text+=line
words=ngrams(text,n=1)
words=[str(lemma(word[0])) for word in words]
length=len(words)
words.sort()
wordFreq=dict()
count=0
'''
for i in range(len(words)):
	if i==len(words)-1:
		continue
	elif words[i]!=words[i+1]:
		wordFreq[words[i]]=count+1
		count=0
	else:
		count+=1
'''