Example #1
	def getDomainUnigram(self, directory = None):		
		collocations = set()  #collocation items
		ewordlists = list() #list of lists of words
		
		#extract words from essays
		if directory is not None:
			doclist = os.listdir(directory)
			for essay in doclist:
				dir_essay  = directory+'/'+essay
				etext = open(dir_essay,'r').read()
				tokens = nltk.wordpunct_tokenize(etext)
				tokens = [word.lower() for word in tokens]
				#stemming
				if self._stemoption ==True:
					st = PorterStemmer()
					tokens = [st.stem(t) for t in tokens]
				
				#extract the collocation for the given essay
				e_bigram = set(Mytext(tokens).collocations())
				collocations = collocations | e_bigram
				ewordlists.append(tokens)
				
		else: # using the mapped essays to calculate the candidate bigrams
			#need to call the mapessay function first
			for ins in self._data:
				if ins['essay'] is not None:
					etext = open(ins['essay'],'r').read()
					tokens = nltk.wordpunct_tokenize(etext)
					tokens = [word.lower() for word in tokens]
					#stemming
					if self._stemoption ==True:
						st = PorterStemmer()
						tokens = [st.stem(t) for t in tokens]
				
					#extract the collocation for the given essay
					e_bigram = set(Mytext(tokens).collocations())
					collocations = collocations | e_bigram
					ewordlists.append(tokens)
		
		#get collection of all essays under the specified directory / associated essays
		collection_text = TextCollection(ewordlists)
		
		itemlist = list()
		for (a, b) in collocations:
			itemlist.append(a)
			itemlist.append(b)
			
		itemlist = list(set(itemlist))	
		
		word_idf = []
		for i in range(len(itemlist)):
			word_idf.append((collection_text.idf(itemlist[i]), itemlist[i]))	
		
		word_idf = sorted(word_idf, key = operator.itemgetter(0))
		ave = 0
		if len(word_idf)!=0:
			ave = sum(map(operator.itemgetter(0), word_idf)) / len(word_idf)
			
		wlist =  [j for (i, j) in word_idf if i<ave]				
		return wlist
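For reference, here is a minimal standalone sketch of the IDF-threshold step above, using made-up essays in place of the corpus directory (the idea: words whose IDF falls below the average are the ones shared across many essays):

from nltk.text import TextCollection

essays = [                               # hypothetical tokenised essays
    ['the', 'plasma', 'arc', 'is', 'hot'],
    ['the', 'arc', 'current', 'is', 'high'],
    ['plasma', 'physics', 'is', 'fun'],
]
candidates = {'plasma', 'arc', 'is'}     # stand-in for the collocation items

collection = TextCollection(essays)
word_idf = sorted((collection.idf(w), w) for w in candidates)
ave = sum(i for i, _ in word_idf) / len(word_idf) if word_idf else 0
print([w for i, w in word_idf if i < ave])   # the most corpus-wide terms, e.g. ['is']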
Example #2
File: run2.py Project: gitzain/project-x
	def do_it(self, sources):

		for source in sources:
			words = nltk.wordpunct_tokenize(source.headline)
			words.extend(nltk.wordpunct_tokenize(source.summary))
			lowerwords=[x.lower() for x in words if len(x) > 1]
			self.ct += 1
			print self.ct, "TITLE",source.headline
			self.corpus.append(lowerwords)
			self.titles.append(source.headline)
			self.links.append(source.url)



		[[self.key_word_list.add(x) for x in self.top_keywords(self.nkeywords,doc,self.corpus)] for doc in self.corpus]

		self.ct=-1
		for doc in self.corpus:
		   self.ct+=1
		   print self.ct,"KEYWORDS"," ".join(self.top_keywords(self.nkeywords,doc,self.corpus))



		for document in self.corpus:
			vec=[]
			[vec.append(self.tfidf(word, document, self.corpus) if word in document else 0) for word in self.key_word_list]
			self.feature_vectors.append(vec)



		self.n=len(self.corpus)

		mat = numpy.empty((self.n, self.n))
		for i in xrange(0,self.n):
		  for j in xrange(0,self.n):
			mat[i][j] = nltk.cluster.util.cosine_distance(self.feature_vectors[i],self.feature_vectors[j])


		Z = linkage(mat, 'single')

		dendrogram(Z, color_threshold=self.t)





		clusters = self.extract_clusters(Z,self.t,self.n)
		
		stories = []

		for key in clusters:
			print "============================================="
			story = Story()  
			for id in clusters[key]:
				story.add_source(sources[id])
				print id,self.titles[id],sources[id].url
			stories.append(story)


		return stories
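The distance-matrix and linkage steps above (and the nearly identical ones in Example #6 below) can be exercised in isolation. A minimal sketch with made-up feature vectors, where scipy's fcluster stands in for the extract_clusters helper that is not shown in this snippet:

import numpy
from nltk.cluster.util import cosine_distance
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform

vectors = [[1.0, 0.0, 0.0], [0.9, 0.1, 0.0], [0.0, 0.0, 1.0]]  # hypothetical tf-idf vectors
n = len(vectors)
mat = numpy.empty((n, n))
for i in range(n):
    for j in range(n):
        mat[i][j] = cosine_distance(vectors[i], vectors[j])

numpy.fill_diagonal(mat, 0.0)           # guard against floating-point jitter on the diagonal
Z = linkage(squareform(mat), 'single')  # linkage() expects a condensed distance matrix
print(fcluster(Z, t=0.5, criterion='distance'))  # e.g. [1 1 2]: the first two vectors cluster together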
def get_utterances(utterances, line, category, wgram, cgram):
    tknzr = TweetTokenizer()
    wgram_list = []  # default when wgram is not 1, 2 or 3
    # WORD GRAMS
    if wgram == 1:  # unigram
        wgram_list = tknzr.tokenize(line)
    elif wgram == 2:  # uni + bigram
        # unigram list
        tokens = nltk.wordpunct_tokenize(line)
        # bigram list
        finder = BigramCollocationFinder.from_words(tokens)
        scored = finder.score_ngrams(bigram_measures.raw_freq)
        bigram_list = sorted(bigram for bigram, score in scored)
        # res
        wgram_list = tknzr.tokenize(line) + bigram_list
    elif wgram == 3: # uni + bi + trigram
        # unigram list
        tokens = nltk.wordpunct_tokenize(line)
        # bigram list
        bi_finder = BigramCollocationFinder.from_words(tokens)
        bi_scored = bi_finder.score_ngrams(bigram_measures.raw_freq)
        bigram_list = sorted(bigram for bigram, biscore in bi_scored)  
        # trigram list
        tri_finder = TrigramCollocationFinder.from_words(tokens)
        tri_scored = tri_finder.score_ngrams(trigram_measures.raw_freq)
        trigram_list = sorted(trigram for trigram, triscore in tri_scored)
        # res
        wgram_list = tknzr.tokenize(line) + bigram_list + trigram_list
    
    # CHAR GRAMS
    cgram_list = []
    if cgram == 1:   # uni-chargram
        cgram_list = [line[i:i+1] for i in range(len(line)-1)]
    elif cgram == 2: # bi-chargram
        cgram_list = [line[i:i+2] for i in range(len(line)-1)]
    elif cgram == 3: # tri-chargram
        cgram_list = [line[i:i+3] for i in range(len(line)-1)]
        
    # RESULT
    if category == 'QA':            # non-task
        utterances.append((wgram_list + cgram_list, 0))
    elif category == 'Shopping':    # task
        utterances.append((wgram_list + cgram_list, 1))
    elif category == 'Travel':      # task
        utterances.append((wgram_list + cgram_list, 2))
    elif category == 'Hotel':       # task
        utterances.append((wgram_list + cgram_list, 3))
    elif category == 'Food':        # task
        utterances.append((wgram_list + cgram_list, 4))
    elif category == 'Art':         # task
        utterances.append((wgram_list + cgram_list, 5))
    elif category == 'Weather':     # task
        utterances.append((wgram_list + cgram_list, 6))
    elif category == 'Friends':     # task
        utterances.append((wgram_list + cgram_list, 7))
    elif category == 'Chat':        # chat
        utterances.append((wgram_list + cgram_list, 8))
    else:
        print category, "ERROR"
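bigram_measures and trigram_measures are not defined in this snippet; by convention they would be nltk.collocations.BigramAssocMeasures() and TrigramAssocMeasures(). Under that assumption, the bigram step above works like this small standalone sketch:

import nltk
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

bigram_measures = BigramAssocMeasures()
tokens = nltk.wordpunct_tokenize("book a table for two for dinner")
finder = BigramCollocationFinder.from_words(tokens)
scored = finder.score_ngrams(bigram_measures.raw_freq)
print(sorted(bigram for bigram, score in scored))
# [('a', 'table'), ('book', 'a'), ('for', 'dinner'), ('for', 'two'), ('table', 'for'), ('two', 'for')]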
def getArticleKeywords(articles, maxLength=3):
    """ Parse titles of a number of articles and extract keywords that occur
    in them. A keyword is defined as a grouping of several words, with punctuation
    and stopwords (*nltk.corpus.stopwords.words('english')*) removed. Will 
    also add keywords from every input Article into the corresponding entry
    in the articles list.
    
    Arguments
    ----------
    articles - a list of Articles.
    maxLength - int, the largest number of tokens per keyword.
    
    Returns
    ----------
    2-tuple with numpy.ndarrays of shape (len(articles),) with
        * strings of keywords
        * ints with the number of occurrences of the given keyword in all titles
    
    Example
    ----------
    "A general theory of the plasma of an arc" would return keywords:
        ['A', 'general', 'theory', 'of', 'the', 'plasma', 'of', 'an', 'arc',
        'A general', 'general theory', 'theory of', 'of the', 'the plasma',
        'plasma of', 'of an', 'an arc', 'A general theory', 'general theory of',
        'theory of the', 'of the plasma', 'the plasma of', 'plasma of an', 'of an arc']
    Out of these, ['A','of','the','an','of the','of an'] would be filtered out.
    """
    
    # Identify keywords.
    tokens=[]
    for title in [art.Title for art in articles]:
        tokens.extend(nltk.wordpunct_tokenize(title))
    
    # Filter out meaningless words and punctuation.
    tokens=filter(lambda s: not s.lower() in nltk.corpus.stopwords.words('english') and
        not s in string.punctuation, tokens)

    # Find keywords (length 1, 2, or 3) and how often they occur in all the titles.
    keywords,frequencies=findNGrams(tokens,lengths=range(1,maxLength+1))
    keywords=numpy.array(keywords)
    frequencies=numpy.array(frequencies)
    sortedIndices=frequencies.argsort()[::-1] # Go in descending order of frequencies.
    frequencies=frequencies[sortedIndices]
    keywords=keywords[sortedIndices]

    # Assign keywords to Articles.
    for i in range(len(articles)):
        artTitleTokens=nltk.wordpunct_tokenize(articles[i].Title) # The tokens of this article's title.
        # Filter out meaningless words and punctuation.
        artTitleTokens=filter(lambda s: not s.lower() in nltk.corpus.stopwords.words('english') and
            not s in string.punctuation, artTitleTokens)
        
        # Use the same algorithm but for this article only.
        artKeywords,artFreq=findNGrams(artTitleTokens,lengths=[1,2,3])
        articles[i].Keywords=artKeywords
    
    return keywords,frequencies
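findNGrams is not included in this snippet; a plausible stand-in (an assumption about its behaviour, not the original helper) that returns the n-grams of the requested lengths and their counts might look like:

import collections
import nltk

def find_ngrams_sketch(tokens, lengths=(1, 2, 3)):
    # Hypothetical replacement for findNGrams: space-joined n-grams and their frequencies.
    counts = collections.Counter()
    for n in lengths:
        for gram in nltk.ngrams(tokens, n):
            counts[' '.join(gram)] += 1
    keywords = list(counts)
    frequencies = [counts[k] for k in keywords]
    return keywords, frequencies

print(find_ngrams_sketch(['general', 'theory', 'of', 'plasma'], lengths=[1, 2]))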
def product_features(product):
    name = nltk.FreqDist(normalize_words(nltk.wordpunct_tokenize(product['name'])))
    desc = nltk.FreqDist(normalize_words(nltk.wordpunct_tokenize(product['description'])))
    feats = {}
    for word in name.keys():
        feats['name(%s)' % word] = True

    for word in desc.keys():
        feats['description(%s)' % word] = True
    return feats
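normalize_words is not shown in this snippet; assuming it lower-cases alphabetic tokens, a usage sketch might look like this (sample_product is made up):

import nltk

def normalize_words(tokens):
    # hypothetical stand-in for the normalize_words helper assumed above
    return [t.lower() for t in tokens if t.isalpha()]

sample_product = {'name': 'Stainless Steel Bottle',
                  'description': 'Keeps drinks cold for hours.'}
print(product_features(sample_product))
# e.g. {'name(stainless)': True, 'name(steel)': True, ..., 'description(cold)': True, ...}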
Example #6
  def do_it(self):

    for feed in self.feeds:
        d = feedparser.parse(feed)
        for e in d['entries']:
           words = nltk.wordpunct_tokenize(self.clean_html(e['description']))
           words.extend(nltk.wordpunct_tokenize(e['title']))
           lowerwords=[x.lower() for x in words if len(x) > 1]
           self.ct += 1
           print self.ct, "TITLE",e['title']
           self.corpus.append(lowerwords)
           self.titles.append(e['title'])
           self.links.append(e['link'])



    [[self.key_word_list.add(x) for x in self.top_keywords(self.nkeywords,doc,self.corpus)] for doc in self.corpus]

    self.ct=-1
    for doc in self.corpus:
       self.ct+=1
       print self.ct,"KEYWORDS"," ".join(self.top_keywords(self.nkeywords,doc,self.corpus))



    for document in self.corpus:
        vec=[]
        [vec.append(self.tfidf(word, document, self.corpus) if word in document else 0) for word in self.key_word_list]
        self.feature_vectors.append(vec)



    self.n=len(self.corpus)

    mat = numpy.empty((self.n, self.n))
    for i in xrange(0,self.n):
      for j in xrange(0,self.n):
        mat[i][j] = nltk.cluster.util.cosine_distance(self.feature_vectors[i],self.feature_vectors[j])


    Z = linkage(mat, 'single')

    dendrogram(Z, color_threshold=self.t)





    clusters = self.extract_clusters(Z,self.t,self.n)
     
    for key in clusters:
       print "============================================="  
       for id in clusters[key]:
           print id,self.titles[id]
Example #7
def jaccard(sen_1, sen_2):
  tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(sen_1))
  words = [word for word,pos in tagged_sent if (pos == 'NN' or pos == 'NNS' or pos == 'JJ' or pos == '' or pos == 'VB' or pos == 'VBN' or pos == 'VBD' or pos == 'RB')]

  sen_set_1 = set(words)

  tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(sen_2))
  words = [word for word,pos in tagged_sent if (pos == 'NN' or pos == 'NNS' or pos == 'JJ' or pos == '' or pos == 'VB' or pos == 'VBN' or pos == 'VBD' or pos == 'RB')]

  sen_set_2 = set(words)

  jaccard_value = jaccard_distance(sen_set_1, sen_set_2)
  return jaccard_value
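POSWrapper is not shown in this snippet; the same comparison can be sketched with plain nltk.pos_tag instead (an assumption; it needs the default tagger data and nltk.metrics.distance.jaccard_distance):

import nltk
from nltk.metrics.distance import jaccard_distance

KEEP_TAGS = {'NN', 'NNS', 'JJ', 'VB', 'VBN', 'VBD', 'RB'}

def jaccard_sketch(sen_1, sen_2):
    sets = []
    for sen in (sen_1, sen_2):
        tagged = nltk.pos_tag(nltk.wordpunct_tokenize(sen))
        sets.append({word for word, pos in tagged if pos in KEEP_TAGS})
    return jaccard_distance(sets[0], sets[1])

print(jaccard_sketch("the cat sat on the mat", "a dog sat on the rug"))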
Example #8
def main():
  stem = nltk.stem.LancasterStemmer()
  cleanword = lambda w: stem.stem(w.strip().lower())
  bib = btparse.load(sys.argv[1])
  aid = np.random.randint(len(bib))
  while ('abstract' in bib[aid].keys()) == False:
    aid = np.random.randint(len(bib))
  
  abstract = nltk.wordpunct_tokenize(bib[aid]['abstract']+" "+bib[aid]['title'])
  q_vec0 = sorted([x[0] for x in nltk.pos_tag(abstract) if x[1] in ("NN")])
  
  q_vec = []
  q_val  = []
  for w in q_vec0:
    w = cleanword(w)
    if len(w)>2 and w not in ignore_list and re.search('\\\\',w) == None:
      if (w in q_vec) == False:
        q_vec.append(w)
        q_val.append(1)
      else:
        q_val[-1] += 1
  
  q_val = np.array(q_val)/np.sqrt(np.dot(q_val,q_val))
  prob = np.zeros(len(bib))
  
  if pytools:
    progress = pytools.ProgressBar("Analysing",len(bib))
    progress.draw()
  for ind,entry in enumerate(bib):
    if ind != aid and ('abstract' in bib[ind].keys()):
      abstract = nltk.wordpunct_tokenize(bib[ind]['abstract']+" "+bib[ind]['title'])
      r_vec = sorted([x[0] for x in nltk.pos_tag(abstract) if x[1] in ("NN")])
      r_val = np.zeros(len(q_val))
      for w in r_vec:
        w = cleanword(w)
        if w in q_vec:
          r_val[q_vec.index(w)] += 1
      mod = np.dot(r_val,r_val)
      if mod > 0:
        prob[ind] = np.dot(r_val/np.sqrt(mod),q_val)
    if pytools: progress.progress()
  if pytools: print ""
  
  # sort based on probability (best first)
  inds_sort = np.argsort(prob)[::-1]
  
  print 'similar papers to:\n\t%s\n\t\tby: %s\n'%(bib[aid]['title'],bib[aid]['author'])
  for i in range(10):
    best = inds_sort[i]
    print '%3d.\t%s\n\t\tby: %s\n\t\tid = %3d, prob = %f\n'%(i+1,bib[best]['title'],bib[best]['author'],best,prob[best])
Example #9
def feedTech(request):
    corpus = []
    titles=[]
    ct = -1
    for feed in feeds:
        d = feedparser.parse(feed)
        for e in d['entries']:
            words = nltk.wordpunct_tokenize((e['description']))
            words.extend(nltk.wordpunct_tokenize(e['title']))
            lowerwords=[x.lower() for x in words if len(x) > 1]
            ct += 1
            print (ct, "TITLE",e['title'])
            corpus.append(lowerwords)
            titles.append(e['title'])
    return render(request, 'dash/feeds.html')
Example #10
def tag_files_for_cross_validation(file_list, tmp_models):
    # first clean CV files folder
    if os.path.exists(CV_FILES_PATH_DEFAULT):
        shutil.rmtree(CV_FILES_PATH_DEFAULT)
    if os.path.exists(CV_FILES_PATH_PUNCT):
        shutil.rmtree(CV_FILES_PATH_PUNCT)
    if os.path.exists(CV_FILES_PATH_LOWER):
        shutil.rmtree(CV_FILES_PATH_LOWER)
    if os.path.exists(CV_FILES_PATH_LOWER_PUNCT):
        shutil.rmtree(CV_FILES_PATH_LOWER_PUNCT)

    # then create new CV folders
    os.makedirs(CV_FILES_PATH_DEFAULT)
    os.makedirs(CV_FILES_PATH_PUNCT)
    os.makedirs(CV_FILES_PATH_LOWER)
    os.makedirs(CV_FILES_PATH_LOWER_PUNCT)

    for file_name in file_list:
        path = ORIGINAL_STORIES + '/' + file_name + '.txt'

        if not os.path.isfile(path):
            print('File ' + path + ' does not exist!')
            continue

        content = get_content(path)
        content_lower = content.lower()
        tokenized_content = nltk.wordpunct_tokenize(content)
        tokenized_content_punct = nltk.word_tokenize(content)
        tokenized_content_lower = nltk.wordpunct_tokenize(content_lower)
        tokenized_content_lower_punct = nltk.word_tokenize(content_lower)

        tagged_content = tag_tokens_with_model(tokenized_content, tmp_models.default, lowercase=False, message=False)
        tagged_file_path = CV_FILES_PATH_DEFAULT + '/' + file_name + '.tsv'
        write_tagged_content_to_file(tagged_content, tagged_file_path, message=False)

        tagged_content = tag_tokens_with_model(tokenized_content_punct, tmp_models.punct, lowercase=False,
                                               message=False)
        tagged_file_path = CV_FILES_PATH_PUNCT + '/' + file_name + '.tsv'
        write_tagged_content_to_file(tagged_content, tagged_file_path, message=False)

        tagged_content = tag_tokens_with_model(tokenized_content_lower, tmp_models.lower, lowercase=True, message=False)
        tagged_file_path = CV_FILES_PATH_LOWER + '/' + file_name + '.tsv'
        write_tagged_content_to_file(tagged_content, tagged_file_path, message=False)

        tagged_content = tag_tokens_with_model(tokenized_content_lower_punct, tmp_models.lower_punct, lowercase=True,
                                               message=False)
        tagged_file_path = CV_FILES_PATH_LOWER_PUNCT + '/' + file_name + '.tsv'
        write_tagged_content_to_file(tagged_content, tagged_file_path, message=False)
Example #11
def main():
	text = open('holmes.txt').read()
	tokens = nltk.wordpunct_tokenize(text)
	charList = []
	for word in tokens:
		for char in word:
			charList.append(char)
	fDistChars = nltk.FreqDist(charList)
	fDistWords = nltk.FreqDist(tokens)
	
	print("Answer to 1A, there are {} character types in the book, namely: \n{}".format(len(fDistChars),sorted(fDistChars)))
	print("\nAnswer to 1B, there are {} word types in the book, namely: \n{}".format(len(fDistWords),sorted(fDistWords)))
	
	bigramChars = nltk.bigrams(charList)
	trigramChars = nltk.trigrams(charList)

	print("\nAnswer to 1C, the 20 most common characters are: \nUnigrams: \n{}\nBigrams: \n{}\nTrigrams: \n{}".format(most_common(charList), 
		most_common(bigramChars), most_common(trigramChars)))

	bigramWords = nltk.bigrams(tokens)
	trigramWords = nltk.trigrams(tokens)

	print("\nAnswer to 1D, the 20 most common words are: \nUnigrams: \n{}\nBigrams: \n{}\nTrigrams: \n{}".format(most_common(tokens), 
		most_common(bigramWords), most_common(trigramWords)))
	
	bigram_measures = nltk.collocations.BigramAssocMeasures()
	finder = BigramCollocationFinder.from_words(tokens)
	scoredPMI = finder.score_ngrams(bigram_measures.pmi)
	scoredCHI = finder.score_ngrams(bigram_measures.chi_sq)
	
	print("\nAnswer to 2, the 20 most likely collocations are:\nPMI:\n{} \nChi's square\n{}" .format(scoredPMI[:20],scoredCHI[:20]))
	
	print("\nSpearmans correlation = {}".format(nltk.metrics.spearman.spearman_correlation(scoredPMI, scoredCHI)))
Example #12
def calculate_language_scores(text):
    """
    Estimate how likely the given text is to be written in each of several languages and
    return a dictionary that looks like {'french': 2, 'spanish': 4, 'english': 0}.

    :param text: Text to analyze.
    :type text: str

    :return: Dictionary with languages and unique stopwords seen in analyzed text.
    :rtype: dict(str -> int)

    :raises: TypeError
    """
    if not isinstance(text, basestring):
        raise TypeError("Expected basestring, got '%s' instead" % type(text))
    if not text:
        return {}

    languages_ratios = {}

    # Split the text into separate tokens, using natural language punctuation signs.
    tokens = wordpunct_tokenize(text)
    tokenized_words = [word.lower() for word in tokens]

    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(tokenized_words)
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # language "score"

    return languages_ratios
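A usage sketch (assuming the nltk stopwords corpus is installed): the most likely language is simply the highest-scoring key.

scores = calculate_language_scores("Ceci est un petit exemple de texte en francais")
print(max(scores, key=scores.get))   # expected: 'french'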
Example #13
def descripsToWords(dataFrameOfWords):
   
   # initial parsing
   tokens = nltk.wordpunct_tokenize(' '.join(dataFrameOfWords))
   text = nltk.Text(tokens)
   words = [w.lower() for w in text]
   vocab = sorted(set(words))


   # remove words in removeWords list and punctuation
   removeWords = {'rosario'} # just an example
   filtered_words = [word for word in words if word not in removeWords]
   filtered_words = [w for w in filtered_words if w.isalnum()]

   words = filtered_words

   # check for valid English
   import enchant
   d = enchant.Dict("en_US")
   wordsValid = []
   for w in words:
      if d.check(w):
         wordsValid.append(w) 
      else:
         wordsValid.append(d.suggest(w)[0])	

   words = wordsValid

   return words, vocab
Example #14
def get_words(text, min_length = None, max_length = None):
    """
    Parse the given text as natural language and extract words from it.
    Optionally filter the words by minimum and/or maximum length.

    :param text: Text to parse.
    :type text: str

    :param min_length: Minimum length required. Use None for no limit.
    :type min_length: int | None

    :param max_length: Maximum length allowed. Use None for no limit.
    :type max_length: int | None

    :return: Set of unique words extracted from the text.
    :rtype: set(str)
    """

    # Split the text into separate tokens, using natural language
    # punctuation signs. Then filter out by min/max length, and tokens
    # that aren't strictly alphabetic. Finally, convert the words to
    # lowercase form.
    return {
        word.lower() for word in wordpunct_tokenize(text) if
        (
            word.isalpha() and
            (min_length is None or len(word) >= min_length) and
            (max_length is None or len(word) <= max_length)
        )
    }
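A quick usage example for get_words:

print(get_words("NLTK makes word-level tokenization easy!", min_length=4))
# e.g. {'nltk', 'makes', 'word', 'level', 'tokenization', 'easy'}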
Example #15
def statScore(text,d_index):
	tokens = nltk.wordpunct_tokenize(text)
	val = 0
	for token in tokens:
		w_index = vocabulary.index(token)
		val = val + self.stat_lte[w_index][d_index]
	return val
def translateHinglishTweets(tweets_text):
	counter = 0
	tweets_text_translated = []
	n = len(tweets_text)

	open_file = open("dictionary.pickle", "rb")
	dictionary = pickle.load(open_file)
	open_file.close()

	english_stopwords_set = set(stopwords.words('english'))

	for i in range(n):
		text = tweets_text[i]
		translated_text = ""
		tokens = wordpunct_tokenize(text)
		words = [word.lower() for word in tokens]
		for word in words:
			if word in english_stopwords_set:
				translated_text = translated_text + " " + word
			elif (word in dictionary):
				#print word + "-" + dictionary[word]
				translated_text = translated_text + " " + dictionary[word]
				counter = counter + 1
			else:
				translated_text = translated_text + " " + word
		tweets_text_translated.append(translated_text)

	#print counter
	return tweets_text_translated
Example #17
def convert_to_weka(src, des, voc):
    stemmer = nltk.LancasterStemmer()
    word_reg = re.compile('[0-9A-Za-z]+')
    
    des.write('@relation review_rate\n')
    des.write('\n')
    
    for word in voc:
        des.write('@attribute ' + word + ' real\n')
    des.write('@attribute rate {s1,s2,s3,s4,s5}\n')
    des.write('\n')
    
    des.write('@data\n')
    for line in iter(src.readline, ''):
        feature_vector = []
        try:
            rate, title, review = [item.strip() for item in line.split('\t')[5:8]]
        except (IndexError, ValueError):
            continue
        ws = set([])
        for w in nltk.wordpunct_tokenize(title + ' ' + review):
            m = word_reg.match(w)
            if m:
                ws.add(stemmer.stem(m.group(0).lower()))
        for w in voc:
            if w in ws:
                feature_vector.append('1')
            else:
                feature_vector.append('0')
        des.write(','.join(feature_vector) + ',' + 's' + str(int(math.ceil(float(rate)))) + '\n')
        
    return
Example #18
def tokenize(text):
    """This handles tokenizing and normalizing everything."""
    return [
        token.lower()
        for token in nltk.wordpunct_tokenize(text)
        if token.isalnum()
    ]
def feature_extractor(data):
    """Extract features from a relation for the classifier."""
    features = dict()
    lmtzr = WordNetLemmatizer()

    h2, h3, paragraph = data
    
    features['h2_' + h2.lower()] = True
    for word in h2.split(' '):
        if word.lower() not in stopwords.words('english') and len(word) > 1:
            features['h2word_' + word.lower()] = True
    features['h_' + h2.lower()] = True
    for word in h2.split(' '):
        if word.lower() not in stopwords.words('english') and len(word) > 1:
            features['hword_' + word.lower()] = True

    if h3 != None:    
        features['h3_' + h3.lower()] = True
        for word in h3.split(' '):
            if word.lower() not in stopwords.words('english') and len(word) > 1:
                features['h3word_' + word.lower()] = True
        features['h_' + h3.lower()] = True
        for word in h3.split(' '):
            if word.lower() not in stopwords.words('english') and len(word) > 1:
                features['hword_' + word.lower()] = True
        
    for word in nltk.wordpunct_tokenize(paragraph):
        if word.lower() not in stopwords.words('english') and len(word) > 1:
            features[word] = True
            features['lower_' + word.lower()] = True
            features['lmtzr_' + lmtzr.lemmatize(word).lower()] = True
    return features
def get_vocabulary(utterances):
    token_list = []
    for utt in utterances:
        utt_content = utt[0]
        token_list += nltk.wordpunct_tokenize(utt_content)
    token_set = set(token_list)
    return token_set
Example #21
File: reader.py Project: yokeyong/atap
 def words(self):
     """
     Returns a generator of words.
     """
     for sent in self.sents():
         for word in nltk.wordpunct_tokenize(sent):
             yield word
Example #22
def write_to_mod_html_file(sentences,locs,tex):
	global count
	g_dic = group_locs_by_sentences(locs)
	ll= []
	for l in g_dic.keys():
		ll.append(l)
	ll.sort(cmp=cmp_by_ind)
	for (x,y) in ll:
		l = g_dic[(x,y)]
		sen = sentences[x]
		slash_n_split = sen.splitlines()
		wds = reg_remove_special_chars.sub(r' ',slash_n_split[y])
		words = nltk.wordpunct_tokenize(wds)
		l.sort(cmp=cmp_by_ind)
		for (h,k) in l:
			words[h] = """<i style="color:red">"""+words[h]
			words[k] = words[k]+'</i>'
		
		words = ' '.join(words)
		slash_n_split[y] = words
		sentences[x] = '\n'.join(slash_n_split)
	t = '\n'.join(sentences)

	f = open('html/%d_mod.html'%count, "w")
	t = reg_replace_slashn.sub(r'<br/>',t)
	f.write(t)
	f.close()
	count +=1
Example #23
File: rf.py Project: hrishikeshio/insult
def word_feats(words):
    feats={}
    words=words.strip()
    hasbadw=0
    hasyou=0
    sentences=0
    for sentense in re.split(r' *[\.\?!]["\)\]]* *', words):
        sentences+=1
        for word in nltk.wordpunct_tokenize(sentense):
            for curse in badwords:
                if word.lower().endswith(curse.lower()) or word.lower().startswith(curse.lower()):
                    hasbadw+=1
                    break
                    
            if word.lower() in ("you","u","ur","your","urs","urz","yours"):
                hasyou+=1
        

    feats["you"]=hasyou
    feats["badw"]=hasbadw 
    feats["length"]= len(words)
    feats["caps"]=len(re.findall('[A-Z]', words))
    feats["smalls"]=len(re.findall('[a-z]', words))
    feats["sentences"]=sentences
    feats["capsratio"]=float(feats["caps"])/len(words)
    featslist=[]
    for k,v in feats.iteritems():
        featslist.append(v)
    return featslist
def requirementAnalysis(fileArchimate=None):

    if fileArchimate is None:
        fileArchimate = u"/Users/morrj140/Documents/SolutionEngineering/Archimate Models/DVC v38.archimate"

    al = ArchiLib(fileArchimate)

    conceptsFile = fileConceptsRequirements

    searchTypes = list()
    searchTypes.append(u"archimate:Requirement")
    nl = al.getTypeNodes(searchTypes)

    logger.info(u"Find Words in Requirements...")
    concepts = Concepts(u"Requirement", u"Requirements")
    n = 0
    for sentence in nl:
        n += 1
        logger.debug(u"%s" % sentence)

        c = concepts.addConceptKeyType(u"Document" + str(n), u"Document")
        d = c.addConceptKeyType(sentence, u"Sentence" + str(n))

        if True and sentence is not None:
            cleanSentence = ' '.join([word for word in sentence.split(u" ") if word not in stop])
            for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(cleanSentence)):
                if len(word) > 1 and pos[0] == u"N":
                    e = d.addConceptKeyType(word, u"Word")
                    f = e.addConceptKeyType(pos, u"POS")

    Concepts.saveConcepts(concepts, conceptsFile)
    logger.info(u"Saved : %s" % conceptsFile)

    chunks = Chunks(concepts)
    chunks.createChunks()
Example #25
File: Parser.py Project: mmmarchman/CITA
    def parse(self):
        # Creates a single list from list_strings
        self.word_list.append(' '.join(self.list_strings))

        # Divides the single list string into substrings representing a word
        self.word_list = nltk.sent_tokenize(str(self.word_list[0]))

        # Separates punctuation
        for sentence in self.word_list:
            self.word_list = nltk.wordpunct_tokenize(sentence.lower())

        # Remove all stop words in big_string
        self.word_list = [w for w in self.word_list if w not in self.stop_words]

        # print "List with stopwords removed: " + str(self.word_list)

        stemmer = nltk.PorterStemmer()

        # Stemmer is used to normalize adjective, adverbs, and verbs as well as making sure
        # that plural and singular words become the same
        self.word_list = [stemmer.stem(word) for word in self.word_list]

        # Removes the unicode formatting produced by the stemmer
        self.word_list = [str(word) for word in self.word_list]

        # Creates a frequency distribution based on words in self.word_list
        fdist = nltk.FreqDist(self.word_list)
        fdist = fdist.most_common(self.top_n)

        return fdist
Example #26
def findWinners(tweeters, categories):
	awardResult = {}
	THRESHOLD = 200

	awardPat = re.compile("best .*",re.IGNORECASE)
	winnerPat = re.compile(".*win.*",re.IGNORECASE)
	for twtr in tweeters:
		tweets = twtr.tweets
		for tweet in tweets:
			if winnerPat.match(tweet.text):
				cleanTweet = sanitizeTweet(tweet.text)
				award = awardPat.search(cleanTweet)

				if award:
					properNoun =[]
					firstHalfOfTweet = re.search("(?i).*(?=win)",cleanTweet)
					tokenizedText = nltk.wordpunct_tokenize(firstHalfOfTweet.group())

					if tokenizedText:
						properNoun = extractProperNouns(tokenizedText)
						award = sanitizeAwardName(award.group())
						mostSimilarAward = findSimilarCategory(award, categories)
						
						if mostSimilarAward in awardResult:
							awardResult[mostSimilarAward] +=properNoun
						else:
							awardResult[mostSimilarAward] = properNoun
		THRESHOLD = THRESHOLD -1
		if THRESHOLD<1:
			print("THRESHOLD MET")
			break

	sanitizeAwardResult(awardResult)
Example #27
def findBestWorstDress(tweeters):
	possibleBestDress = []
	possibleWorstDress = []
	bestDressPat = re.compile(".*best dress.*",re.IGNORECASE)
	worstDressPat = re.compile(".*worst dress.*",re.IGNORECASE)
	pat = ""
	for twtr in tweeters:
		for twt in twtr.tweets:
			properNoun =[]
			if bestDressPat.match(twt.text):
				pat = "best"
			elif worstDressPat.match(twt.text):
				pat = "worst"
			else:
				continue
			firstHalfOfTweet = re.search("(?i).*(?=%s)" % pat,twt.text)
			tokenizedText = nltk.wordpunct_tokenize(firstHalfOfTweet.group())

			if tokenizedText:
				properNoun = extractProperNouns(tokenizedText)
				for pn in properNoun:
					if len(pn.split())==2 :
						if pat == 'best':
							possibleBestDress.append(pn)
						else:
							possibleWorstDress.append(pn)

	bestData = collections.Counter(possibleBestDress)
	worstData = collections.Counter(possibleWorstDress)
	print("\n\nList of Best Dressed:\n========================")
	for host in bestData.most_common()[0:5]:
		print(host[0])
	print("\n\nList of Worst Dressed:\n========================")
	for host in worstData.most_common()[0:5]:
		print(host[0])
def _calculate_languages_ratios(text): 
    """
    Estimate how likely the given text is to be written in each of several languages and
    return a dictionary that looks like {'french': 2, 'spanish': 4, 'english': 0}.

    @param text: Text whose language is to be detected
    @type text: str
    
    @return: Dictionary with languages and unique stopwords seen in analyzed text
    @rtype: dict
    """

    languages_ratios = {}

    '''
    nltk.wordpunct_tokenize() splits punctuation marks into separate tokens
    '''

    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]

    # For each language included in nltk, count the unique stopwords appearing in the analyzed text
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)

        languages_ratios[language] = len(common_elements) # language "score"

    return languages_ratios
Example #29
def get_bigram_dict(filename):
    input_file = codecs.open(filename, 'r', encoding='utf8')
    content = input_file.read()
    dic = {}
    tokens = nltk.wordpunct_tokenize(content)
    finder = BigramCollocationFinder.from_words(tokens)
    return finder.ngram_fd
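finder.ngram_fd is an nltk FreqDist keyed by bigram tuples, so the value returned above can be queried directly; a small usage sketch:

import nltk
from nltk.collocations import BigramCollocationFinder

tokens = nltk.wordpunct_tokenize("to be or not to be")
fd = BigramCollocationFinder.from_words(tokens).ngram_fd
print(fd[('to', 'be')])      # 2
print(fd.most_common(3))     # the most frequent bigrams first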
Example #30
def findPresenters(twtrs):
	possiblePresenters = {}
	patterns = ["presenting an award", "presenting for best", "presenting best", "presents .* best", "presenting at the", "presents at the", "is presenting"]

	for twtr in twtrs:
		for twt in twtr.tweets:
			text = twt.text
			for pattern in patterns:
				rePat = re.compile(".* %s .*" % pattern, re.IGNORECASE)
				if rePat.match(text):
					cleanText = re.search("(?i).*(?=%s)" % pattern, text).group()
					cleanText = sanitizeTweetForPresenters(cleanText)
					if cleanText:
						properNouns = extractProperNouns(nltk.wordpunct_tokenize(cleanText))
						
						for properNoun in properNouns:
							properNoun = sanitizeSlang(properNoun)
							if len(properNoun.split()) >= 2 and not properNoun.isupper():
								if properNoun not in possiblePresenters:
									possiblePresenters[properNoun] = twtr.score
								else:
									possiblePresenters[properNoun] = possiblePresenters[properNoun] + twtr.score
					break

	sorted_presenters = OrderedDict(sorted(possiblePresenters.items(), key=lambda possiblePresenters: possiblePresenters[1], reverse=True))

	print("\n\nList of Presenters:\n========================")
	for presenter in sorted_presenters.keys():
		if sorted_presenters[presenter] > 0:
			print(presenter, sorted_presenters[presenter])
Example #31
def classify(url):
    try:
        url = url.replace("*", "/")
        complete_data = []
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html5lib')
        for script in soup(["script", "style"]):
            script.extract()
        raw_data = soup.get_text()
        words = set(nltk.corpus.words.words())
        raw_data = " ".join(w for w in nltk.wordpunct_tokenize(raw_data)
                            if w.lower() in words)

        nlp = spacy.load("en")
        file_text = nlp(raw_data)

        words = [
            token.lemma_ for token in file_text if not token.is_punct
            and not token.like_num and not token.is_space and not token.is_stop
        ]
        strip_data = [
            token.lower() for token in words
            if not len(token.strip()) < 2 and not len(token.strip()) > 15
        ]
        if len(strip_data) > 30:
            frequencies_words = FreqDist(strip_data).most_common(100)
            words_most_frequent = [word[0] for word in frequencies_words]
            untokenize_data = TreebankWordDetokenizer().detokenize(strip_data)
            complete_data.append(untokenize_data)

            vocabalary = pickle.load(open(configuration.vocabulary_path, "rb"))
            data = vocabalary.transform(complete_data)
            with open(configuration.classifier_model_path, 'rb') as fid:
                model_load = cPickle.load(fid)
            y_predict = model_load.predict(data)
            array_percentage = model_load.predict_proba(data)
            array_percentage = array_percentage * 100
            print(array_percentage[:].round(2))

            file = open(configuration.website_category_path, "r+")
            output = file.read()
            dic = json.loads(output)
            file.close()
            target_dict = {}
            category_url_list = []
            for key, value in dic.items():
                target_dict[int(key)] = value
                category_url_list.append(value)
            print(target_dict)
            print(category_url_list)

            result_percent = array_percentage[:, y_predict[0] - 1][0]
            result_percent = result_percent.round(2)
            if result_percent > 30:
                if len(strip_data) < 500:
                    return (
                        str(target_dict[y_predict[0]]),
                        "Note: Classification result may be inaccurate due to minimal content in the website and it's accuracy is "
                        + str(result_percent) + " %",
                        "You can add you own category for your website. If the name of the category peresent in the below list use that name. Or else create your own in this format http://127.0.0.1:8000/(url)?category=(category). For exmaple: http://127.0.0.1:8000/https:**www.mdpi.com*journal*agriculture?category=agriculture",
                        category_url_list, words_most_frequent)
                else:
                    return (
                        target_dict[y_predict[0]],
                        "Accuracy of the classification is " +
                        str(result_percent) + " %",
                        "You can add you own category for your website. If the name of the category peresent in the below list use that name. Or else create your own in this format http://127.0.0.1:8000/(url)?category=(category). For exmaple: http://127.0.0.1:8000/https:**www.mdpi.com*journal*agriculture?category=agriculture",
                        category_url_list, words_most_frequent)
            else:
                return (
                    "Given website is not related to space, job portal, adult, animals, news category",
                    "May be it is related to " +
                    str(target_dict[y_predict[0]]) + " and it's accuracy is " +
                    str(result_percent) + " %",
                    "You can add you own category for your website. If the name of the category peresent in the below list use that name. Or else create your own in this format http://127.0.0.1:8000/(url)?category=(category). For exmaple: http://127.0.0.1:8000/https:**www.mdpi.com*journal*agriculture?category=agriculture",
                    category_url_list, words_most_frequent)

        else:
            return (
                "Can't extract content from the website",
                "Site may be invalid or unavailable or having very few content",
                "Not available", "Not available", "Not available")

    except Exception as e:
        return ("Facing error while parsing the website", e, "Not available",
                "Not available", "Not available")

#encoding / decoding
type(html)
html=html.decode()
type(html)


# In[34]:


#clean the corpus to obtain plain text
#raw=nltk.clean_html(html) no longer works; just use the function provided by bs4
# http://www.crummy.com/software/BeautifulSoup
raw=BeautifulSoup(html).get_text()
tokens=nltk.wordpunct_tokenize(raw)
#print(tokens)
#print(type(tokens))
raw1=raw[750:23506]
#print(raw1)
text=nltk.Text(tokens)
words=[w.lower()for w in text]
vocab = sorted(set(words))
print('word:',vocab,len(words),'++',len(vocab))
print('token:',tokens,len(tokens))


# In[19]:


tokens=tokens[96:399]
Example #33
import nltk
from nltk import word_tokenize, wordpunct_tokenize
from nltk.util import ngrams
classifier = nltk.data.load("classifiers/plusminus.pickle")
openfile = open('hello.txt', 'r')
twe = openfile.read()
words = wordpunct_tokenize(twe)
feats = dict([(word, True) for word in words + list(ngrams(words, 2))])
xa = classifier.classify(feats)
if xa == 'pos':
    classifier = nltk.data.load("classifiers/happyfunny.pickle")
    posi = classifier.classify(feats)
    if posi == 'pos':
        openfile = open('mood.js', 'w')
        openfile.write('var md=1;')
    elif posi == 'neg':
        openfile = open('mood.js', 'w')
        openfile.write('var md=2;')
elif xa == 'neg':
    classifier = nltk.data.load("classifiers/sadangry.pickle")
    posi = classifier.classify(feats)
    if posi == 'pos':
        openfile = open('mood.js', 'w')
        openfile.write('var md=3;')
    elif posi == 'neg':
        openfile = open('mood.js', 'w')
        openfile.write('var md=4;')
Example #34
def tokenize_text(input_str: str = "") -> list:
    return nltk.wordpunct_tokenize(input_str)
Example #35
from nltk import wordpunct_tokenize, pos_tag, ne_chunk
import nltk

nltk.download('maxent_ne_chunker')
nltk.download('words')

f = open('input.txt', 'r', encoding='utf-8')
input = f.read()

stokens = nltk.sent_tokenize(input)

for i in stokens:
    print(ne_chunk(pos_tag(wordpunct_tokenize(i))))
Example #36
stopwords = [
    'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
    'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am',
    'is', 'are', 'be', 'was', 'were', 'be', 'been', 'being', 'have', 'has',
    'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and',
    'but', 'if', 'or', 'as', 'of', 'at', 'by', 'for', 'with', 'about', 'above',
    'below', 'into', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off',
    'over', 'under', 'then', 'here', 'there', 'when', 'where', 'all', 'any',
    'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'only',
    'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just',
    'don', 'should', 'i', 'me', 'my', 'myself', 'you', 'your', 'yourself',
    'we', 'us', 'ourselves', 'ourself', 'll', 've', 'd', 're', 'm'
]

#preprocessing
tokens = nltk.wordpunct_tokenize(engltext.decode('utf8'))
text = nltk.Text(tokens)
englwords = [w.lower() for w in text if w.isalpha()]
englwords = [w for w in englwords if w not in stopwords]
englwords = nltk.pos_tag(englwords)

#lemmatization of input text
wordnet_lemmatizer = WordNetLemmatizer()

engltextlemmas = []
for w, pos in englwords:
    if get_wordnet_pos(pos):
        engltextlemmas.append(
            wordnet_lemmatizer.lemmatize(w, get_wordnet_pos(pos)))
    else:
        engltextlemmas.append(wordnet_lemmatizer.lemmatize(w))
Example #37
 def stemWords(text):
     tokens = nltk.wordpunct_tokenize(text)
     return ' '.join([ self.stemmer.stem(t).lower() for t in tokens ])
Example #38
def rmStopWords(text):
    stopwords = nltk.corpus.stopwords.words('english')
    tokens = nltk.wordpunct_tokenize(text)
    nostop = [t for t in tokens if t not in stopwords]
    return ' '.join(nostop)
Example #39
    def is_english(self, text):

        text = text.lower()
        words = set(nltk.wordpunct_tokenize(text))
        return len(words & self.ENGLISH_STOPWORDS) > len(words & self.NON_ENGLISH_STOPWORDS)
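ENGLISH_STOPWORDS and NON_ENGLISH_STOPWORDS are not defined in this snippet; one plausible way to build them from the nltk stopwords corpus (an assumption about the surrounding class, not the original code) would be:

from nltk.corpus import stopwords

ENGLISH_STOPWORDS = set(stopwords.words('english'))
NON_ENGLISH_STOPWORDS = set(stopwords.words()) - ENGLISH_STOPWORDS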
Example #40
import nltk
from nltk import ngrams, ne_chunk, wordpunct_tokenize, pos_tag
from nltk.stem import LancasterStemmer, WordNetLemmatizer

with open('output.txt', 'r', encoding='utf-8') as f:
    raw = f.read()
#Tokenization
wtokens = nltk.word_tokenize(raw)
words = [word.lower() for word in wtokens if word.isalpha()]
print(words)
#Adding tag
print(nltk.pos_tag(words))
lStem = LancasterStemmer()
print(
    "Lancaster Stemming :----------------------------------------------------- \n"
)
for tok in words:
    print(lStem.stem(str(tok)))
lemmatizer = WordNetLemmatizer()
print(
    "Lemmatization ------------------------------------------------------------:\n"
)
for tok in words:
    print(lemmatizer.lemmatize(str(tok)))
print("Trigrams --------------------------------------------:\n")
trigram = []
x = 0

trigram.append(list(ngrams(words, 3)))
print(trigram)
print("NER-------------------------------------\n")
print("NER : \n", ne_chunk(pos_tag(wordpunct_tokenize(str(words)))))
Example #41
def do_process(file_list):
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))
    docs = {}
    i = 1
    for file in file_list:

        lowercased = file.lower()

        #words=re.findall(r'[a-zA-Z]+[\w\']+',lowercased)
        words = nltk.wordpunct_tokenize(lowercased)
        # words = [w for w in words if (w.isalpha() and len(w) != 1 and w not in stop_words)]
        words = [w for w in words if (w.isalpha() and len(w) != 1)]

        words_set = sorted(set(words))

        word_dic = {}

        for word in words_set:
            word_dic[word] = words.count(word)

        word_sorted = collections.OrderedDict(sorted(word_dic.items()))
        # word_sorted = sorted(word_hashmap.items(), key=operator.itemgetter(0))

        # headers=['term', 'frequency']
        # print(tabulate(word_hashmap_sorted, headers=headers))

        doc_name = "document%d" % i + ":"
        i += 1
        docs[doc_name] = word_sorted

        # print("\n## "+doc_name)
        # print("## full length %d" %  len(words))
        # print("## set length %d" % len(words_set)+"\n")

    f = open('answers.txt', 'w+')
    # index 140323F
    k = 23
    docs = collections.OrderedDict(sorted(docs.items()))

    f.write("140323F\n")

    f.write('1\n')
    for key, value in docs.items():
        s = key + "%d" % len(value)
        f.write(s + "\n")
    f.write("\n")

    f.write('2\n')
    for key, value in docs.items():
        kth_key = list(value)[k]
        s = key + "" + kth_key + ",%.3f" % tf_w(value[kth_key])
        f.write(s + "\n")
    f.write("\n")

    f.write('3\n')
    for key, value in docs.items():
        kth_key = list(value)[k]
        s = key + "" + kth_key + ",%.3f" % idf(kth_key, docs)
        f.write(s + "\n")
    f.write("\n")

    f.write('4\n')
    for key, value in docs.items():
        sorted_tf_idf_tuple_list = sorted_by_tf_idf(value, docs)
        i = 1
        s = key
        for item in sorted_tf_idf_tuple_list:
            if i < 11:

                s += item[0]
                if i != 10:
                    s += ","
                i += 1
                # print(item[0],item[1])
            else:
                continue
        f.write(s + "\n")
    f.close()
    f = open('answers.txt', 'r')
    written = f.read()
    f.close()
    print("wrote to file\n\n" + written)
Example #42
def dummy_reader():
    article = dummy_articles().next()
    reader = Mock()
    nltkwrapper.PlaintextCorpusReader = Mock(return_value=reader)
    reader.words.return_value = nltk.wordpunct_tokenize(article)
    reader.sents.return_value = [nltk.wordpunct_tokenize(article) for sent in nltk.sent_tokenize(article)]
Example #43
def tokenizator(html):
    page_content = BeautifulSoup(html).get_text()
    result = list(nltk.wordpunct_tokenize(page_content))
    result = minus_znak_prep(result)
    result = list(filter(minus_incorrect_sym, result))
    return result
Example #44
data = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold."""

cut_tokens = nltk.wordpunct_tokenize(data)
tokens = list()
stop_words.add('.')
stop_words.add('?')
stop_words.add(')')
stop_words.add(').')
stop_words.add('(')
stop_words.add('.(')
stop_words.add(',')
stop_words.add('/')
stop_words.add('-')
stop_words.add('_')
stop_words.add('+')
stop_words.add('$')
stop_words.add('&')
stop_words.add('!')
Example #45
def remove_non_english_words(text):
    words = set(nltk.corpus.words.words())
    result = " ".join(w for w in nltk.wordpunct_tokenize(text)
                      if w.lower() in words or not w.isalpha())
    return result
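A usage example (it needs the nltk words corpus; unknown alphabetic tokens are dropped while non-alphabetic tokens are kept):

print(remove_non_english_words("the quick brown fox zzqy 42"))
# e.g. 'the quick brown fox 42'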
Example #46
def has_blog_candidate(description):
    words = set(nltk.wordpunct_tokenize(description))
    return len(blog_keywords & words) > 0
def tokenize(data):
    return nltk.wordpunct_tokenize(data)
Example #48
 def extract(self, text):
     tokens = nltk.wordpunct_tokenize(text)
     result = []
     for t in tokens:
         result.append((0, 0, self._stemmer.stem(t), 1.0))
     return result
Example #49
File: hw3.py Project: MosesZ/NLP
    for row in reader:
        documents.append(row[3])

documents = documents[:100]
filtered_text = []

stop_words = stopwords.words("russian")
stop_words.extend(['rt'])

ps = PorterStemmer()
wnl = WordNetLemmatizer()

for d in documents:

    d = strip_all_entities(strip_links(d.lower()))
    tokens = nltk.wordpunct_tokenize(d)
    filtered_tokens = [
        w for w in tokens if (w not in stop_words and not di.check(w))
    ]
    stemm_tokens = [(ps.stem(w)) for w in filtered_tokens]
    filtered_text.append(stemm_tokens)

words = []
words.append("")
words_count = []
words_count.append(0)

for tokens in filtered_text:
    for token in tokens:
        if token not in words:
            words.append(token)
Example #50
    def get_LDA(self):
        articles = self.petitionDocs
        cur=articles.find({"GT"}, no_cursor_timeout=True)
        cur1=cur.sort('user_id', 1)
        cursor=cur1.limit(int(self.Eighty))
        doc_complete=[]
        doc_clean=[]
        doc_completeT=[]
        doc_cleanT=[]
        tList=[]
        tListT = []
        count=0
        exclude = set(string.punctuation)
        stop = set(stopwords.words('english'))
        lines = open("stop3").read().splitlines()
        for word in lines:
            print word
            stop.add(word)
        mysqlStop = ["a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all",
                     "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst",
                     "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything", "anyway",
                     "anywhere", "are", "around", "as", "at", "back", "be", "became", "because", "become", "becomes",
                     "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides",
                     "between", "beyond", "bill", "both", "bottom", "but", "by", "call", "can", "cannot", "cant", "co",
                     "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due",
                     "during", "each", "eg", "eight", "either", "eleven", "else", "elsewhere", "empty", "enough", "etc",
                     "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen",
                     "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found",
                     "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have",
                     "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself",
                     "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed",
                     "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least",
                     "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more",
                     "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither",
                     "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing",
                     "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other",
                     "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps",
                     "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious",
                     "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some",
                     "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system",
                     "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there",
                     "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thickv", "thin",
                     "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to",
                     "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until",
                     "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when",
                     "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon",
                     "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose",
                     "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself",
                     "yourselves", "the"]
        for word in mysqlStop:
            stop.add(word)

        for p in cursor:
            #strip HTML tags from tweet
            strip = strip_tags(p['title'].encode('UTF-8')+" "+p['overview'].encode('UTF-8'))
            doc_complete.append(strip)
            strip1=TextCleaning.cleanURLEmailMention(self,strip)
            words = set(nltk.corpus.words.words())
            EnCleanedDoc=" ".join(w for w in nltk.wordpunct_tokenize(strip1) \
                 if w.lower() in words or not w.isalpha())
            EnCleanedDoc = unicode(EnCleanedDoc, errors='ignore')
            text = nltk.word_tokenize(EnCleanedDoc)
            posTag = nltk.pos_tag(text)
            countAdjAdv=0
            countNounVerb=0
            for cat in posTag:
                if cat[1] in verbNoun:
                    countNounVerb+=1
                elif cat[1] in adverbAdjectives:
                    countAdjAdv+=1
            if countNounVerb ==0:
                expressivness=0
            else:
                expressivness=float(countAdjAdv) / float(countNounVerb)
            cleaneddoc=TextCleaning.clean(self,EnCleanedDoc,stop,exclude)
            doc_clean.append(cleaneddoc)
            tList.append(p['petition_id'])
            count+=1
            self.petitionDocs.update({"petition_id": p['petition_id']}, {"$set": {"LDA_cleanedDescription": EnCleanedDoc,"expressivness":expressivness}}, False, False)


        # list for tokenized documents in loop
        doc_tok = [doc.split() for doc in doc_clean]
        # Creating the term dictionary of our corpus, where every unique term is assigned an index.
        dictionary = corpora.Dictionary(doc_tok)
        # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
        doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_tok]


        # Prepare the testing dataset
        countT=0
        cur=articles.find({}, no_cursor_timeout=True)
        cur1=cur.sort('user_id', -1)
        cursor=cur1.limit(int(self.Twenty))
        for p in cursor:
            #strip HTML tags from tweet
            strip = strip_tags(p['title'].encode('UTF-8')+" "+p['overview'].encode('UTF-8'))
            doc_completeT.append(strip)
            strip1=TextCleaning.cleanURLEmailMention(self,strip)
            words = set(nltk.corpus.words.words())
            EnCleanedDoc=" ".join(w for w in nltk.wordpunct_tokenize(strip1) \
                 if w.lower() in words or not w.isalpha())
            EnCleanedDoc = unicode(EnCleanedDoc, errors='ignore')
            text = nltk.word_tokenize(EnCleanedDoc)
            posTag = nltk.pos_tag(text)
            countAdjAdv=0
            countNounVerb=0
            for cat in posTag:
                if cat[1] in verbNoun:
                    countNounVerb+=1
                elif cat[1] in adverbAdjectives:
                    countAdjAdv+=1
            if countNounVerb ==0:
                expressivness=0
            else:
                expressivness=float(countAdjAdv) / float(countNounVerb)
            cleaneddoc=TextCleaning.clean(self,EnCleanedDoc,stop,exclude)
            doc_cleanT.append(cleaneddoc)
            tListT.append(p['petition_id'])
            countT+=1
            self.petitionDocs.update({"petition_id": p['petition_id']}, {"$set": {"LDA_cleanedDescription": EnCleanedDoc,"expressivness":expressivness}}, False, False)

        # list for tokenized documents in loop
        doc_tokT = [doc.split() for doc in doc_cleanT]
        # Creating the term dictionary of our corpus, where every unique term is assigned an index.
        dictionaryT = corpora.Dictionary(doc_tokT)
        # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
        doc_term_matrixT = [dictionaryT.doc2bow(doc) for doc in doc_tokT]


        # Creating the object for LDA model using gensim library
        Lda = gensim.models.ldamodel.LdaModel

        # Running LDA with different numbers of topics and getting the lowest perplexity
        topics=[10,30,40,50,80,100]
        perplexity=[]
        for top in topics:
            # Running and training the LDA model on the document-term matrix.
            ldamodel = Lda(doc_term_matrix, num_topics=top, id2word=dictionary, passes=50)
            LDAOut=ldamodel.print_topics(num_topics=top, num_words=10)
            perplex = Lda.bound(ldamodel, doc_term_matrixT)
            with open('LDAout'+str(top)+'topics.txt', 'w') as f:
                print >> f, "-----------Run for "+str(top)+" topics ---------------------------------------------"
                print >> f, LDAOut
                print >> f, "-------------------------------------------------------------------------"
                print >> f, "perplexity ="+str(perplex)
                print >> f, "-----------Run for "+str(top)+" topics ---------------------------------------------"
            print "-----------Run for " + str(top) + " topics ---------------------------------------------"
            print LDAOut
            print  "-------------------------------------------------------------------------"
            print "perplexity ="+str(perplex)
            print "-----------Run for "+str(top)+" topics ---------------------------------------------"
            # record the held-out bound for this number of topics
            perplexity.append(perplex)

            plen=count
            for it in range(0,plen):
                print 'saving petition topic probability distributions for ' + str(top) + ' topics'
                try:
                    post = {}
                    # Prepare LDA topic scores for each petition
                    post['LDA_topic'+str(top)] = ldamodel[doc_term_matrix[it]]
                    print ''
                    # Update topic score in database
                    self.petitionDocs.update({'petition_id': tList[it]}, {"$set": post}, upsert=False)
                    str1 = ''.join(str(post['LDA_topic'+str(top)]))
                    print str(tList[it]) + ' has topics ' + str1
                except Exception as e:
                    print 'error in setting LDA for petition: ' + str(tList[it])
                    print(e)

            # testing
            plen = countT
            for it in range(0, plen):
                print 'saving petition topic probability distributions for ' + str(top) + ' topics'
                try:
                    post = {}
                    # Prepare LDA topic scores for each petition
                    post['LDA_topic' + str(top)] = ldamodel[doc_term_matrixT[it]]
                    print ''
                    # Update topic score in database
                    self.petitionDocs.update({'petition_id': tListT[it]}, {"$set": post}, upsert=False)
                    str1 = ''.join(str(post['LDA_topic' + str(top)]))
                    print str(tListT[it]) + ' has topics ' + str1
                except Exception as e:
                    print 'error in setting LDA for petition: ' + str(tListT[it])
                    print(e)
        # Prepare the data
        xArr = np.array(topics)
        yArr = np.array(perplexity)
        # Plot the data

        fig = plt.figure()
        plt.plot(xArr, yArr, label='linear')
        fig.suptitle('Held-out per-word perplexity', fontsize=20)
        plt.xlabel('Number of Topics', fontsize=16)
        plt.ylabel('Perplexity', fontsize=16)

        # Save the figure before showing it, then display the plot
        fig.savefig('perp')
        plt.show()
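A minimal sketch, assuming gensim's LdaModel API, of turning the held-out bound above into a per-word perplexity; the helper name and the exp2 conversion convention are assumptions (mirroring the conversion gensim itself logs), not part of the example:

import numpy as np

def heldout_perplexity(lda_model, heldout_corpus):
    # log_perplexity() returns the per-word likelihood bound on the held-out
    # corpus; exp2(-bound) is reported as the perplexity estimate (lower is better).
    per_word_bound = lda_model.log_perplexity(heldout_corpus)
    return np.exp2(-per_word_bound)

# e.g. perplexity.append(heldout_perplexity(ldamodel, doc_term_matrixT))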
Example #51
0
File: app.py  Project: Kolya59/freq

# Write tag's fields to the output
def save(output, tag):
    output.write('{0} {1}\n'.format(tag[0], tag[1]))


if __name__ == '__main__':
    # Read text
    f = open('./data/input.txt', 'r')
    source = f.read()
    # Build the English stopword set from nltk
    stop_words = set(stopwords.words('english'))

    # Tokenize text
    tokens = nltk.wordpunct_tokenize(source)
    # Filter stop words
    tokens = [
        i for i in tokens
        if (i not in string.punctuation and i.lower() not in stop_words)
    ]

    # Analyze each word
    morph = pymorphy2.MorphAnalyzer()
    # Convert tokens to tags
    tags = [morph.parse(i) for i in tokens]
    # Build an ordered dict: key = normal form, value = frequency
    ordered = collections.OrderedDict(
        sorted(
            # Convert each token to normal form and count frequency
            collections.Counter([t[0].normal_form for t in tags]).items(),
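# The listing is cut off above; a hypothetical completion (an assumption, not
# the project's actual code) would sort by descending frequency and write each
# (word, count) pair with the save() helper defined earlier:
#
#     ordered = collections.OrderedDict(
#         sorted(
#             collections.Counter([t[0].normal_form for t in tags]).items(),
#             key=lambda item: item[1], reverse=True))
#     with open('./data/output.txt', 'w') as output:  # assumed output path
#         for word, count in ordered.items():
#             save(output, (word, count))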
Example #52
0
    'http://feeds.reuters.com/reuters/technologyNews',
    'http://www.tweaktown.com/news-feed/'
]

import feedparser
import nltk
from bs4 import BeautifulSoup

corpus = []
titles = []
ct = -1
for feed in feeds:
    d = feedparser.parse(feed)
    for e in d['entries']:
        soup = BeautifulSoup(e['description'], 'html.parser')
        words = nltk.wordpunct_tokenize(soup.get_text())
        words.extend(nltk.wordpunct_tokenize(e['title']))
        lowerwords = [x.lower() for x in words if len(x) > 1]
        ct += 1
        print ct, "TITLE", e['title']
        corpus.append(lowerwords)
        titles.append(e['title'])


import math
from operator import itemgetter


def freq(word, document): return document.count(word)
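The snippet stops right after freq(); below is a sketch of the tf-idf helpers such a script typically pairs with it, using the standard textbook definitions (assumed rather than taken from the project) and the freq() and math import above:

def wordCount(document): return len(document)

def numDocsContaining(word, documentList):
    return sum(1 for document in documentList if freq(word, document) > 0)

def tf(word, document):
    return freq(word, document) / float(wordCount(document))

def idf(word, documentList):
    # assumes the word occurs in at least one document
    return math.log(len(documentList) / float(numDocsContaining(word, documentList)))

def tfidf(word, document, documentList):
    return tf(word, document) * idf(word, documentList)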

Example #53
0
body = soup.find('div', {'class': 'mw-parser-output'})
file2.write(str(body.text))

with open('input.txt', 'r', encoding="utf8") as inputData:
    TextData = inputData.read().replace('\n', '')
'''with open('input.txt',encoding='utf8') as data:
    text= data.read().strip()'''

tokens = word_tokenize(TextData)
pos = nltk.pos_tag(tokens)
print(tokens[1:10])
print(pos[1:10])
from nltk.stem import PorterStemmer
ps = PorterStemmer()
for w in tokens:
    print(w, ":", ps.stem(w))

from nltk.stem import WordNetLemmatizer
lem = WordNetLemmatizer()
for m in tokens:
    print(m, ":", lem.lemmatize(m))

from nltk import ngrams
trigrams = list(ngrams(TextData.split(), 3))
for gram in trigrams:
    print(gram)

from nltk import wordpunct_tokenize, pos_tag, ne_chunk
print(ne_chunk(pos_tag(wordpunct_tokenize(TextData))))
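As a follow-up to the ne_chunk() call above, a small sketch (an addition, not part of the original snippet) that walks the resulting tree and collects (entity, label) pairs:

from nltk.tree import Tree

def extract_entities(chunked):
    # Collect (entity text, entity label) pairs from an ne_chunk() tree.
    entities = []
    for subtree in chunked:
        if isinstance(subtree, Tree):
            name = " ".join(token for token, _tag in subtree.leaves())
            entities.append((name, subtree.label()))
    return entities

print(extract_entities(ne_chunk(pos_tag(wordpunct_tokenize(TextData)))))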
Example #54
0
 def tokenization(self):
     self.__sentence = nltk.wordpunct_tokenize(self.__sentence)
Example #55
0
 def remove_short_words(self, str, length=3):
     """Removes any word with length >= 3 (default) or user defined"""
     return " ".join(w for w in nltk.wordpunct_tokenize(str)
                     if len(w) >= length)
Example #56
0
 def remove_stop_words(self, str):
     return " ".join(w for w in nltk.wordpunct_tokenize(str) \
                     if w not in self.stpwrds)
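For reference, a self-contained functional sketch combining the two filters above; the function name, defaults, and the nltk stopword list are assumptions, not the original class:

import nltk
from nltk.corpus import stopwords

def clean_text(text, stpwrds=None, min_len=3):
    # Drop stop words first, then any remaining token shorter than min_len.
    stpwrds = set(stopwords.words('english')) if stpwrds is None else set(stpwrds)
    kept = [w for w in nltk.wordpunct_tokenize(text.lower()) if w not in stpwrds]
    return " ".join(w for w in kept if len(w) >= min_len)

# clean_text("This is a tiny example sentence for the cleaning helpers")
# -> 'tiny example sentence cleaning helpers'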
Example #57
0
import os
import pip
import nltk
from nltk.corpus import wordnet as wn
from collections import defaultdict

# Install googlesearch if not yet installed
if 'googlesearch' not in map(lambda x: x.project_name,
                             pip.get_installed_distributions()):
    os.system('sudo pip install googlesearch')
from googlesearch import GoogleSearch as gs

googleResults = lambda key: gs('%s and' % key).top_results()
sentenceToWords = lambda sent: [w.lower() for w in\
    nltk.Text(nltk.wordpunct_tokenize(sent))\
    if w.isalpha() and len(w) > 1]

# Get the adjectives
adjs = [synset.lemma_names()[0] for synset in list(wn.all_synsets(wn.ADJ))]
'''
Get the words that follow "<key> and",
e.g. "interesting" as in "weird and interesting", with "weird" as the key.
'''


def similarWords(key):
    rtn = []
    query = '%s and ' % (key)
    search = gs(query).top_results()
    for result in search:
        content = sentenceToWords(result['content'])
Example #58
0
for item in tribute_text:
    item = remove_tags(str(item))
    item = item.replace('\n', ' ')
    cleaned_tribute_text.append(item)
    
# print(cleaned_tribute_text)


english_story_text = []
english_tribute_text = []

words = set(nltk.corpus.words.words())

for item in cleaned_story_text:
    # print(item)
    curr = " ".join(w for w in nltk.wordpunct_tokenize(item) \
                 if w.lower() in words or not w.isalpha())
    english_story_text.append(curr)

print(cleaned_story_text[0:10])
print(english_story_text[0:10])

stemmer = SnowballStemmer('english')

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
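            # assumed completion (the listing is truncated here); this follows
            # the common gensim simple_preprocess + STOPWORDS pattern
            result.append(lemmatize_stemming(token))
    return result

# e.g. processed_docs = [preprocess(doc) for doc in english_story_text]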
Example #59
0
    def get_nnp_ngrams(self, original_text, highlight=5, minsize=0):

        keywords_by_position = []
        minsize = minsize - 1
        if minsize < 0:
            minsize = 0
        tokens = nltk.wordpunct_tokenize(original_text)
        tagged = nltk.pos_tag(tokens)
        doc_length = len(tokens)
        counter = 0
        counter2 = 0
        if highlight == 0:
            concated_test = doc_length  # no explicit limit: consider n-grams up to the document length (3 is a sensible cap)
        else:
            concated_test = highlight
        list_of_NNPs = []

        while counter < (doc_length - 1):
            while counter2 < concated_test:
                counter2 = counter2 + 1
                counter3 = 0

                temp_array = []
                all_nnp = True
                while counter3 < counter2:
                    if counter < (doc_length - counter3):
                        temp_array.append(tokens[counter + counter3])
                        if tagged[counter + counter3][1] != 'NNP':
                            all_nnp = False
                    counter3 = counter3 + 1

                counter3 = 0
                if all_nnp:
                    if (len(temp_array) > minsize):
                        list_of_NNPs.append(temp_array)

            counter2 = 0
            counter = counter + 1

        # Keep only candidates that are at least 3 characters long and contain
        # nothing but alphanumeric words (spaces between words are allowed);
        # filtering into a new list avoids mutating the list while iterating.
        list_of_NNPs = [l for l in list_of_NNPs
                        if len(' '.join(l)) >= 3 and ' '.join(l).replace(' ', '').isalnum()]

        unique_NNPs = list(
            list_of_NNPs
            for list_of_NNPs, _ in itertools.groupby(list_of_NNPs))

        # discard punctuation
        unique_NNPs = self.discard_words_after_punct(unique_NNPs)

        unique_NNPs.sort()
        unique_NNPs_final = list(
            unique_NNPs for unique_NNPs, _ in itertools.groupby(unique_NNPs))
        unique_NNPs_final.sort()

        # filter list to keep the maximum-length n-grams
        unique_NNPs_final = self.get_maxlength_ngram(unique_NNPs_final)
        unique_NNPs_final = self.remove_stopwords(unique_NNPs_final)
        unique_NNPs_final.sort()  # sort so any empty n-gram moves to the front
        unique_NNPs_final = list(
            unique_NNPs_final
            for unique_NNPs_final, _ in itertools.groupby(unique_NNPs_final))
        if unique_NNPs_final and not unique_NNPs_final[0]:
            del unique_NNPs_final[0:1]
            #print unique_NNPs_final
        print "Keywords:"
        print unique_NNPs_final

        if len(tokens) > 200:
            for kw in unique_NNPs_final:
                print "kw[0]::" + kw[0]
                indx_NNP = tokens.index(kw[0])
                # check that the whole n-gram actually occurs at this position
                flag = 0
                for i in range(len(kw)):
                    if indx_NNP + i >= len(tokens) or tokens[indx_NNP + i] != kw[i]:
                        flag = 1
                        break
                if flag == 0:
                    # keep only keywords that appear within the first 200 tokens
                    if indx_NNP > 0 and indx_NNP < 200:
                        keywords_by_position.append(kw)
            print "filtered Keywords:"
            print keywords_by_position
            unique_NNPs_final = keywords_by_position
            # drop single-character tokens from each n-gram, then drop any
            # n-gram that is left empty (built as new lists rather than
            # removing items from the lists being iterated)
            unique_NNPs_final = [[w for w in ngram if len(w) > 1]
                                 for ngram in unique_NNPs_final]
            unique_NNPs_final = [ngram for ngram in unique_NNPs_final if ngram]

        return unique_NNPs_final
Example #60
0
 def tagged_sents(self):
     for sent in self.sents():
         yield nltk.pos_tag(nltk.wordpunct_tokenize(sent))