Example #1
	def getDomainUnigram(self, directory = None):		
		collocations = set()  #collocation items
		ewordlists = list() #list of lists of words
		
		#extract words from essays
		if directory is not None:
			doclist = os.listdir(directory)
			for essay in doclist:
				dir_essay  = directory+'/'+essay
				etext = open(dir_essay,'r').read()
				tokens = nltk.wordpunct_tokenize(etext)
				tokens = [word.lower() for word in tokens]
				#stemming
				if self._stemoption ==True:
					st = PorterStemmer()
					tokens = [st.stem(t) for t in tokens]
				
				#extract the collocation for the given essay
				e_bigram = set(Mytext(tokens).collocations())
				collocations = collocations | e_bigram
				ewordlists.append(tokens)
				
		else: # use the mapped essays to calculate the candidate bigrams
			#need to call the mapessay function first
			for ins in self._data:
				if ins['essay'] is not None:
					etext = open(ins['essay'],'r').read()
					tokens = nltk.wordpunct_tokenize(etext)
					tokens = [word.lower() for word in tokens]
					#stemming
					if self._stemoption ==True:
						st = PorterStemmer()
						tokens = [st.stem(t) for t in tokens]
				
					#extract the collocation for the given essay
					e_bigram = set(Mytext(tokens).collocations())
					collocations = collocations | e_bigram
					ewordlists.append(tokens)
		
		#get collection of all essays under the specified directory / associated essays
		collection_text = TextCollection(ewordlists)
		
		itemlist = list()
		for (a, b) in collocations:
			itemlist.append(a)
			itemlist.append(b)
			
		itemlist = list(set(itemlist))	
		
		word_idf = []
		for i in range(len(itemlist)):
			word_idf.append((collection_text.idf(itemlist[i]), itemlist[i]))	
		
		word_idf = sorted(word_idf, key = operator.itemgetter(0))
		ave = 0
		if len(word_idf)!=0:
			ave = sum(map(operator.itemgetter(0), word_idf)) / len(word_idf)
			
		wlist =  [j for (i, j) in word_idf if i<ave]				
		return wlist
Example #2
def search(dictionary_file, postings_file, query_file, output_file):
    """ Entry point to the program """

    stemmer = PorterStemmer()
    with open(dictionary_file, "rb") as dfile:
        dictionary = pickle.loads(dfile.read())

    with open(query_file, "rb") as qfile:
        with open(postings_file, "rb") as pfile:
            for query in qfile:
                print "query: ", query
                prefix = parser.to_polish_notation(query)
                print "prefix: ", prefix
                processed = []
                for token in prefix:
                    if parser.is_operand(token):
                        token = stemmer.stem(token).lower()
                    processed.append(token)

                print "processed: ", processed
                query = parser.process_query(processed)
                print "query: ", query
                result = execute_query(query, dictionary, pfile)

                print result
def tokenStem(words):
    words = words.strip('[').strip(']').lower() #remove brackets and lowercase
    words = re.sub('[(){}<>:,.!?\'"]', '', words)
    stemmer = PorterStemmer()
    stops = stopwords.words('english')
    output = [stemmer.stem(token) for token in wordpunct_tokenize(words) if token not in stops ] #stem words
    return " ".join(output) #merge into strings
Example #4
	def AddTopicUnigram(self, feaName,comName, data = None):	
	#need mapping first
		if data is None:
			data =self._data
			
		for i in range(len(data)):	
			t_bigram = self.getEssayCollocation(data, i)
			
			t_uni = list()
			for (a, b) in t_bigram:
				t_uni.append(a)
				t_uni.append(b)
			t_uni = set(t_uni)
			
			comment = data[i][comName]
			tokens = nltk.wordpunct_tokenize(comment)
			tokens = [word.lower() for word in tokens]
		
			#stemming
			if self._stemoption ==True:
				st = PorterStemmer()
				tokens = [st.stem(t) for t in tokens]
				t_uni  = set([st.stem(t) for t in list(t_uni)])
			shared = [w for w in tokens if w in t_uni]
			#normalized
			data[i][feaName] = float(len(shared))/(len(tokens)+0.00001)
Example #5
def cleanData(doc_list):
  # tokenize
  tokens = []
  for doc in doc_list:
    text_l = []
    ws_split = re.split(split_on, doc)
    for w in ws_split:
      # remove URLs and empty strings
      if not (url_pat.match(w) or w == u''):
        text_l.append(w)
  
    # rejoin text and 'properly' tokenize
    text = " ".join(text_l)
    text_l = nltk.word_tokenize(text)
    
    # stop words 
    text_l = [ w.lower() for w in text_l if w.lower() not in stops]
  
    # stemming
    p_stemmer = PorterStemmer()
    text_l = [p_stemmer.stem(t) for t in text_l]
    
    ## append cleaned text to list
    tokens.append(text_l)
  return tokens
Example #6
def extractFeatures(dataSet):
    vector1, vector2 = list(), list()
    
    stemmer = PorterStemmer()
    # Produces list of all unique word stems in the titles in the dataset
    wordBag = list({stemmer.stem(word) for entry in dataSet for word in entry[2].strip().split(" ") if not word in stopwords.words('english')})


    for entry in dataSet:
        genre, isbn, title, authors = entry[0], entry[1].strip(), entry[2].strip(), entry[3].strip()

        wordList, authorList = [word for word in title.split(" ")], [author.strip() for author in authors.split(";")]
        sortedWords = sorted(wordList, key = lambda x: -1*len(x))
        nonStopWords = [word for word in sortedWords if not word in stopwords.words('english')]
        stemmedWords = [stemmer.stem(word) for word in nonStopWords]

        # Quantitative data about the title
        shortestWord = len(nonStopWords[-1])
        longestWord = len(nonStopWords[0])
        meanWord = sum([len(word) for word in nonStopWords])/len(nonStopWords)
        wordSD = (sum([(len(word)-meanWord)**2 for word in nonStopWords])/len(nonStopWords))**.5

        vector1.append([(len(authorList), len(wordList), longestWord, shortestWord, meanWord, wordSD), genre])
        
        # Creates a vector storing whether a word in a dataset occurred in the title
        occurrences = tuple(1 if word in stemmedWords else 0 for word in wordBag)
        
        vector2.append([occurrences, genre])

    return (vector1,vector2)
def lda(data):
	data = get_only_text(data)
	only_tweet = data
	length = len(only_tweet)
	length = min(20,length)
	for i in xrange(0,length):
		print i
		print only_tweet[i]
	return
	
	tokenizer = RegexpTokenizer(r'\w+')
	en_stop = get_stop_words('en')
	p_stemmer = PorterStemmer()

	length = len(only_tweet)
	length = min(20,length)
	total_texts = []
	for i in xrange(0,length):
		print only_tweet[i]
		print 
		to_lower = only_tweet[i].lower()
		tokens = tokenizer.tokenize(to_lower)
		stopped_tokens = [k for k in tokens if not k in en_stop]
		texts = [p_stemmer.stem(k) for k in stopped_tokens]
		total_texts.append(texts)

	dictionary = corpora.Dictionary(total_texts)
	corpus = [dictionary.doc2bow(text) for text in total_texts]

	ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word = dictionary, passes=20)
	result =  ldamodel.print_topics(num_topics=2, num_words=1)
	for i in result:
		print i
Example #8
	def createLDAModel(texts, n_topics, n_passes):
		"""Generates a LDA model from an array of texts
		"""
		tokenizer = RegexpTokenizer(r'\w+')
		#Create EN stop words list
		en_stop = get_stop_words('en')
		#Create p_stemmer of class PorterStemmer
		p_stemmer = PorterStemmer()

		texts_ = []

		# loop through document list
		for i in texts:
		    
		    # clean and tokenize document string
		    raw = i.lower()
		    tokens = tokenizer.tokenize(raw)
		    
		    # remove stop words from tokens
		    stopped_tokens = [i for i in tokens if not i in en_stop]
		    # stem tokens
		    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
		    # add tokens to list
		    texts_.append(stemmed_tokens)

		# turn our tokenized documents into a id <-> term dictionary
		dictionary = corpora.Dictionary(texts_)

		# convert tokenized documents into a document-term matrix
		corpus = [dictionary.doc2bow(text) for text in texts_]

		# generate LDA model
		ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=n_topics, id2word = dictionary, passes=n_passes)

		return(ldamodel)
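
A minimal usage sketch, assuming createLDAModel is reachable as a plain function and that the imports used above (RegexpTokenizer, get_stop_words, PorterStemmer, gensim and corpora) are available:

docs = [
    "Human machine interface for lab computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
]
model = createLDAModel(docs, n_topics=2, n_passes=10)
print(model.print_topics(num_topics=2, num_words=3))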
Example #9
def main():

    rake=RAKE.Rake('SmartStoplist.txt')
    fp=open(input_file,'r')
    text=fp.read()
    text=text_clean(text)
    """wnl=WordNetLemmatizer()
    text=' '.join([wnl.lemmatize(i.strip()) for i in nltk.word_tokenize(text)])"""
    porter_stemmer=PorterStemmer()
    text=' '.join([porter_stemmer.stem(i.strip()) for i in nltk.word_tokenize(text)])
    keywords=rake.run(text)
   # print keywords

    with open(key_score_file,'wb') as out:
        csv_out=csv.writer(out)
        csv_out.writerow(['KEYWORD','SCORE'])
        for row in keywords:
            if row[1]>0:
                csv_out.writerow(row)


    unibitrigram_list=[]
    unibitrigram_list=generate_unibitrigrams(key_score_file)
    #print unibitrigram_list
    #ngram_freq=[]
    ngram_freq=Counter(unibitrigram_list)
    sorted_ngram_freq=sorted(ngram_freq.items(),key=lambda x:x[1],reverse=True )
    print ngram_freq
    with open('bcom_ngramfr_stem.csv','wb') as nf_csv:
        csv_wr=csv.writer(nf_csv)
        for item in sorted_ngram_freq:
            if ((item[0]!='')):
                csv_wr.writerow(item)
def Tokenize(TextData):
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = list()

    # create English stop words list
    en_stop = get_stop_words('en')

    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()

    # clean and tokenize document string
    raw = TextData.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]

    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    tokens = stemmed_tokens

    TOKENIZEDTEXT_FILE = path.join(os.pardir, "Resources/TokenizedTextFiles/Personal-Narration/Unbroken - Motivational Video.txt")
    fp = open(TOKENIZEDTEXT_FILE, "w")
    print(TOKENIZEDTEXT_FILE)
    # pickle.dump(tokens, fp)
    fp.write(str(tokens))
    fp.close()
Example #11
def get_stemmed_separate(indeed_reviews_db, glassdoor_reviews_db):
    separate = get_separate_reviews(indeed_reviews_db, glassdoor_reviews_db)
    stemmer = PorterStemmer()
    stemmed_reviews = []
    for review in separate:
        stemmed_reviews.append(' '.join([stemmer.stem(word) for sent in sent_tokenize(review) for word in word_tokenize(sent.lower())]))
    return stemmed_reviews
Example #12
	def create_bag_of_words(self):
		"""Create a BagOfWords for the document. Performs named entity recognition, stemming and stopword removal. """
		stemmer = PorterStemmer()
		nes = []
		tagged_text = self.ner_tagger.get_entities(self.content.encode('utf-8'))
		for key in tagged_text.keys():
			if key != 'O':
				nes += tagged_text[key]
		for n in nes:
			self.bag_of_words.add_stem_word(n, n)
			Document.vocabulary.add_stem_word(n, n)

		wo_named = re.sub('|'.join(nes), '', self.content)

		words = re.findall(r'\w+', wo_named,flags = re.UNICODE | re.LOCALE) 
		for wordo in words:
			word = wordo.rstrip(r'\n')
			if word.lower() not in stopwords:
				w = stemmer.stem(word.lower())
				self.bag_of_words.add_stem_word(w, word)
				Document.vocabulary.add_stem_word(w, word)

		for word in self.bag_of_words.get_all_words():
			if word in Document.document_word_frequency:
				Document.document_word_frequency[word] += 1
			else:
				Document.document_word_frequency[word] = 1
Example #13
def evaluate(query):
	global DICTIONARY
	word_score = {}
	seek_pos = open(postings_file, 'r')
	seek_pos.seek(0,0)
	words = query.split()
	stemmer = PorterStemmer()
	words = [element.lower() for element in words]
	for item in words:
		word = stemmer.stem(item)
		if word not in word_score:	
			if word in DICTIONARY:
				seek_pointer = DICTIONARY[word]
				seek_pos.seek(int(seek_pointer))
				line = seek_pos.readline()
				seek_pos.seek(0,0)
				post_list = line.split()
				score = score_documents(post_list)
				word_score[word] = score
			else:
				#not encountered, score of 0
				word_score[word] = []
		#else duplicate, skip word
	result = score_query(word_score)
	return result
def stemText(s):
	ps = PorterStemmer()
	stemmedText = []
	for word in s:
		stemmedText.append(ps.stem(word))
		
	return stemmedText
Example #15
File: LoadData.py  Project: suket22/CS246
    def parse_questions(self):
        stemmer = PorterStemmer()
        tokenizer = RegexpTokenizer(r'\w+')
        for questions_key in self.rawSamples:
            # Stem the Question Text
            question_text = self.rawSamples[questions_key][0]
            words_array = tokenizer.tokenize(question_text)
            question_text = ""
            for word in words_array:
                if word.isnumeric():
                    continue
                if word not in text.ENGLISH_STOP_WORDS:
                    word = stemmer.stem(word)
                question_text += (word + " ")
            self.rawSamples[questions_key][0] = question_text

            # Stem the topic names
            topics_text = self.rawSamples[questions_key][2]
            words_array = tokenizer.tokenize(topics_text)
            topics_text = ""
            for word in words_array:
                if word.isnumeric():
                    continue
                if word not in text.ENGLISH_STOP_WORDS:
                    word = stemmer.stem(word)
                topics_text += (word + " ")
            self.rawSamples[questions_key][2] = topics_text
Example #16
def clean_split_stem(rawstring):
    stop = stopwords.words('english')
    out_str = rawstring.split()
    porter = PorterStemmer()
    out_str = [word for word in out_str if word not in stop]
    out_str = [porter.stem(word) for word in out_str]
    return out_str
def extract_entities(doc):
    print 'extracting entities from %s...' % doc.getFilename()
    nps = list(set([re.sub(' \.', '', re.sub(' -[A-Z]{3}-', '', np).lower()) for np in doc.getAllNodesOfType('NP')]))
    p = PorterStemmer()
    entities = []
    for np in nps:
        try:
            response = json.loads(requests.get(host+'select', params={'q': 'wam:[50 TO 100] AND iscontent:true AND lang:en AND (title_en:"%s" OR redirect_titles_mv_en:"%s")' % (np, np), 'fl': 'title_en,redirect_titles_mv_en', 'wt': 'json'}).content)
        except requests.exceptions.ConnectionError:
            while True:
                time.sleep(15)
                print 'retrying connection...'
                try:
                    response = json.loads(requests.get(host+'select', params={'q': 'wam:[50 TO 100] AND iscontent:true AND lang:en AND (title_en:"%s" OR redirect_titles_mv_en:"%s")' % (np, np), 'fl': 'title_en,redirect_titles_mv_en', 'wt': 'json'}).content)
                    break
                except requests.exceptions.ConnectionError:
                    continue
        docs = response[u'response'][u'docs']
        if len(docs) > 0:
            titles = [docs[0][u'title_en']] + docs[0].get(u'redirect_titles_mv_en', [])
        else:
            titles = []
        if len(titles) > 0:
            titles = [' '.join([p.stem(w.lower()) for w in t.split(' ')]) for t in titles]
        stem_np = ' '.join([p.stem(w) for w in np.split(' ')])
        for title in titles:
            if stem_np == title:
                entities.append(np)
                print np
                break
    #print doc.getFilename(), entities
    return (doc.getFilename(), entities)
Example #18
def preprocess_text(raw):
    lower_raw = raw.lower()
    tokens = nltk.word_tokenize(lower_raw)
    filtered_tokens = [word for word in tokens if word not in stopwords.words('english')]
    port = PorterStemmer() #This extracts the important root of a word. eg. parsing -> pars
    stemmed = [port.stem(item) for item in filtered_tokens]
    return stemmed
Example #19
def tokenize(docs, norm, stop, ne, central_per=None, central_loc=None, central_org=None):

    if stop:
        with open("stopwords.txt", "r") as f:
            sw = set([word.strip().decode("utf-8").lower() for word in f])

    if norm == "stem":
        from nltk.stem.porter import PorterStemmer
        stemmer = PorterStemmer()

    all_toks = []
    for doc in docs:
        toks = []
        for sent in doc:
                if norm == "lemma":
                    stoks = [unicode(tok.lem).lower() for tok in sent]
                elif norm == "stem":
                    stoks = [stemmer.stem(unicode(tok).lower())
                             for tok in sent]
                else:
                    stoks = [unicode(tok).lower() for tok in sent]
                if stop:
                    toks.extend([tok for tok in stoks if tok not in sw])
                else:
                    toks.extend(stoks)
        toks = [tok for tok in toks if len(tok) < 50]
        #if len(toks) == 0: continue
        string = u" ".join(toks).encode("utf-8")
        #print string
        all_toks.append(string)
    return all_toks
def PreProcessing(line):
    unigrams = line.split()
    word_list = [word.lower() for word in unigrams if word.lower() not in stopwords]
    st = PorterStemmer()
    word_list = [st.stem(word) for word in word_list if word]
    vocab = [word for word in word_list if word not in stopwords]
    return vocab
Example #21
def text_process(text):
    '''
    Takes in a string of text, then performs the following
    1. Tokenizes and removes punctuation
    2. Removes stopwords
    3. Stems
    4. Returns the cleaned tokens re-joined into a single string
    '''
    if(pd.isnull(text)):
        return []
    
    # Tokenize 
    tokenizer = RegexpTokenizer(r'\w+')
    text_processed = tokenizer.tokenize(text)
    
    # Removing any stopwords
    text_processed = [word.lower() for word in text_processed if word.lower() not in stopwords.words('english')]
    
    # Stemming
    porterStemmer = PorterStemmer()
    
    text_processed = [porterStemmer.stem(word) for word in text_processed]
    
    try:
        text_processed.remove('b')
        
    except:
        pass
    
    return " ".join(text_processed)
def pre_processing(resume):
    unigrams = resume.split()
    word_list = [word.lower() for word in unigrams if word.lower() not in stopwords]
    st = PorterStemmer()
    word_list = [st.stem(word) for word in word_list if word]
    vocab = [word for word in word_list if word not in stopwords]
    return vocab
def issue_analysis(df):
    df_sub = df[['Issue']]
    df_sub.insert(0, 'count', 1)

    Issue_List=[]
    for i in range(0,50):
        Issue_List.append(df_sub.groupby(['Issue']).sum().sort_values(by='count', ascending=False).iloc[i].name)

    tokenizer = RegexpTokenizer(r'[A-Za-z0-9\']+')    # set tokenize Reg
    en_stop = get_stop_words('en')         # create English stop words list
    p_stemmer = PorterStemmer()            # Create p_stemmer of class PorterStemmer
    texts = []                             # list for tokenized documents in loop
    text_view = ''
                                                                
    # loop through document list
    for i in Issue_List:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
       
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop]
        
        # stem tokens and add them to list
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        texts.append(stemmed_tokens)

        #print ' '.join(stemmed_tokens)
        text_view += ' '.join(stemmed_tokens)
        text_view += ' '

    wordcloud = WordCloud().generate(text_view)
    fig = plt.figure(figsize=(8,6))
    fig1 = fig.add_subplot(1,1,1)
    fig1.set_title("Top issued words", fontdict={'fontsize':25})
    fig1.imshow(wordcloud)
    fig1.axis("off")
    #plt.savefig('ComplainCount_WC.png')
    plt.savefig('ComplainCount_WC_2016.png')
    
    # turn our tokenized documents into a id <-> term dictionary
    dictionary = corpora.Dictionary(texts)

    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]

    # generate LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=25, id2word = dictionary)
    LDAText =  ldamodel.print_topics(num_topics=5, num_words=3)
    #print "\n Topic analysis result for top 25 issues with LDA"
    #print(LDAText)
       
    vis_data = gensimvis.prepare(ldamodel, corpus, dictionary)
    #pyLDAvis.show(vis_data)
    #pyLDAvis.save_html(vis_data, "issue_lda.html")
    #pyLDAvis.save_json(vis_data, "issue_lda.json")
    pyLDAvis.save_html(vis_data, "issue_lda_2016.html")
    pyLDAvis.save_json(vis_data, "issue_lda_2016.json")

    return 0
Example #24
def processing(raw_review):
    word1=[]    
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review).get_text() 
    # 2. Remove Punctuations        
    letters_only = remove_punctuations(review_text) 
    # 3. Convert to lower case, split into individual words
    for words in letters_only:
        wordset=[word.lower() for word in words]
        word1.append(wordset)                       
    #4. Handling double negation
    negated_words=negation_handling(word1)
    #5 Read only verbs,adjectives,adverbs,interjections (descriptive words)  
    meaningful_words=descriptive_words(negated_words)           
    #6 Remove Time, Location, Organization, Person, Money, Percent, Date using NER   
    #removed_words=remove_names(meaningful_words)    
    #7. Remove stop words    
    stops =open(r'C:\Users\PSarka\Desktop\sentimentanalysis\stopwords.txt','r')   
    stops= set([word[:-1] for word in stops])  
    meaningful_words_new = [w for w in meaningful_words if not w in stops]    
    #8. Stemming using Porter Stemmer; lemmatization could also be used (check which is more efficient)
    st=PorterStemmer()
    stemmed_words=[st.stem(words) for words in meaningful_words_new]   
    #9. Join the words back into one string separated by space,
    # and return the result.
    print stemmed_words    
    return( " ".join(stemmed_words ))   
Example #25
def tweet_stemming(tweet, token_freqs):

    """
    Stems tweet words and counts diversity
    
    :param tweet: the tweet to analyze
    :type tweet: str or unicode

    :param token_freqs: counter of words frequency
    :type token_freqs: Counter

    :returns: words added to token_freqs
    :rtype: int
    """
    
    pattern_url = '((https?:\/\/)|www\.)([\da-z\.-]+)\.([\/\w \.-]*)( |$)'
    regex_punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    porter = PorterStemmer()

    counter_tokens = 0
    tweet_url_removed = re.sub(pattern_url, '', tweet, flags=re.MULTILINE)  # remove URL
    tweet_url_removed_tokenized = word_tokenize(tweet_url_removed)  # tokenize tweet
    tweet_url_removed_tokenized_cleaned_stemming = []  # cleaned of URLs and hashs, and stemming

    for token in tweet_url_removed_tokenized:
        new_token = regex_punctuation.sub(u'', token)  # remove punctuation and hash
        if not new_token == u'':
            new_token_stemming = porter.stem(new_token)
            tweet_url_removed_tokenized_cleaned_stemming.append(new_token_stemming)
            token_freqs[new_token_stemming] += 1
            counter_tokens += 1
    
    return counter_tokens
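
A minimal usage sketch, assuming re, string, word_tokenize and PorterStemmer are imported as the function requires and NLTK's punkt data is available:

from collections import Counter

freqs = Counter()
added = tweet_stemming("Loving the new #NLP tools https://example.com", freqs)
print(added, freqs)
# -> roughly 5 and Counter({'love': 1, 'the': 1, 'new': 1, 'nlp': 1, 'tool': 1}); the URL and '#' are dropped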
Example #26
File: Gibbs.py  Project: wylswz/FYPLinux
def query(new_doc,doc_topic,topic_word,dictionary,LSH,num_topic):
    tokens = []
    token = get_tokens(new_doc)
    stopped_tokens = [i for i in token if not i in en_stop]
    p_stemmer = PorterStemmer()
    stemed_tokens = []
    for i in stopped_tokens:
        try:
            temp_token = str(p_stemmer.stem(i))
            stemed_tokens.append(temp_token)
        except IndexError:
            pass
    tokens = stemed_tokens
    new_corpus=dictionary.doc2bow(tokens)
    new_corpus = to_gibbs_corpus([new_corpus])[0] ##convert 
    new_topic_vector = np.zeros(num_topic)
    
    for t in new_corpus:
        mult_par = topic_word[:,t[0]] + 1
        mult_par = mult_par/np.sum(mult_par)
        new_topic_vector += np.random.multinomial(t[1],mult_par)
        #print mult_par
        #print topic_word[:,t[0]]
    
    new_topic_vector = new_topic_vector/np.sum(new_topic_vector)
    dist,indices=LSH.kneighbors(new_topic_vector,n_neighbors=20)
    print indices+1
Example #27
File: wikipedia.py  Project: slee17/NLP
def compare_english_simple(article_title):
    """Given a title of an article, returns the number of tokens, types, and stems
    in both the English version and the simple English version."""
    english = extract_wikipedia_page(article_title, "en")
    simple = extract_wikipedia_page(article_title, "simple")
    num_tokens_english = len(english)
    num_tokens_simple = len(simple)
    types_english = count_words(get_words(english))
    types_simple = count_words(get_words(simple))
    
    porter_stemmer = PorterStemmer()
    
    stem_english = defaultdict(int)
    stem_simple = defaultdict(int)
    for key in types_english.keys():
        stem_english[porter_stemmer.stem(key)] += 1
    for key in types_simple.keys():
        stem_simple[porter_stemmer.stem(key)] += 1
    
    print ("Number of Tokens in English " + article_title + ": %d" % num_tokens_english)
    print ("Number of Tokens in Simple English " + article_title + ": %d" % num_tokens_simple)
    print ("Number of Types in English " + article_title + ": %d" % len(types_english))
    print ("Number of Types in Simple English " + article_title + ": %d" % len(types_simple))
    print ("Number of Stems in English " + article_title + ": %d" % len(stem_english))
    print ("Number of Stems in Simple English " + article_title + ": %d" % len(stem_simple))
Example #28
 def destem(self, stemmed_term, corpus):
     '''
     Given a stemmed term, we look through the text of every document
     in corpus, determine the most common "parent" version of the 
     given stemmed term, and return it. 
     '''
     destemmed_term = ""
     min_num_terms = 5000
     min_percentage = 0.20
     candidates = {}
     stemmer = PorterStemmer()
     num_terms_checked = 0
     num_docs_checked = 0
     total_matches = 0
     
     for doc in corpus:
         # matches is the list of all term in the current text that are
         # "ancestor" versions of the stemmed term.
         matches = ([term for term in doc.split_text 
                     if stemmer.stem(term) == stemmed_term])
         num_terms_checked += len(doc.split_text)
         num_docs_checked += 1
         total_matches += len(matches)
         if not matches:
             continue
         # we keep a tally of the number of times each "ancestor"
         # appears in our text
         for match in matches:
             if match in candidates:
                 candidates[match] += 1
             else:
                 candidates[match] = 1
         # sort potential destemmed versions in descending order
         # by frequency
         sorted_candidates = sorted(candidates.keys(), 
                                    key=lambda 
                                    term: candidates[term], 
                                    reverse=True)
         if num_docs_checked == self.num_corpus_docs: 
             # we've run through every doc, so the most frequent 
             # ancestor of the stemmed term is the best destemmed 
             # result.
             destemmed_term = sorted_candidates[0]
             break
         # if we've reviewed enough total words, we can start trying
         # to find a suitable destemmed term from what we have so far 
         if min_num_terms <= num_terms_checked:
             # this is the most frequent ancestor of the stemmed term
             possible_match = sorted_candidates[0]
             test_percentage = candidates[possible_match] \
                                 / float(total_matches)
             # if the potential destemmed version accounts for a 
             # sufficient percentage of the total matches, we can
             # decide that it's a suitable destemmed result.
             if min_percentage <= test_percentage:
                 destemmed_term = possible_match
                 break
             
     print("Destemmed: {0} --> {1}".format(stemmed_term, destemmed_term))
     return destemmed_term
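
The same destemming idea can be sketched as a standalone function, assuming the corpus is just a list of token lists (the names below are illustrative, not from the source):

from collections import Counter
from nltk.stem.porter import PorterStemmer

def destem_simple(stemmed_term, token_lists):
    stemmer = PorterStemmer()
    counts = Counter(tok for toks in token_lists for tok in toks
                     if stemmer.stem(tok) == stemmed_term)
    # most frequent surface form that stems to the given term, or '' if none found
    return counts.most_common(1)[0][0] if counts else ""

print(destem_simple("run", [["running", "runs", "running"], ["runner", "running"]]))
# -> 'running'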
Example #29
def read_class_data(path, label=None):
    '''
    Label may come from the data itself, may be assigned at run time
    '''
    if os.path.exists(path):
        if os.path.isdir(path):
            paths = [os.path.join(path, f) for f in os.listdir(path)]
        else:
            paths = [path]
    else:
        print 'Given path does not exist.'
        return
    
    doc = doc_file()
    stemmer = PorterStemmer()
    instances = []
    for p in paths:
        doc.path = p
        for raw_record in doc:
            record = unpack(raw_record, ',')
            text = record[3].strip('"')
            inst = {'tokens': [], 'label': ''}
            for t in wordpunct_tokenize(text):
                stem_t = stemmer.stem(t.lower())
                if stem_t[0].islower():
                    inst['tokens'].append(stem_t)
                else:
                    continue
            inst['label'] = label
            instances.append(inst)
    return instances
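
A minimal usage sketch, assuming the project's doc_file and unpack helpers are importable and the CSV's fourth comma-separated field holds the text (the path and label below are illustrative):

train = read_class_data('data/tweets.csv', label='positive')
print(train[0]['tokens'][:10], train[0]['label'])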
Example #30
def processEmail(email_contents):
    vocabList = getVocabList()
    word_indices = []
    # Preprocss Email
    email_contents = email_contents.lower()
    email_contents = re.sub('<[^<>]+>', ' ', email_contents)
    email_contents = re.sub('[0-9]+', 'number', email_contents)
    email_contents = re.sub('(http|https)://[^\s]*', 'httpaddr', email_contents)
    email_contents = re.sub('[^\s]+@[^\s]+', 'emailaddr', email_contents)
    email_contents = re.sub('[$]+', 'dollar', email_contents)
    print('==== Processed Email ====')
    
    pattern = '[\s' + re.escape("@$/#.-:&*+=[]?!(){},'\">_<;%") + ']'
    all_words = re.split(pattern, email_contents)
    all_words = [x for x in all_words if x != '']

    stemmer = PorterStemmer()

    for w in all_words:
        w = re.sub('[^a-zA-Z0-9]', '', w)
        w = w.strip()
        w = stemmer.stem(w)
        # ============= YOUR CODE HERE =============
        # Instructions: Fill in this function to add the index of str to
        #               word_indices if it is in the vocabulary.
        try:
            idx = vocabList.index(w)
        except ValueError:
            idx = -1
        if idx != -1:
            word_indices.append(idx)
        # ===========================================
    return word_indices
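
A minimal usage sketch, assuming getVocabList() loads the exercise's vocabulary list (e.g. from vocab.txt) and the re/PorterStemmer imports above are in place:

sample = "Visit http://example.com and send $100 to winner@prize.net before 2030!"
print(processEmail(sample))
# -> the indices of the stemmed words that appear in the vocabulary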
Example #31
    # delete punctuation
    for c in string.punctuation:
        msg = re.sub(r'\{}'.format(c), '', msg)

    # delete separator i.e. \n \t
    msg = ' '.join(msg.split())

    return msg


# In[15]:

nltk.download('words')  #pull thai word(Bags)
th_stop = tuple(thai_stopwords())
en_stop = tuple(get_stop_words('en'))
p_stemmer = PorterStemmer()

# In[16]:


def split_word(text):

    tokens = word_tokenize(text, engine='newmm')

    # Remove stop words
    tokens = [i for i in tokens if not i in th_stop and not i in en_stop]

    # Find the word roots (stems) for Thai and English
    # English
    tokens = [p_stemmer.stem(i) for i in tokens]
Example #32
def main():
    # storm = 'private'
    # data = read_data(storm)
    # print("Length of Data: {length}".format(length=len(data)))''
    num_topics = 3
    tokenizer = RegexpTokenizer(r'[a-z0-9\']+')
    p_stemmer = PorterStemmer()

    print(
        "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Running Sandy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    )
    data_sandy = read_data("sandy")
    try:
        with open("storm_extracts/dict_sandy", 'rb') as f:
            dict_sandy = pickle.load(f)
        with open("storm_extracts/counts_sandy", 'rb') as f:
            counts_sandy = pickle.load(f)
    except:
        dict_sandy, _, counts_sandy = parse_text(data_sandy, "sandy",
                                                 tokenizer, en_stop, p_stemmer)
        print("Length of Data: {length}".format(length=len(data_sandy)))
        with open("storm_extracts/dict_sandy", "wb") as fp:  #Pickling
            pickle.dump(dict_sandy, fp)
        with open("storm_extracts/counts_sandy", "wb") as fp:  #Pickling
            pickle.dump(counts_sandy, fp)
    lda_sandy = run_model(data_sandy, "sandy", num_topics)

    print(
        "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Running Harvey ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    )
    data_harvey = read_data("harvey")
    try:
        with open("storm_extracts/dict_storm", 'rb') as f:
            dict_harvey = pickle.load(f)
        with open("storm_extracts/counts_storm", 'rb') as f:
            counts_harvey = pickle.load(f)
    except:
        dict_harvey, _, counts_harvey = parse_text(data_harvey, "harvey",
                                                   tokenizer, en_stop, p_stemmer)
        print("Length of Data: {length}".format(length=len(data_harvey)))
        with open("storm_extracts/dict_storm", "wb") as fp:  #Pickling
            pickle.dump(dict_harvey, fp)
        with open("storm_extracts/counts_storm", "wb") as fp:  #Pickling
            pickle.dump(counts_harvey, fp)
    lda_harvey = run_model(data_harvey, "harvey", num_topics)

    print(
        "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Running Florence ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    )
    data_Florence = read_data("Florence")
    dict_Florence, _, counts_Florence = parse_text(data_Florence, "Florence",
                                                   tokenizer, en_stop,
                                                   p_stemmer)
    lda_florence = run_model(data_Florence, "Florence", num_topics)

    print(
        "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Running Lane ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    )
    data_Lane = read_data("Lane")
    dict_Lane, _, counts_Lane = parse_text(data_Lane, "Lane", tokenizer,
                                           en_stop, p_stemmer)
    lda_lane = run_model(data_Lane, "Lane", num_topics)

    print(
        "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Running Michael ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    )
    data_Michael = read_data("Michael")
    dict_Michael, _, counts_Michael = parse_text(data_Michael, "Michael",
                                                 tokenizer, en_stop, p_stemmer)
    lda_michael = run_model(data_Michael, "Michael", num_topics)

    print(
        "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Running Bonnie ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    )
    data_bonnie = read_data("bonnie")
    dict_bonnie, _, counts_bonnie = parse_text(data_bonnie, "bonnie",
                                               tokenizer, en_stop, p_stemmer)
    lda_bonnie = run_model(data_bonnie, "bonnie", num_topics)

    print(
        "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Running private ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    )
    data_private = read_data("private")
    dict_private, _, counts_private = parse_text(data_private, "private",
                                                 tokenizer, en_stop, p_stemmer)
    lda_private = run_model(data_private, "private", num_topics)

    print(
        "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Running noise ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    )
    data_noise = read_data("noise")
    try:
        with open("storm_extracts/dict_noise", 'rb') as f:
            dict_noise = pickle.load(f)
        with open("storm_extracts/counts_noise", 'rb') as f:
            counts_noise = pickle.load(f)
    except:
        dict_noise, _, counts_noise = parse_text(data_noise, "noise",
                                                 tokenizer, en_stop, p_stemmer)
        print("Length of Data: {length}".format(length=len(data_noise)))
        with open("storm_extracts/dict_noise", "wb") as fp:  #Pickling
            pickle.dump(dict_noise, fp)
        with open("storm_extracts/counts_noise", "wb") as fp:  #Pickling
            pickle.dump(counts_noise, fp)
    lda_noise = run_model(data_noise, "noise", num_topics)

    models = [
        lda_sandy, lda_harvey, lda_florence, lda_lane, lda_michael, lda_bonnie,
        lda_private, lda_noise
    ]
    model_names = [
        "lda_sandy", "lda_harvey", "lda_florence", "lda_lane", "lda_michael",
        "lda_bonnie", "lda_private", "lda_noise"
    ]

    print("Printing sorted scores per model...")
    scores = []
    for i in range(len(model_names)):
        print("COMPARING DATASETS TO: {a}".format(a=model_names[i]))
        per_model = []
        for j in range(len(model_names)):
            if i == j:
                continue
            # print("Comparing {a} with {b}".format(a=model_names[i], b=model_names[j]))
            dist = compare_models(models[i], models[j])
            scores.append((i, j, dist))
            per_model.append((i, j, dist))

        per_model = sorted(per_model, key=operator.itemgetter(2))
        for i, j, dist in per_model:
            print("Comparing {a} with {b}".format(a=model_names[i],
                                                  b=model_names[j]))
            print("Hellinger distance:", dist)
        print("\n")

    print("Printing total sorted scores...")
    scores = sorted(scores, key=operator.itemgetter(2))
    for i, j, dist in scores:
        print("Comparing {a} with {b}".format(a=model_names[i],
                                              b=model_names[j]))
        print("Hellinger distance:", dist)
Example #33
import nltk, math
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from math import log10, sqrt
from collections import Counter
import os
import io
stemmer = PorterStemmer()
tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
corpusroot = './presidential_debates'  #My subdirectory name
vectors = {}  #tf-idf vectors for all documents
df = Counter()  #storage for document frequency
tfs = {}  #permanent storage for tfs of all tokens in all documents
lengths = Counter()  #used for calculating lengths of documents
postings_list = {}  #posting list storage for each token in the corpus
st_tokens = []
for filename in os.listdir(corpusroot):
    file = io.open(os.path.join(corpusroot, filename), "r", encoding='UTF-8')
    doc = file.read()
    file.close()
    doc = doc.lower()  #given code for reading files and converting the case
    tokens = tokenizer.tokenize(doc)  #tokenizing each document
    sw = stopwords.words('english')
    tokens = [stemmer.stem(token) for token in tokens
              if token not in sw]  #removing stopwords and performing stemming
    tf = Counter(tokens)
    df += Counter(list(set(tokens)))
    tfs[filename] = tf.copy()  #making a copy of tf into tfs for that filename
    tf.clear()  #clearing tf so that the next document will have an empty tf
Example #34
               see ourselves as a developed nation, self-reliant and self-assured. Isn’t this incorrect?
               I have a third vision. India must stand up to the world. Because I believe that unless India 
               stands up to the world, no one will respect us. Only strength respects strength. We must be 
               strong not only as a military power but also as an economic power. Both must go hand-in-hand. 
               My good fortune was to have worked with three great minds. Dr. Vikram Sarabhai of the Dept. of 
               space, Professor Satish Dhawan, who succeeded him and Dr. Brahm Prakash, father of nuclear material.
               I was lucky to have worked with all three of them closely and consider this the great opportunity of my life. 
               I see four milestones in my career"""

# Cleaning the texts
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

ps = PorterStemmer()
wordnet = WordNetLemmatizer()
sentences = nltk.sent_tokenize(paragraph)
corpus = []
for i in range(len(sentences)):
    review = re.sub('[^a-zA-Z]', ' ', sentences[i])
    review = review.lower()
    review = review.split()
    review = [
        wordnet.lemmatize(word) for word in review
        if not word in set(stopwords.words('english'))
    ]
    review = ' '.join(review)
    corpus.append(review)

# Creating the TF-IDF model
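
The snippet is cut off at this comment; a minimal sketch of the announced step, assuming scikit-learn is available, might look like:

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=1500)
X = tfidf.fit_transform(corpus).toarray()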
Example #35
#Importing the Dataset
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t', quoting=3)

# Cleaning the texts
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [
        ps.stem(word) for word in review
        if not word in set(stopwords.words('english'))
    ]
    review = ' '.join(review)
    corpus.append(review)
''' To clean HTML tags
import re
 
def cleanhtml(raw_html):
  cleanr = re.compile('<.*?>')
  cleantext = re.sub(cleanr, '', raw_html)
  return cleantext
'''
 def __init__(self):
     self.model_path = "./model.joblib"
     self.stemmer = PorterStemmer()
     self.clf, self.cv = self.trainer()
class EmoClf:
    def __init__(self):
        self.model_path = "./model.joblib"
        self.stemmer = PorterStemmer()
        self.clf, self.cv = self.trainer()

    def trainer(self):
        # import pdb; pdb.set_trace()
        if not os.path.exists(self.model_path):
            reviews_train = self.data_loader()
            reviews_train_clean = self.clean(reviews_train)
            stemmed_reviews_list = self.get_stemmed_text(reviews_train_clean)
            clf, cv = self.stemmed_review(stemmed_reviews_list)
            self.feat_to_coeff(clf, cv)
            joblib.dump(clf, self.model_path)
            with open('./storage.bin', 'wb') as f:
                cPickle.dump(cv, f)
        else:
            clf = joblib.load(self.model_path)
            with open('storage.bin', 'rb') as f:
                cv = cPickle.load(f)

        return clf, cv

    def tester(self, query):

        reviews_train_clean = self.clean([query])
        stemmed_reviews_list = self.get_stemmed_text(reviews_train_clean)
        vectorized_test = self.cv.transform(stemmed_reviews_list)
        pred_proba = self.clf.predict_proba(vectorized_test)
        # print (pred_proba)
        emotion_dct = {"positive": pred_proba, "negative": 1 - pred_proba}
        return emotion_dct

    def data_loader(self):
        reviews_train = []

        with open('./data/training/full_train.txt', 'r') as fp:
            reviews_train = [
                each_line.strip() for each_line in fp.readlines()[:]
            ]

        return reviews_train

    def preprocess_reviews(self, reviews):
        REPLACE_NO_SPACE = re.compile(
            r"(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)"
        )
        REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")
        NO_SPACE = ""
        SPACE = " "
        reviews = [
            REPLACE_NO_SPACE.sub(NO_SPACE, line.lower()) for line in reviews
        ]
        reviews = [REPLACE_WITH_SPACE.sub(SPACE, line) for line in reviews]

        return reviews

    def clean(self, reviews_list):
        reviews_list_clean = self.preprocess_reviews(reviews_list)
        return reviews_list_clean

    def get_stemmed_text(self, corpus):

        return [
            ' '.join([self.stemmer.stem(word) for word in review.split()])
            for review in corpus
        ]

    def stemmed_review(self, stemmed_reviews_list):
        #reviews_train

        cv = CountVectorizer(binary=True)
        cv.fit(stemmed_reviews_list)
        X = cv.transform(stemmed_reviews_list)
        target = [1 if i < 12500 else 0 for i in range(25000)]

        X_train, X_val, y_train, y_val = train_test_split(X,
                                                          target,
                                                          train_size=0.75)

        clf = LogisticRegression(C=0.05)
        clf.fit(X, target)
        # print ("Final Accuracy: %s"
        #     % accuracy_score(target, clf.predict(X_test)))
        return clf, cv

    def feat_to_coeff(self, clf, vectorized):

        feature_to_coef = {
            word: coef
            for word, coef in zip(vectorized.get_feature_names(), clf.coef_[0])
        }

        for best_positive in sorted(feature_to_coef.items(),
                                    key=lambda x: x[1],
                                    reverse=True)[:30]:
            print(best_positive)

        print("\n\n")
        for best_negative in sorted(feature_to_coef.items(),
                                    key=lambda x: x[1])[:30]:
            print(best_negative)
Example #38
 def processQueryToDoStemming(self, words):
     stems = []
     stemmer = PorterStemmer()
     for word in words:
         stems.append(stemmer.stem(word))
     return stems
import matplotlib.pyplot as plt
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re

df=pd.read_csv("train.csv",encoding="ISO-8859-1")

df.isnull().sum()

y=df["Sentiment"]

message=df["SentimentText"]

ps=PorterStemmer()

#--------------------------------------removing @username from tweet----------------------------------
message=message.str.replace("@[\w]*","")

#------------------------------------------REMOVING HYPERLINK FROM TWEET-----------------------------
message = message.str.replace('https?:\/\/\S+', '')

#----------------------------------------------removing RT from tweet--------------------------------
message=message.str.replace('RT[\s]+', '')


#---------------------------------------removing unwanted symbols and stopwords-----------------------
corpus=[]
for i in range(len(message)):
    review=re.sub("[^a-zA-Z]"," ",message[i])
import nltk
import string
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

#from gensim.summarization import keywords
#from textblob import TextBlob

import matplotlib.pyplot as plt

state = 'Georgia'
#n6download_shell()
with open('tweets_{}.json'.format(state), 'r') as f:
    tweets = json.load(f)

ps = PorterStemmer()
#wnl = WordNetLemmatizer()
stem_tweet = []
stopwords = nltk.corpus.stopwords.words('english')

p = string.punctuation
d = string.digits
table_p = string.maketrans(p, len(p) * " ")
table_d = string.maketrans(d, len(d) * " ")

wordcloud_tweet = []
txt1 = ''
txt2 = ''
for twt in tweets:
    tx = unicodedata.normalize('NFKD', twt).encode('ascii', 'ignore')
    txt1 = tx.translate(table_p)
Example #41
import dataset
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from stop_words import get_stop_words

db = dataset.connect('sqlite:///news.db')

articles = []

tokenizer = RegexpTokenizer(r'\w+')
stop_words = get_stop_words('en')
p_stemmer = PorterStemmer()

for article in db['articles'].all():
    text = article['title'].lower().strip()
    text += " " + article['textContent'].lower().strip()
    if not text:
        continue
    # Tokenize
    tokens = tokenizer.tokenize(text)
    # Remove stop words and small words
    clean_tokens = [i for i in tokens if not i in stop_words]
    clean_tokens = [i for i in clean_tokens if len(i) > 2]
    # Stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in clean_tokens]
    # Add to list
    articles.append((article['title'], stemmed_tokens))

print(articles[0])

from gensim import corpora
dataset.hist(column='length', by='feedback', bins=50,figsize=(10,4))

#importing the dataset again
data=pd.read_csv('data.tsv', delimiter = '\t', quoting = 3)

#cleaning the texts and stemming the texts
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus=[]
for i in range(0,3150):
    review = re.sub('[^a-zA-Z]', ' ', data['verified_reviews'][i] )
    review=review.lower()
    review=review.split()
    ps=PorterStemmer()
    review=[ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review=' '.join(review)
    corpus.append(review)
    
# creating the Bag of words Model
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=1500)
X=cv.fit_transform(corpus).toarray()
y=data.iloc[:,4].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

# Feature Scaling
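
The snippet is cut off at this comment; a minimal sketch of the announced step, assuming scikit-learn's StandardScaler, might look like:

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)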
file.close()

df4['abstract']=df4['abstract'].apply(lambda x: " ".join(x for x in str(x).split() if not x.isdigit() and not x.isspace()))
df4['abstract']=df4['abstract'].str.replace('[^\w\s,]','')
#df4['abstract']=df4['abstract'].str.lower()


# Topic modeling with LDA and Gensim
tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = get_stop_words('en')
stop_plus = ['word', 'count', 'text', 'all', 'right', 'no', 'without', 'abstract', 'no', 'reuse', 'without', 'abstract', 'nan']

# Create PorterStemmer
p_stemmer = PorterStemmer()
# create list of documents
abstract_set = []
for abstract in df1['abstract'].dropna():
    abstract_set.append(abstract)

for abstract in df2['abstract'].dropna():
    abstract_set.append(abstract)

for abstract in df3['abstract'].dropna():
    abstract_set.append(abstract)

for abstract in df4['abstract'].dropna():
    abstract_set.append(abstract)

# list for tokenized documents in loop
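
The snippet is cut off here; a minimal sketch of the tokenize/stopword/stem loop the comment announces, following the pattern of the other LDA examples above:

texts = []
for abstract in abstract_set:
    raw = abstract.lower()
    tokens = tokenizer.tokenize(raw)
    stopped_tokens = [t for t in tokens if t not in en_stop and t not in stop_plus]
    texts.append([p_stemmer.stem(t) for t in stopped_tokens])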
# Cleaning the texts
import re
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0, 3896):
    
    text = re.sub('[^a-zA-Z]', ' ', dataset['text'][i])
    #'Not' is replaced by 'Nots' so that it will not be detected by stopwords.
    text=re.sub("not","nots",text)
    text = text.lower() 
    text = text.split()
    ps = PorterStemmer()
    text = [ps.stem(word) for word in text if not word in set(stopwords.words('english'))]
    text = ' '.join(text)
    corpus.append(text)

#Comparison models
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from scipy.sparse import lil_matrix
from sklearn.metrics import classification_report
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
def do_stemming(filtered):
    stemmed = []
    for f in filtered:
        stemmed.append(PorterStemmer().stem(f))
    return stemmed
Example #46
# Importing the dataset
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t', quoting=3)

# Cleaning the texts
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    review = [
        ps.stem(word) for word in review if not word in set(all_stopwords)
    ]
    review = ' '.join(review)
    corpus.append(review)

# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, -1].values

# Splitting the dataset into the Training set and Test set
import nltk
nltk.download('wordnet')
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

#stemming
word_data = "It originated from the idea that there are readers who prefer learning new skills from the comforts of their drawing rooms"
# First Word tokenization
nltk_tokens = nltk.word_tokenize(word_data)
#Next find the roots of the word
for w in nltk_tokens:
    print("Actual: %s  Stem: %s" % (w, porter_stemmer.stem(w)))

print('-----------------------------')
print('lemmatization')
#lemmatization
word_data2 = "It originated from the idea that there are readers who prefer learning new skills from the comforts of their drawing rooms"
nltk_tokens = nltk.word_tokenize(word_data2)
for w in nltk_tokens:
    print("Actual: %s  Lemma: %s" % (w, wordnet_lemmatizer.lemmatize(w)))
Example #48
from sklearn.metrics.pairwise import cosine_similarity

#Stemming and Lemmatisation
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
# Get corpus and CountVector
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('wordnet')
nltk.download('stopwords')
lem = WordNetLemmatizer()
stem = PorterStemmer()
stop_words = set(stopwords.words("english"))
new_words = ['not_the']
stop_words = stop_words.union(new_words)


#Should 'because' be added?
def preprocess(df, reset_list=[',', '.', '?', ';', 'however', 'but']):
    corpus = []
    for i in tqdm(range(df.shape[0])):
        text = df['review_text'][i]
        change_flg = 0
        #Convert to lowercase
        text = text.lower()

        ##Convert to list from string, loop through the review text
Example #49
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for index in dataset.index:
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][index])
    review = review.lower()
    review = review.split()
    #Remove words not useful for NLP, like 'the', 'and', 'or', etc.
    review = [
        word for word in review if not word in set(stopwords.words('english'))
    ]
    #Stemming : finding root of the words
    porterStemmer = PorterStemmer()
    review = [porterStemmer.stem(wrd) for wrd in review]
    review = ' '.join(review)
    corpus.append(review)

#Creating Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
countVectorizer = CountVectorizer(max_features=1500)
X = countVectorizer.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values

#Splitting the data into train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
Example #50
File: main.py  Project: amunds1/ntnu-1
paragraphs = list(map(lambda p: " ".join(p.split("\r\n")), paragraphs))

# 1.4 Tokenize
paragraphs = list(map(lambda p: p.split(), paragraphs))

# 1.5 Remove punctuation
import string
paragraphs = list(
    map(lambda p: list(map(lambda w: w.strip(string.punctuation).lower(), p)),
        paragraphs))
paragraphs = list(
    map(lambda p: list(filter(lambda w: len(w) != 0, p)), paragraphs))

# 1.6 Stem words
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
paragraphs = list(
    map(lambda p: list(map(lambda w: stemmer.stem(w), p)),
        paragraphs))  # Bottleneck

# 2.0 Build a dictionary
import gensim
dictionary = gensim.corpora.Dictionary(paragraphs)

# 2.1 Filter away stopwords
stopwords = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your".split(
    ",")
token2id = dictionary.token2id
stop_ids = [token2id[s] for s in stopwords if s in token2id]
dictionary.filter_tokens(stop_ids)
Example #51
    def __init__(self,
                 filenames,
                 parser,
                 batch_size=32,
                 stemmer=PorterStemmer(),
                 max_sentence_length=None,
                 build_voc=False,
                 voc_path='voc',
                 voc_threshold=1):

        self.filenames = filenames
        self.parser = parser
        self.batch_size = batch_size
        self.stemmer = stemmer
        self.len_doc = 0
        self.voc_threshold = voc_threshold

        if build_voc:
            print(self.filenames)
            vb = VocBuilder(self.filenames,
                            self.parser,
                            voc_path=voc_path,
                            voc_threshold=self.voc_threshold,
                            stemmer=stemmer)
            vb.build_vocab()
            #max_sentence_length = vb.max_sentence_length
            #print("max sentence length {}".format(max_sentence_length))

        voc_components = [
            'index2word.json', 'word2index.json', 'voc_summary.json'
        ]

        #sanity check to see if all vocab files are present
        try:
            for item in voc_components:
                assert item in os.listdir(voc_path)
        except (FileNotFoundError, AssertionError):
            raise Exception(
                "vocabulary has not been created; set build_voc=True in the BatchGenerator constructor"
            )

        # counts the total number of lines in the input documents
        for filename in self.filenames:
            with codecs.open(filename, 'r', encoding="utf8", errors='ignore') as f:
                self.len_doc += sum(1 for _ in f)

        if self.batch_size is None:

            self.batch_size = self.len_doc

        print("loading voc data...")
        with open(os.path.join(os.getcwd(), voc_path,
                               'index2word.json')) as data_file:
            self.index2word = json.load(data_file)
        self.index2word[len(self.index2word)] = '<PAD>'

        with open(os.path.join(os.getcwd(), voc_path,
                               'word2index.json')) as data_file:
            self.word2index = json.load(data_file)
        self.word2index['<PAD>'] = 0
        with open(os.path.join(os.getcwd(), voc_path,
                               'voc_summary.json')) as data_file:
            self.voc_summary = json.load(data_file)
        if max_sentence_length is None:
            self.sequence_len = self.voc_summary['max_sequence_len']
        elif max_sentence_length > self.voc_summary['max_sequence_len']:
            self.sequence_len = self.voc_summary['max_sequence_len']
        else:
            self.sequence_len = max_sentence_length

        self.vocab_size = len(self.index2word)
        print("...voc data loaded")
Example #52
0
 def tokenize(self, text):
     stemmer = PorterStemmer()
     tokens = nltk.word_tokenize(text)
     stems = self.stem_tokens(tokens, stemmer)
     return stems
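
 # The stem_tokens helper this method calls is not shown here; a minimal sketch
 # (an assumption based on the call site) of what it presumably does.
 def stem_tokens(self, tokens, stemmer):
     return [stemmer.stem(token) for token in tokens]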
Example #53
0
example = "Automation automatic automated automotive"

example_lower = example.lower().split()
print(example_lower)



# In[14]:


#stemming
#import stemmer

from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()


# In[15]:


#FOR loop for using Porter Stemmer

for word in example_lower:
    stemmed_word = ps.stem(word)
    print(stemmed_word)


# 
# #### Lemmatization:
# 
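
# The lemmatization cell itself is not shown; a minimal sketch (an assumption)
# applying WordNetLemmatizer to the same example words used above.
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
for word in example_lower:
    print(lemmatizer.lemmatize(word, pos='v'))
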
#the reviews dataset is tab-separated, so delimiter = '\t' is specified, and quoting = 3 is used to ignore double quotes ("")

#cleaning of the text
import re 
import nltk
nltk.download('stopwords')  #the stopwords corpus lists words that are not relevant for the reviews
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer #used to reduce inflected forms, e.g. loved / loving -> love


corpus = []
for i in range(0, 1000):
    reviews = re.sub('[^a-zA-Z]', ' ', dt['Review'][i])  #the '^' negation keeps only letters and replaces everything else with a space
    reviews = reviews.lower()  #convert everything to lowercase
    reviews = reviews.split()
    ps = PorterStemmer()
    reviews = [ps.stem(word) for word in reviews if not word in set(stopwords.words('english'))]  #set() is used only to speed up lookups on large reviews
    reviews = ' '.join(reviews)
    corpus.append(reviews)
    
#bag of words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500) #the stop_words parameter of CountVectorizer could be used instead of filtering manually
#max_features keeps only the most frequent terms, reducing the feature count from 1565 to 1500

X = cv.fit_transform(corpus).toarray()
y = dt.iloc[:,1].values

#split the data before fitting the Naive Bayes model
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

#encode the labels
#note: the categorical_features parameter was removed from OneHotEncoder in newer scikit-learn releases
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
label_y = LabelEncoder()
y_cat1 = label_y.fit_transform(y)
onehotencoder = OneHotEncoder(categorical_features=[0])
y_cat = onehotencoder.fit_transform(y_cat1.reshape(-1, 1)).toarray()
y_cat = y_cat[:, 1:13]

##################################################################################
########################        train neural networks        ####################

ps = PorterStemmer()
corpus = []
for i in range(0, 40000):
    test_train = re.sub('[^a-zA-Z]', ' ', dataset['content'][i])
    test_train = test_train.lower()
    test_train = test_train.split()
    test_train = [ps.stem(word) for word in test_train]
    test_train = ' '.join(test_train)
    corpus.append(test_train)
cv=CountVectorizer(max_features=30000)
x=cv.fit_transform(corpus).toarray()
#y_cat=y_cat[0:2000]
##################################################################################
####### splitting
from sklearn.model_selection import train_test_split
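
# The split call and the network itself are cut off at this point; below is a minimal
# sketch (an assumption, not a verbatim continuation) that splits x / y_cat and trains
# a small dense Keras network, assuming x and y_cat have the same number of rows.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

x_train, x_test, y_cat_train, y_cat_test = train_test_split(x, y_cat, test_size=0.2, random_state=0)

model = Sequential([
    Dense(128, activation='relu', input_shape=(x_train.shape[1],)),
    Dense(y_cat_train.shape[1], activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(x_train, y_cat_train, epochs=5, batch_size=64, validation_data=(x_test, y_cat_test))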
Example #56
0
import re
import xml.etree.ElementTree
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


def data_cleaning():

    tokenized_collection = []
    #loop through all xml files
    for file_name in range(5000):
        e = xml.etree.ElementTree.parse('./data/' + str(file_name + 1) +
                                        '.xml').getroot()

        #check if empty
        if not e.text:
            tokenized_collection.append([])
        else:
            xml_text = word_tokenize(e.text)
            xml_text_stop = []

            #remove empty tokens and strip punctuation from each word
            xml_text_stop = list(filter(None, xml_text))
            for index, w in enumerate(xml_text_stop):
                xml_text_stop[index] = re.sub(r'[^\w\s]', '', w)

            #stop words processing
            xml_text_stop_fin = []
            stopWords = set(stopwords.words('english'))
            for w in xml_text_stop:
                if w not in stopWords:
                    xml_text_stop_fin.append(w)

            #stem words processing
            ps = PorterStemmer()
            xml_text_stop_fin = [ps.stem(a) for a in xml_text_stop_fin]

            #remove words with length lesser than 3
            xml_text_stop_fin = [
                word for word in xml_text_stop_fin if len(word) >= 3
            ]

            #remove duplicates
            xml_text_stop_cleaned = list(set(xml_text_stop_fin))

            #remove numbers and add a hasNum feature
            #(build a new list rather than deleting while enumerating, which skips elements)
            has_num = any(str(w).isdigit() for w in xml_text_stop_cleaned)
            xml_text_stop_cleaned = [
                w for w in xml_text_stop_cleaned if not str(w).isdigit()
            ]
            if has_num and 'hasNum' not in xml_text_stop_cleaned:
                xml_text_stop_cleaned.append('hasNum')

            #print(xml_text_stop_fin)
            tokenized_collection.append(xml_text_stop_cleaned)

    #remove features that are mentioned less than 5 times in the dataset
    #create feature set
    feature_set = set()
    for item in tokenized_collection:
        for w in item:
            if w not in feature_set:
                feature_set.add(w)
    #look up frequencies of occurrences
    for feature in feature_set.copy():
        feature_count = 0
        for tokenized_collection_item in tokenized_collection:
            feature_count += tokenized_collection_item.count(feature)
        if feature_count < 5:
            for tokenized_collection_item_second in tokenized_collection:
                while feature in tokenized_collection_item_second:
                    tokenized_collection_item_second.remove(feature)
            feature_set.remove(feature)

    print(len(feature_set))

    #print(tokenized_collection)
    return feature_set, tokenized_collection
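
# A follow-up sketch (not in the snippet above) of one way the returned feature set
# and tokenized documents could be turned into a binary document-term matrix; the
# sorted() ordering of the features is an assumption.
def build_feature_matrix(feature_set, tokenized_collection):
    features = sorted(feature_set)
    index = {w: i for i, w in enumerate(features)}
    matrix = []
    for doc in tokenized_collection:
        row = [0] * len(features)
        for w in doc:
            if w in index:
                row[index[w]] = 1
        matrix.append(row)
    return features, matrix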
Example #57
0
tokens = preprocess(txt, lowercase=True)
punctuation = list(string.punctuation)
en_stop = get_stop_words('en')
stop = stopwords.words('english') + punctuation + [
    'http', 'html', 'com', ':/', 'rt', 'via', "https", "com"
]
terms_stop = [
    term for term in tokens
    if term not in stop and len(term) > 2 and not term in en_stop
]
stopped_tokens = [i for i in terms_stop if not i in en_stop]  # redundant: terms_stop already excludes en_stop

# In[8]:

p_stemmer = PorterStemmer()
texts = [p_stemmer.stem(i) for i in terms_stop]
print(len(texts))

# In[22]:

#with open('processed_texts.txt', 'w') as fp:
#        fp.write(str(texts))

# In[34]:

#f = open(mydir + "processed_texts.txt", 'r')
#texts = f.read()
#texts = texts.split()
#len(texts)
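
# A small follow-up sketch (not in the snippet above): inspecting the most frequent
# stemmed terms with collections.Counter before any further modelling.
from collections import Counter

term_counts = Counter(texts)
print(term_counts.most_common(10))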
Example #58
0
from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()

from nltk.stem.porter import PorterStemmer
stem = PorterStemmer()

word = 'flying'
stemmed = stem.stem(word)

print(f'Lemmatized Word: {lem.lemmatize(word, "v")}')
print(f'Stemmed Word: {stemmed}')

Example #59
0
# In[5]:

from nltk.corpus import stopwords

sw = stopwords.words('english')
clean_tokens = [token for token in tokens if token not in sw]

# In[6]:

clean_tokens

# ['Citizens', 'India', 'known', 'Indians']

# In[7]:

from nltk.stem.porter import PorterStemmer

pstemmer = PorterStemmer()
[pstemmer.stem(token) for token in clean_tokens]

# ['citizen', 'india', 'known', 'indian']

# In[8]:

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
[lemmatizer.lemmatize(token) for token in clean_tokens]

#['Citizens', 'India', 'known', 'Indians']
Example #60
0
 def __init__(self, stemmer=PorterStemmer()):
     self._stemmer = stemmer