Exemplo n.º 1
0
from porter_stemer import PorterStemmer
# Demo: run the Porter stemmer over a sample string and print the result.
# NOTE(review): the whole sentence is handed to stem() as a single buffer
# (start index 0, end index of the last character) -- confirm whether
# stemming each word separately was intended.
words = 'it\'s those dogs i liked'
stem = PorterStemmer()
# Parenthesised single-argument form prints the same value on Python 2 and
# is also valid on Python 3 (where `print` is a function).
print(stem.stem(words, 0, len(words) - 1))

Exemplo n.º 2
0
 def __init__(self):
     """Create the stemmer and load the stop-word list from disk.

     The stop-word file path is built as ``<dir of this module>/../Data/
     english.stop`` and kept on ``self.STOP_WORDS_FILE``; its contents are
     split on whitespace into ``self.stopwords``.
     """
     self.stemmer = PorterStemmer()
     module_dir = os.path.dirname(os.path.realpath(__file__))
     self.STOP_WORDS_FILE = '%s/../Data/english.stop' % module_dir
     with codecs.open(self.STOP_WORDS_FILE, 'r') as stop_file:
         raw_text = stop_file.read()
     # Whitespace split: one entry per token in the file.
     self.stopwords = raw_text.split()
Exemplo n.º 3
0
class LSA(object):
    """Text preprocessing plus LSI-based similarity scoring.

    Preprocessing lowercases a sentence, normalises the right single
    quotation mark to an ASCII apostrophe, stems every space-separated token
    with ``self.stemmer`` and drops tokens found in ``self.stopwords``.
    """

    # Class-level defaults; both are rebound per instance in __init__.
    stemmer = None
    stopwords = []

    def __init__(self):
        """Create the stemmer and load the stop-word list from disk.

        The stop-word file is looked up at ``../Data/english.stop`` relative
        to the directory containing this module and split on whitespace.
        """
        self.stemmer = PorterStemmer()
        self.STOP_WORDS_FILE = '%s/../Data/english.stop' % os.path.dirname(
            os.path.realpath(__file__))
        with codecs.open(self.STOP_WORDS_FILE, 'r') as f:
            self.stopwords = f.read().split()

    def remove_stop_wrods(self, word_list):
        """Return the items of *word_list* not in ``self.stopwords``, in order."""
        # Build the lookup set once per call so each membership test is O(1)
        # instead of a scan over the whole stop-word list.
        stop_set = set(self.stopwords)
        return [word for word in word_list if word not in stop_set]

    def _pre_solve_1(self, sentence):
        """Return *sentence* lowercased with U+2019 replaced by ``'``.

        Byte strings are decoded as UTF-8 first. Without that, a byte string
        that actually contains the curly apostrophe cannot be matched against
        the text pattern (Python 2 raised UnicodeDecodeError on it).
        """
        if isinstance(sentence, bytes):
            sentence = sentence.decode('utf-8')
        # Escape form avoids the Python 2-only ``unicode(...)`` builtin and a
        # non-ASCII literal in the source file.
        return sentence.replace(u'\u2019', '\'').lower()

    def _pre_solve_stemmer(self, sentence):
        """Stem every single-space-separated token and re-join with spaces."""
        return ' '.join(
            [self.stemmer.stem(w, 0, len(w) - 1) for w in sentence.split(' ')])

    def _pre_solve_filter_stopwords(self, sentence):
        """Drop stop words from a space-separated *sentence*."""
        return ' '.join(self.remove_stop_wrods(sentence.split(' ')))

    def pre_solve(self, sentence):
        """Run the full pipeline: normalise, stem, then filter stop words.

        NOTE(review): stop words are filtered *after* stemming, so a stop
        word whose stem differs from its listed form would not be removed --
        confirm whether the stop-word file holds stemmed forms.
        """
        sentence = self._pre_solve_1(sentence)
        sentence = self._pre_solve_stemmer(sentence)
        return self._pre_solve_filter_stopwords(sentence)

    def get_documents(self, file_path):
        """Read a tab-separated file and preprocess its fourth column.

        The first line is skipped (presumably a header row -- confirm).

        Returns:
            ``(documents, documents_words)``: the preprocessed sentences and,
            in parallel, each sentence split on single spaces.

        Raises:
            IndexError: if a line has fewer than four tab-separated fields
                after stripping (this includes blank lines).
        """
        documents = []
        documents_words = []
        with codecs.open(file_path, 'r') as fopen:
            next(fopen, None)  # skip the first line
            for line in fopen:
                # NOTE(review): strip() also removes leading/trailing tabs,
                # so empty first/last fields shift the column positions.
                fields = line.strip().split('\t')
                sentence = self.pre_solve(fields[3])
                documents_words.append(sentence.split(' '))
                documents.append(sentence)
        return documents, documents_words

    def main(self,
             query,
             target_id,
             sorurce_file_path,
             documents,
             documents_words,
             num_topics=20):
        """Return the similarity between *query* and document *target_id*.

        Args:
            query: raw query text; it goes through ``pre_solve`` first.
            target_id: index into ``documents_words`` of the document whose
                score is returned.
            sorurce_file_path: unused; kept for interface compatibility.
            documents: unused; kept for interface compatibility.
            documents_words: tokenised documents (list of token lists).
            num_topics: number of topics passed to the LSI model.

        Returns:
            The score the similarity index reports for ``target_id``.
        """
        dictionary = corpora.Dictionary(documents_words)
        corpus = [dictionary.doc2bow(words) for words in documents_words]
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]

        lsi = models.LsiModel(corpus_tfidf,
                              id2word=dictionary,
                              num_topics=num_topics)
        # The model is trained on TF-IDF vectors, so both the indexed
        # documents and the query must pass through TF-IDF before LSI;
        # projecting raw bag-of-words vectors would mix vector spaces.
        index = similarities.MatrixSimilarity(lsi[corpus_tfidf])

        query_bow = dictionary.doc2bow(self.pre_solve(query).split(' '))
        query_lsi = lsi[tfidf[query_bow]]

        sims = list(index[query_lsi])
        return sims[target_id]
Exemplo n.º 4
0
	def __init__(self):
		"""Set up the stemmer and read the stop-word list.

		``self.STOP_WORDS_FILE`` is ``<dir of this module>/../Data/english.stop``;
		the file's whitespace-separated tokens become ``self.stopwords``.
		"""
		self.stemmer = PorterStemmer()
		here = os.path.dirname(os.path.realpath(__file__))
		self.STOP_WORDS_FILE = '%s/../Data/english.stop' % here
		with codecs.open(self.STOP_WORDS_FILE, 'r') as handle:
			contents = handle.read()
		# One list entry per whitespace-delimited token.
		self.stopwords = contents.split()
Exemplo n.º 5
0
class LSA(object):
	"""Text preprocessing plus LSI-based similarity scoring.

	Preprocessing lowercases a sentence, normalises the right single
	quotation mark to an ASCII apostrophe, stems every space-separated token
	with ``self.stemmer`` and drops tokens found in ``self.stopwords``.
	"""

	# Class-level defaults; both are rebound per instance in __init__.
	stemmer = None
	stopwords = []

	def __init__(self):
		"""Create the stemmer and load the stop-word list from disk.

		The stop-word file is looked up at ``../Data/english.stop`` relative
		to the directory containing this module and split on whitespace.
		"""
		self.stemmer = PorterStemmer()
		self.STOP_WORDS_FILE = '%s/../Data/english.stop' %  os.path.dirname(os.path.realpath(__file__))
		with codecs.open(self.STOP_WORDS_FILE, 'r') as f:
			self.stopwords = f.read().split()

	def remove_stop_wrods(self, word_list):
		"""Return the items of *word_list* not in ``self.stopwords``, in order."""
		# Build the lookup set once per call so each membership test is O(1)
		# instead of a scan over the whole stop-word list.
		stop_set = set(self.stopwords)
		return [word for word in word_list if word not in stop_set]

	def _pre_solve_1(self, sentence):
		"""Return *sentence* lowercased with U+2019 replaced by ``'``.

		Byte strings are decoded as UTF-8 first. Without that, a byte string
		that actually contains the curly apostrophe cannot be matched against
		the text pattern (Python 2 raised UnicodeDecodeError on it).
		"""
		if isinstance(sentence, bytes):
			sentence = sentence.decode('utf-8')
		# Escape form avoids the Python 2-only ``unicode(...)`` builtin and a
		# non-ASCII literal in the source file.
		return sentence.replace(u'\u2019', '\'').lower()

	def _pre_solve_stemmer(self, sentence):
		"""Stem every single-space-separated token and re-join with spaces."""
		return ' '.join([self.stemmer.stem(w, 0, len(w)-1) for w in sentence.split(' ')])

	def _pre_solve_filter_stopwords(self, sentence):
		"""Drop stop words from a space-separated *sentence*."""
		return ' '.join(self.remove_stop_wrods(sentence.split(' ')))

	def pre_solve(self, sentence):
		"""Run the full pipeline: normalise, stem, then filter stop words.

		NOTE(review): stop words are filtered *after* stemming, so a stop
		word whose stem differs from its listed form would not be removed --
		confirm whether the stop-word file holds stemmed forms.
		"""
		sentence = self._pre_solve_1(sentence)
		sentence = self._pre_solve_stemmer(sentence)
		return self._pre_solve_filter_stopwords(sentence)

	def get_documents(self, file_path):
		"""Read a tab-separated file and preprocess its fourth column.

		The first line is skipped (presumably a header row -- confirm).

		Returns:
			``(documents, documents_words)``: the preprocessed sentences and,
			in parallel, each sentence split on single spaces.

		Raises:
			IndexError: if a line has fewer than four tab-separated fields
				after stripping (this includes blank lines).
		"""
		documents = []
		documents_words = []
		with codecs.open(file_path, 'r') as fopen:
			next(fopen, None)  # skip the first line
			for line in fopen:
				# NOTE(review): strip() also removes leading/trailing tabs,
				# so empty first/last fields shift the column positions.
				fields = line.strip().split('\t')
				sentence = self.pre_solve(fields[3])
				documents_words.append(sentence.split(' '))
				documents.append(sentence)
		return documents, documents_words

	def main(self, query, target_id, sorurce_file_path, documents, documents_words, num_topics=20):
		"""Return the similarity between *query* and document *target_id*.

		Args:
			query: raw query text; it goes through ``pre_solve`` first.
			target_id: index into ``documents_words`` of the document whose
				score is returned.
			sorurce_file_path: unused; kept for interface compatibility.
			documents: unused; kept for interface compatibility.
			documents_words: tokenised documents (list of token lists).
			num_topics: number of topics passed to the LSI model.

		Returns:
			The score the similarity index reports for ``target_id``.
		"""
		dictionary = corpora.Dictionary(documents_words)
		corpus = [dictionary.doc2bow(words) for words in documents_words]
		tfidf = models.TfidfModel(corpus)
		corpus_tfidf = tfidf[corpus]

		lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics)
		# The model is trained on TF-IDF vectors, so both the indexed
		# documents and the query must pass through TF-IDF before LSI;
		# projecting raw bag-of-words vectors would mix vector spaces.
		index = similarities.MatrixSimilarity(lsi[corpus_tfidf])

		query_bow = dictionary.doc2bow(self.pre_solve(query).split(' '))
		query_lsi = lsi[tfidf[query_bow]]

		sims = list(index[query_lsi])
		return sims[target_id]
Exemplo n.º 6
0
from porter_stemer import PorterStemmer
# Demo: stem a sample string with the Porter stemmer and print the result.
# NOTE(review): stem() receives the entire sentence as one buffer, from
# index 0 to the index of its last character -- confirm whether per-word
# stemming was the intent.
words = 'it\'s those dogs i liked'
stem = PorterStemmer()
# The parenthesised call prints identically on Python 2 and also works on
# Python 3, where the bare `print` statement is a syntax error.
print(stem.stem(words, 0, len(words) - 1))