def _createFeatureVect(self, input):
    # Build a binary bag-of-words vector over the common-words vocabulary.
    x = numpy.zeros(len(self._common_words))
    for word in input.x.split():
        word = PorterStemmer().stem(word.lower())
        if word in self._common_words:
            x[self._common_words.index(word)] = 1
    return x
def generate_hash_map(self):
    """
    Build a hash map of every word in the currently parsed file.
    Each word is run through the Porter stemming algorithm before it is
    inserted into the hash map.
    :return:
    """
    # clear the hash map
    self._hash_map.clear()
    for line in self._document_content:
        line = line.encode('utf-8')
        line = str(line).translate(PUNCTUATION_TRANS)
        words = line.split()
        for word in words:
            word = word.decode('utf-8-sig')
            word = PorterStemmer().stem(word)
            word = word.lower()
            if word.isalpha() and not self._is_stop_word(word):
                # create the entry on first sight, otherwise count it up
                if word not in self._hash_map:
                    self._hash_map[word] = 1
                else:
                    self._hash_map[word] += 1
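# PUNCTUATION_TRANS is defined elsewhere in that project. A typical
# definition (an assumption for illustration, not the original) maps every
# punctuation character to a space so str.translate() strips it:

import string

PUNCTUATION_TRANS = str.maketrans(string.punctuation,
                                  ' ' * len(string.punctuation))

print('Hello, world!'.translate(PUNCTUATION_TRANS))  # 'Hello  world '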
def _findFeatureVector(self, trainingData):
    for i in range(0, trainingData.m):
        x = numpy.zeros(trainingData.n)
        for word in trainingData.TrainingData[i].x.split():
            word = PorterStemmer().stem(word.lower())
            if word in self._common_words:
                x[self._common_words.index(word)] = 1
        self._X[i, :] = x
import operator

from nltk.stem import PorterStemmer


def findMostCommonWords(trainingData):
    # Count stemmed, lowercased words across all samples and return the
    # n most frequent stems as the vocabulary.
    words = dict()
    for input in trainingData.TrainingData:
        for word in input.x.split():
            word = PorterStemmer().stem(word.lower())
            if word in words:
                words[word] += 1
            else:
                words[word] = 1
    sorted_words = sorted(words.items(), key=operator.itemgetter(1), reverse=True)
    return [word[0] for word in sorted_words][:trainingData.n]
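# findMostCommonWords and the two feature-vector helpers above expect a
# trainingData object exposing TrainingData (samples with an .x text field),
# m (sample count) and n (vocabulary size). A minimal stand-in, using
# hypothetical names purely for illustration:

from collections import namedtuple

Sample = namedtuple('Sample', 'x')

class ToyData:
    def __init__(self, texts, n):
        self.TrainingData = [Sample(t) for t in texts]
        self.m = len(texts)
        self.n = n

data = ToyData(['the cat runs', 'a cat sleeps', 'dogs run fast'], n=2)
print(findMostCommonWords(data))  # top-2 stems by count, e.g. ['cat', 'run']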
def pre_processing_line(self, line):
    line = cjson.decode(line)
    list_name = line['list_name']
    # split camel case, e.g. TexasAggies -> Texas Aggies
    list_name = list_name.strip()
    name_split = re.split(r'\W+|_|\d+', list_name)
    new_list = []
    for text in name_split:
        if text == '':
            continue
        words = ''
        prev_letter = text[0]
        words += prev_letter
        for i in range(1, len(text)):
            cur_letter = text[i]
            # a lower-to-upper transition marks a word boundary
            if cur_letter.isupper() and prev_letter.islower():
                words += ' '
            words += cur_letter
            prev_letter = cur_letter
        words = words.split(' ')
        for word in words:
            word = PorterStemmer().stem_word(word)
            new_list.append(word.lower())
    # if no tag exists, neglect this line
    if new_list == []:
        return 0
    line['tag'] = [word for word in new_list if word not in self.stoplist_]
    del line['list_name']
    del line['_id']
    return line
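# The character-by-character loop above inserts a space at every
# lower-to-upper transition; a compact regex alternative (a sketch, not
# part of the original project) does the same split in one call:

import re

spaced = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', 'TexasAggies')
print(spaced)  # Texas Aggies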
import sqlite3

from nltk.stem import PorterStemmer
from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser


def query_result(getter_address, search_query, max_distance):
    database_name = "givegetgreen_db"
    conn = sqlite3.connect(database_name)
    address_list = []
    all_fields_list = []
    hits = []
    ix = open_dir("indexdir")
    f = open("search_results.txt", "a")
    # stem every search term and join the stems into one OR query
    search_list = search_query.split(" ")
    stemmed = [PorterStemmer().stem(word.lower()) for word in search_list]
    search_query = " OR ".join(stemmed)
    with ix.searcher() as searcher:
        query = MultifieldParser(["title", "category", "description"],
                                 schema=ix.schema).parse(search_query)
        results = searcher.search(query)
        for words in results:
            f.write(str(words) + "\n")
            hits.append(str(words))
            address_list.append(words['address'])
            c = conn.cursor()
            x = (int(words['id']),)
            for row in c.execute('SELECT id FROM posting_posting WHERE id = ?', x):
                all_fields_list.append(row)
    f.close()
    getter_address = getter_address.lower()
    conn.commit()
    conn.close()
    return add_filter(getter_address, address_list, all_fields_list, max_distance)
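# Whoosh can OR the terms natively instead of hand-building the query
# string: pass group=OrGroup to the parser (standard Whoosh API; ix is the
# index handle opened above).

from whoosh.qparser import MultifieldParser, OrGroup

parser = MultifieldParser(["title", "category", "description"],
                          schema=ix.schema, group=OrGroup)
query = parser.parse(u"solar panel donation")  # terms are OR'd automatically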
import codecs
import os
import pickle

import nltk
from nltk.stem import PorterStemmer

# LIMIT, IGNORE_STOPWORDS, IGNORE_NUMBERS, IGNORE_SINGLES and score_BM25
# are module-level settings and helpers defined elsewhere.


def index(document_directory, dictionary_file):
    # assign each document in the corpus a sequential docID
    docID_list = []
    i = 1
    for doc in os.listdir(document_directory):
        docID_list.append([doc, i])
        i += 1
    f1 = open("output", "wb")
    pickle.dump(docID_list, f1)
    f1.close()

    stopwords = nltk.corpus.stopwords.words('english')
    docs_indexed = 0  # counter for the number of docs indexed
    dictionary = {}  # key: term, value: docIDs containing term (includes repeats)
    l_no = {}  # key: term, value: [line number, word position] pairs
    length = []  # token count per document
    count = 0
    cnt = 0
    c1 = 0
    word_positions = 0

    # for each document in corpus
    for docID in docID_list:
        if LIMIT and docs_indexed == LIMIT:
            break
        file_path = os.path.join(document_directory, str(docID[0]))
        cnt += 1
        line_no = 1
        # if valid document
        if os.path.isfile(file_path):
            file = codecs.open(file_path, encoding='utf-8', errors='ignore')
            line = file.readline()
            c = 0
            # read the entire document line by line
            while line != '':
                tokens = nltk.word_tokenize(line)  # word tokens of this line
                # for each term in the line
                for word in tokens:
                    word_positions += 1
                    word = PorterStemmer().stem(word)
                    c += 1
                    count += 1
                    term = word.lower()  # casefolding
                    if IGNORE_STOPWORDS and term in stopwords:
                        continue  # if ignoring stopwords
                    if IGNORE_NUMBERS and term.isnumeric():
                        continue  # if ignoring numbers
                    if term[-1] == "'":
                        term = term[:-1]  # remove trailing apostrophe
                    if IGNORE_SINGLES and len(term) == 1:
                        continue  # if ignoring single-character terms
                    if len(term) == 0:
                        continue  # ignore terms emptied by the strip above
                    po = [line_no, word_positions]
                    # if the term is new, define it in the dictionary;
                    # otherwise append this docID and position
                    if term not in dictionary:
                        dictionary[term] = [int(docID[1])]
                        l_no[term] = [po]
                        c1 += 1
                    else:
                        dictionary[term].append(docID[1])
                        l_no[term].append(po)
                line_no += 1
                line = file.readline()
            docs_indexed += 1
            length.append(c)
            file.close()

    f3 = open("len", "wb")
    pickle.dump(l_no, f3)
    f3.close()

    dict_file = codecs.open(dictionary_file, 'w', encoding='utf-8')
    dict_file.write(str(cnt) + '\n')
    ct = count / cnt  # average document length
    fre = {}
    pr = 0
    score = {}
    # compute a BM25 score and term frequency for every (term, document) pair
    for term, docs in dictionary.items():
        for x in range(cnt):
            freq = 0
            n = 0
            for t in dictionary[term]:
                if x + 1 == t:
                    freq += 1
                if pr != t and pr != 0:
                    n += 1
                pr = t
            sc = score_BM25(n, freq, cnt, length[x], ct)  # length[x] is the length of doc x+1
            if term not in score:
                score[term] = [sc]
            else:
                score[term].append(sc)
            if term not in fre:
                fre[term] = [freq]
            else:
                fre[term].append(freq)
        dict_file.write(term + " " + str(dictionary[term]) + " " +
                        str(l_no[term]) + "\n")

    # close files
    dict_file.close()
    f2 = open("dict", "wb")
    pickle.dump(score, f2)
    f2.close()
    f4 = open("freq", "wb")
    pickle.dump(fre, f4)
    f4.close()
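# score_BM25 itself is not shown in this snippet; a conventional Okapi BM25
# weight matching the call site (n = document frequency, f = term frequency,
# N = corpus size, dl = document length, avdl = average document length)
# might look like this sketch:

import math

def score_BM25(n, f, N, dl, avdl, k1=1.2, b=0.75):
    # idf damps very common terms; the tf component saturates at k1 and is
    # normalised by document length relative to the corpus average
    idf = math.log(1 + (N - n + 0.5) / (n + 0.5))
    return idf * (f * (k1 + 1)) / (f + k1 * (1 - b + b * dl / avdl))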
from nltk.stem import PorterStemmer


def stemming(word):
    # stem_word is the pre-NLTK-3 spelling; on NLTK 3+ use stem() instead
    word = PorterStemmer().stem_word(word.lower())
    return word
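# A quick check of what the Porter stemmer produces (NLTK 3 spelling, via
# stem()); the outputs are the classic Porter reductions:

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
for w in ["running", "flies", "caresses", "ponies"]:
    print(w, "->", stemmer.stem(w))
# running -> run, flies -> fli, caresses -> caress, ponies -> poni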
def make_wordwheres(self):
    self.wordswhere = " TRUE "
    limits = []
    if self.word_limits:
        """
        This doesn't currently allow mixing of one- and two-word searches
        together in a logical way. It might be possible to just join on both
        the tables in MySQL--I'm not completely sure what would happen. But
        the philosophy has been to keep users from doing those searches as
        far as possible in any case.
        """
        for phrase in self.limits['word']:
            locallimits = dict()
            array = phrase.split()
            for n, word in enumerate(array):
                searchingFor = word
                if self.word_field == "stem":
                    from nltk import PorterStemmer
                    searchingFor = PorterStemmer().stem_word(searchingFor)
                if self.word_field == "case_insensitive" or \
                   self.word_field == "Case_Insensitive":
                    # That's a little joke. Get it?
                    searchingFor = searchingFor.lower()
                selectString = "SELECT wordid FROM %s WHERE %s = %%s" % (self.wordsheap, self.word_field)
                logging.debug(selectString)
                cursor = self.db.cursor
                cursor.execute(selectString, (searchingFor,))
                # Set the search key being used.
                search_key = "wordid"
                if self.gram_size() > 1:
                    # 1-indexed entries in the bigram tables.
                    search_key = "word{}".format(n + 1)
                for row in cursor.fetchall():
                    wordid = row[0]
                    try:
                        locallimits[search_key] += [wordid]
                    except KeyError:
                        locallimits[search_key] = [wordid]
            if len(locallimits) > 0:
                limits.append(where_from_hash(locallimits, comp=" = ",
                                              escapeStrings=False))
        self.wordswhere = "(" + ' OR '.join(limits) + ")"
        if limits == []:
            # In the case that nothing has been found, tell it explicitly
            # to search for a condition when nothing will be found.
            self.wordswhere = "bookid = -1"

    wordlimits = dict()
    limitlist = copy.deepcopy(list(self.limits.keys()))
    for key in limitlist:
        if re.search(r"words\d", key):
            wordlimits[key] = self.limits[key]
            self.max_word_length = max(self.max_word_length, 2)
            del self.limits[key]
    if len(list(wordlimits.keys())) > 0:
        self.wordswhere = where_from_hash(wordlimits)
    return self.wordswhere