import csv
import os

from nltk.tokenize import word_tokenize
from progress.bar import IncrementalBar
from stemming import PorterStemmer  # project-local Porter implementation (stem(word, i, j) signature)


def file_processing(file, root, stop_words):
    p = PorterStemmer()
    # Count the data rows (header excluded) to size the progress bar
    with open(file) as f:
        length = len(f.readlines()) - 1
    bar = IncrementalBar('In progress', max=length)
    with open(file, 'r') as csvFile:
        reader = csv.reader(csvFile)
        next(reader)  # skip the header row
        for row, i in zip(reader, range(1, length + 1)):
            # One sub-directory per class label (row[1])
            if not os.path.exists(root + row[1]):
                os.mkdir(root + row[1])
            # Remove stop words first
            example = row[0]
            word_tokens = word_tokenize(example)
            filtered_sentence = [w for w in word_tokens if w not in stop_words]
            joined_sentence = " ".join(filtered_sentence) + '\n'
            # Do stemming
            output = ''
            word = ''
            line = joined_sentence
            if line == '':  # never true here: joined_sentence always ends with '\n'
                break
            for c in line:
                if c.isalpha():
                    word += c.lower()
                else:
                    if word:
                        output += p.stem(word, 0, len(word) - 1)
                        word = ''
                    output += c.lower()
            # Write one stemmed document per row, named after its id (row[2])
            path = root + row[1] + '/' + row[2] + '.txt'
            with open(path, "w") as cursor:
                cursor.write(output)
            bar.next()
    bar.finish()

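# A minimal driver for file_processing above. This is a sketch: the file name
# 'train.csv', the output root 'data/train/' and the use of NLTK's stop-word
# list are assumptions, not part of the original snippet. The expected CSV
# layout follows the indexing above: text in column 0, class label in
# column 1, document id in column 2.
if __name__ == '__main__':
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
    file_processing('train.csv', 'data/train/', stop_words)
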
import re
from collections import Counter

from stemming import PorterStemmer  # project-local Porter implementation


class Process_text:
    def __init__(self, stop_list_filename):
        # Removing \n at the end of each word
        self.stop_list = map(str.rstrip, open(stop_list_filename, 'r').readlines())
        self.stemmer = PorterStemmer()

    def _tokenize(self, text):
        return re.findall(r'\w+', text)

    def _stem(self, words_list):
        return map(lambda x: self.stemmer.stem(x, 0, len(x) - 1), words_list)

    def _remove_common_words(self, words_list):
        # return [word for word in words_list if word not in self.stop_list]
        # From https://gist.github.com/glenbot/4684356
        # 2x as fast... But it's not a one-liner.
        stop_words = set(self.stop_list)
        for sw in stop_words.intersection(words_list):
            occurences = words_list.count(sw)
            for i in xrange(occurences):
                words_list.remove(sw)
        return words_list

    def _word_statistics(self, words_list):
        return Counter(words_list)

    def sanitize_rawtext(self, raw_text):
        return self._stem(self._remove_common_words(self._tokenize(raw_text.lower())))

    def sanitize_rawtext_with_stats(self, raw_text):
        # One-liners FTW
        return self._word_statistics(self.sanitize_rawtext(raw_text))

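# Example use of Process_text (a sketch: the stop-list file name and the
# sample sentence are assumptions). sanitize_rawtext returns the stemmed,
# stop-word-free token list; sanitize_rawtext_with_stats wraps it in a
# collections.Counter. Python 2, to match the xrange/map usage above.
processor = Process_text('stop_words.txt')
tokens = processor.sanitize_rawtext('Computers are computing computational things')
print tokens  # e.g. ['comput', 'comput', 'comput', 'thing'], depending on the stop list
print processor.sanitize_rawtext_with_stats('Computers are computing computational things')
# the same tokens, counted: e.g. Counter({'comput': 3, 'thing': 1})
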
def file_processing(file, stop_words):
    p = PorterStemmer()
    rows = []
    with open(file, 'r') as csvFile:
        reader = csv.reader(csvFile)
        next(reader)  # skip the header row
        for row in reader:
            # Remove stop words first
            example = row[1]
            word_tokens = word_tokenize(example)
            filtered_sentence = [w for w in word_tokens if w not in stop_words]
            joined_sentence = " ".join(filtered_sentence) + '\n'
            # Do stemming
            output = ''
            word = ''
            line = joined_sentence
            if line == '':  # never true here: joined_sentence always ends with '\n'
                break
            for c in line:
                if c.isalpha():
                    word += c.lower()
                else:
                    if word:
                        output += p.stem(word, 0, len(word) - 1)
                        word = ''
                    output += c.lower()
            # The class is unknown for test data, hence the '?' placeholder
            new_row = []
            new_row.append(output.rstrip('\n'))
            new_row.append('?')
            rows.append(new_row)
    # Write file
    with open("new_test.csv", "w") as csvFile:
        csvwriter = csv.writer(csvFile)
        csvwriter.writerow(['text', 'class'])
        csvwriter.writerows(rows)

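# Example invocation for this test-set variant (a sketch: the file name
# 'test.csv' and the NLTK stop-word list are assumptions). Note that unlike
# the first variant, the text is read from column 1 and the result goes to a
# single 'new_test.csv' with a placeholder class column.
from nltk.corpus import stopwords
file_processing('test.csv', set(stopwords.words('english')))
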
import math
import sys
import re

from stemming import PorterStemmer

p = PorterStemmer()

# control values
stem = False
stopwords = False

stopList = []
wordList = []
numberList = []
docInfo = {}
documentList = {}
contextList = {}
frequencyList = {}
locationList = {}
totalList = {}


# check if number to remove numbers and titles
def contains_digits(s):
    return any(char.isdigit() for char in s)


# read doc
def readDoc(doc, context):
    lists = doc.split(" ")
    for w in range(len(lists)):
        i = lists[w]

import re

from stemming import PorterStemmer  # project-local Porter implementation


class Process_query:
    def __init__(self, stop_list_filename, format_type="vectorial"):
        self.stop_list = map(str.rstrip, open(stop_list_filename, "r").readlines())
        self.stemmer = PorterStemmer()
        self.format_type = format_type

    def format_query(self, query):
        if self.format_type == "vectorial" or self.format_type == "probabilistic":
            return self._create_vectorial_query_from_string(query)
        elif self.format_type == "boolean":
            return self._create_boolean_query_from_json(query)
        else:
            raise ValueError("Unsupported query type!")

    def _create_vectorial_query_from_string(self, query_string):
        return self._vectorial_stem_elements_from_list(
            self._remove_common_words_from_list(re.findall(r"\w+", query_string.lower()))
        )

    def _create_boolean_query_from_json(self, query_string):
        """
        We only accept DNF queries, i.e. a disjunction of conjunctions of terms
        (possibly negated with NOT). The valid accepted format is a string in
        DNF form. Examples:

            'computer AND series OR NOT conclusion AND testing'
            'study OR preprocessing'
            'IBM AND simulation'

        The query is processed by a stemmer and common words are removed, so
        there is no need to pre-process it before passing it in.

        Empty list queries or clauses return nothing. For instance,
        [[], ['another', 'nonrational', 'model']] is equivalent to
        [['another', 'nonrational', 'model']], which, after stemming +
        common-words removal, gives [['nonrat', 'model']]
        """
        # Lowercase first so that the AND/OR/NOT operators in the examples
        # above match the lowercase separators and the "not " prefix used below
        query_list = self._byteify(
            map(lambda x: x.split(" and "), query_string.lower().split(" or "))
        )
        if not self._check_valid_query(query_list):
            raise ValueError("The query does not have a valid format")
        return self._sanitize_boolean_query(query_list)

    def _byteify(self, input):
        """
        Transforms unicode objects from JSON decode to UTF-8 ones. Copied from
        stackoverflow.com/questions/956867/how-to-get-string-objects-instead-of-unicode-ones-from-json-in-python#answer-13105359
        """
        if isinstance(input, dict):
            return {self._byteify(key): self._byteify(value)
                    for key, value in input.iteritems()}
        elif isinstance(input, list):
            return [self._byteify(element) for element in input]
        elif isinstance(input, unicode):
            return input.encode("utf-8")
        else:
            return input

    def _check_valid_query(self, query_list):
        if type(query_list) is not list:
            return False
        for element in query_list:
            if type(element) is not list or not self._check_only_strings_in_list(element):
                return False
        return True

    def _check_only_strings_in_list(self, element_list):
        for element in element_list:
            if type(element) is not str:
                return False
        return True

    def _sanitize_boolean_query(self, query_list):
        # Stem the elements and remove the common ones. For speed, remove
        # common words first, then stem (stemming is the expensive step).
        return map(
            lambda element: self._boolean_stem_elements_from_list(
                self._boolean_remove_common_words_from_list(element)
            ),
            query_list,
        )

    def _boolean_remove_common_words_from_list(self, word_list):
        return [element for element in word_list if not self._boolean_should_delete(element)]

    def _vectorial_stem_elements_from_list(self, word_list):
        return map(lambda x: self.stemmer.stem(x, 0, len(x) - 1), word_list)

    def _boolean_stem_elements_from_list(self, word_list):
        for i in xrange(len(word_list)):
            if self._is_real_word(word_list[i]):
                word_list[i] = self.stemmer.stem(word_list[i], 0, len(word_list[i]) - 1)
            else:
                # Keep the "not " prefix and stem only the term itself
                term = word_list[i][4:]
                word_list[i] = "not " + self.stemmer.stem(term, 0, len(term) - 1)
        return word_list

    def _stem_elements_from_list(self, query_words):
        return map(lambda x: self.stemmer.stem(x, 0, len(x) - 1), query_words)

    def _remove_common_words_from_list(self, query_words):
        return [word for word in query_words if word not in self.stop_list]

    def _boolean_should_delete(self, element):
        if self._is_real_word(element):
            real_element = element
        else:
            real_element = element[4:]
        return real_element in self.stop_list

    def _is_real_word(self, element):
        return element[:4] != "not "

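# Example use of Process_query in both modes (a sketch: the stop-list file
# name and the query strings are assumptions). Python 2, to match the
# xrange/unicode usage above.
vectorial = Process_query('stop_words.txt')  # default format_type="vectorial"
print vectorial.format_query('computer simulation of series')
# -> stemmed terms with stop words removed, e.g. ['comput', 'simul', 'seri']

boolean = Process_query('stop_words.txt', format_type='boolean')
print boolean.format_query('computer AND series OR NOT conclusion')
# -> one list per conjunctive clause, e.g. [['comput', 'seri'], ['not conclus']]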