class NoiseFilter:
    """Filter noise (garbage) out of a collection of Unidentified Terms (UTs).

    Two independent filters are applied to every term:

    * a regex/length filter (see ``_is_regex_noise``), and
    * an IDF filter that keeps a term only when its IDF, as reported by the
      indexer, does not exceed ``idf_factor * idf_base``.

    A term ends up in the combined result only when both filters keep it.
    The results of the most recent ``FilterNoise`` call are stored on the
    instance (``filtered_terms_*``, ``noise_terms_*``, ``combined_*``).
    """

    # Actions accepted in args.action[0].  The trailing comma matters: without
    # it this is a plain string and ``in`` degrades to a substring test.
    ACTIONS = ('filter',)

    # Inclusive bounds on the length of a term that may pass the regex filter.
    MIN_TERM_LENGTH = 3
    MAX_TERM_LENGTH = 6

    # Regexes to use as a filter.  The string values are unchanged; raw
    # literals are used where the pattern contains regex escapes so Python
    # does not warn about invalid string escape sequences.
    ONLYNUM = "^[0-9]*$"
    SPECIAL = r"""[/_$&+,:;{}"=?\[\]@#|~'<>^*()%!]"""
    NON_ASCII = "[^\x00-\x7F]"
    PUNCT = r'[.?\-",]'
    CONSONANT_4 = "[bBcCdDfFgGhHjJkKlLmMnNpPqQrRsStTvVwWxXyYzZ]{4}"
    VOWEL_4 = "[aAeEiIoOuU]{4}"

    def __init__(self, args):
        """Store the arguments, validate the requested action, load the index.

        Args:
            args: Parsed arguments or ``None``.  When given, ``args.action[0]``
                is the requested action; ``FilterNoiseFromFile`` additionally
                reads ``args.file`` and ``args.idf_factor``.
        """
        self.args = args

        # Names of the files written by FilterNoiseFromFile.
        self.output_filename = "ut_filtered.txt"
        self.output_filename_regex = "ut_filtered_regex.txt"
        self.output_filename_idf = "ut_filtered_idf.txt"
        self.output_filename_noise = "ut_noise.txt"
        self.output_filename_noise_regex = "ut_noise_regex.txt"
        self.output_filename_noise_idf = "ut_noise_idf.txt"

        # Results of the most recent FilterNoise call.
        self.unfiltered_terms = []
        self.filtered_terms_regex = []
        self.filtered_terms_idf = []
        self.noise_terms_regex = []
        self.noise_terms_idf = []
        self.combined_filtered_terms = []
        self.combined_noise_terms = []

        # Always define ACTION so PerformAction is safe to call even when the
        # instance was built without arguments.
        self.ACTION = None
        if args is not None:
            if args.action[0] not in self.ACTIONS:
                error("Action not recognized, try: %s" % ', '.join(self.ACTIONS))
            self.ACTION = args.action[0]
        else:
            print("No args supplied to NoiseFilter.")

        # Create an instance of the indexer and load its indexes; the IDF
        # filter reads its data from this object.
        self.indexer = Indexer()
        self.indexer.LoadIndexes()

    def PerformAction(self):
        """Run the action selected at construction time (no-op if none)."""
        if self.ACTION == 'filter':
            self.FilterNoiseFromFile(self.args.file)

    @staticmethod
    def _write_terms(path, terms):
        """Write ``terms`` to ``path``, one term per line (file is truncated)."""
        with open(path, 'w') as outputfile:
            outputfile.writelines(term + '\n' for term in terms)

    def FilterNoiseFromFile(self, fname):
        """Filter the terms listed in ``fname`` and write all result files.

        The term is the first tab-separated field of each stripped line;
        duplicates are collapsed.  ``self.args.idf_factor`` is used as the IDF
        factor.

        Args:
            fname: Path of the input file.
        """
        # make sure file exists
        if not os.path.isfile(fname):
            error("Provided file does not exist?")

        with open(fname) as fd:
            unfiltered_terms = {line.strip().split('\t')[0] for line in fd}

        self.FilterNoise(unfiltered_terms, self.args.idf_factor)

        outputs = (
            (self.output_filename_regex, self.filtered_terms_regex),
            (self.output_filename_noise_regex, self.noise_terms_regex),
            (self.output_filename_idf, self.filtered_terms_idf),
            (self.output_filename_noise_idf, self.noise_terms_idf),
            (self.output_filename, self.combined_filtered_terms),
            (self.output_filename_noise, self.combined_noise_terms),
        )
        for path, terms in outputs:
            self._write_terms(path, terms)

    @classmethod
    def _is_regex_noise(cls, term):
        """Return True when ``term`` is rejected by the regex/length filter.

        A term is noise when any of the following holds:
          1. it is shorter than MIN_TERM_LENGTH or longer than MAX_TERM_LENGTH
          2. it contains a non-ASCII character
          3. it contains a special character (SPECIAL)
          4. it consists only of digits
          5. it has more punctuation characters (PUNCT) than other characters
          6. it contains four or more consecutive vowels, or four or more
             consecutive consonants
        """
        if not cls.MIN_TERM_LENGTH <= len(term) <= cls.MAX_TERM_LENGTH:
            return True
        for pattern in (cls.NON_ASCII, cls.SPECIAL, cls.ONLYNUM):
            if re.search(pattern, term) is not None:
                return True
        punct_count = len(re.findall(cls.PUNCT, term))
        if punct_count > len(term) - punct_count:
            return True
        return (re.search(cls.VOWEL_4, term) is not None
                or re.search(cls.CONSONANT_4, term) is not None)

    def _idf_base(self):
        """Return log(number of indexed tweets), or 100.0 when there are none."""
        doccount = len(self.indexer.index_tweets)
        if doccount > 0:
            return math.log(float(doccount))
        print("Tried to take the log of a <= 0 doccount! Was: ", doccount)
        return 100.0

    def FilterNoise(self, unfiltered_input, idf_factor):
        """Split ``unfiltered_input`` into valid terms and noise.

        Args:
            unfiltered_input: Iterable of term strings.
            idf_factor: Multiplier applied to the IDF base; a term passes the
                IDF filter when its IDF is <= ``idf_factor * idf_base``.

        Returns:
            The combined filtered terms, i.e. ``intersect`` of the terms kept
            by the regex filter and the terms kept by the IDF filter.  All
            intermediate lists are stored on the instance as well.
        """
        self.unfiltered_terms = list(unfiltered_input)
        self.filtered_terms_regex = []
        self.filtered_terms_idf = []
        self.noise_terms_regex = []
        self.noise_terms_idf = []
        self.combined_filtered_terms = []
        self.combined_noise_terms = []

        # The threshold does not depend on the term, so compute it once per
        # call instead of once per term.
        threshold_idf = idf_factor * self._idf_base()

        for term in self.unfiltered_terms:
            if self._is_regex_noise(term):
                self.noise_terms_regex.append(term)
            else:
                self.filtered_terms_regex.append(term)

            # Terms whose IDF is at or below the threshold can be valid UTs,
            # anything above it is treated as noise.
            idf = self.indexer.GetIDFForTerm(term)
            if idf <= threshold_idf:
                self.filtered_terms_idf.append(term)
            else:
                self.noise_terms_idf.append(term)

        self.combined_filtered_terms = intersect(self.filtered_terms_regex,
                                                 self.filtered_terms_idf)
        self.combined_noise_terms = diff(self.unfiltered_terms,
                                         self.combined_filtered_terms)

        print('Input Terms: ' + str(len(self.unfiltered_terms)))
        print('Unidentified Terms Regex: ' + str(len(self.filtered_terms_regex)))
        print('Noisy Terms Regex: ' + str(len(self.noise_terms_regex)))
        print('Unidentified Terms IDF: ' + str(len(self.filtered_terms_idf)))
        print('Noisy Terms IDF: ' + str(len(self.noise_terms_idf)))
        print('Combined Unidentified Terms: ' + str(len(self.combined_filtered_terms)))
        print('Combined Noisy Terms: ' + str(len(self.combined_noise_terms)))

        # This is the list we use as a result.
        return self.combined_filtered_terms