def __init__(self, localDownloadQueue="PendingDownloadQueue"):
    Base.__init__(self)
    self.download_queue = localDownloadQueue
    self.ftp_sync = FileSyncer()
    self.move_file_into_processing()
    Extractor(self.local_directory_to_sync)
    Cleaner(self.local_directory_to_sync)
def get_recommendations(self):
    cleaner = Cleaner()
    sg = SearchGenerator(self.url)
    words = self.dictionary.doc2bow(sg.get_cleancontent().split())
    print("Top words identified: ")
    for word in words:
        print("{} {}".format(word[0], self.dictionary[word[0]]))
    query_vector = self.lda[words]
    sims = self.get_similarity(self.lda, query_vector)
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    idx = 0
    pids = []
    result = 10
    recommendation = []
    page_ids = self.df['ID'].to_list()
    print("\nCheck out the links below:")
    # Walk the similarity ranking until 10 distinct pages have been collected
    while result > 0:
        pageid = page_ids[sims[idx][0]]
        if pageid not in pids:
            pids.append(pageid)
            print("{}".format(self.df[self.df['ID'] == pageid]['URL'].values[0]))
            recommendation.append(self.df[self.df['ID'] == pageid]['URL'].values[0])
            result -= 1
        idx += 1
    return recommendation
def test_cleaner_age_valid_Int(self):
    clean = Cleaner()
    test_data = 99
    expected_result = 99
    actual_result = clean.Clean_Age(test_data)[0]
    self.assertEqual(actual_result, expected_result,
                     "actual_result should equal " + str(expected_result))

def test_cleaner_birthday_Invalid_3_response1(self):
    clean = Cleaner()
    test_data = "hello-break-me"
    expected_result = None
    actual_result = clean.Clean_Birthday(test_data)[0]
    self.assertEqual(actual_result, expected_result,
                     "actual_result should equal " + str(expected_result))
def get_text(self):
    words = self.plainTextEdit.toPlainText()
    cleaner = Cleaner()
    words2 = cleaner.edit_bulk_comments(words)
    for item in words2:
        self.textEdit.append(item)
def test_cleaner_birthday_Invalid_3_response2(self):
    clean = Cleaner()
    test_data = "23-11-99"
    expected_result = "The year needs to be in the full format eg: 2009"
    actual_result = clean.Clean_Birthday(test_data)[1]
    self.assertEqual(actual_result, expected_result,
                     "actual_result should equal " + str(expected_result))

def test_cleaner_birthday_valid_2(self):
    clean = Cleaner()
    test_data = "25-11-1991"
    expected_result = "25-11-1991"
    actual_result = clean.Clean_Birthday(test_data)[0]
    self.assertEqual(actual_result, expected_result,
                     "actual_result should equal " + str(expected_result))

def test_cleaner_age_invalid(self):
    clean = Cleaner()
    test_data = "nine"
    expected_result = None
    actual_result = clean.Clean_Age(test_data)[0]
    self.assertEqual(actual_result, expected_result,
                     "actual_result should equal " + str(expected_result))
def __init__(self, dirPath, binsNum):
    self.binsNum = binsNum
    self.dirPath = dirPath
    self.m_estimate = 2
    self.loadStructure()
    try:
        self.df = pd.read_csv(self.dirPath + "/train.csv")
    except IOError:
        tkMessageBox.showerror(
            "Naive Bayes Classifier - Error",
            "There is a problem opening " + self.dirPath + "/train.csv")
    self.cleaner = Cleaner(self)
    self.naiveBases = {}  # maps attribute value + classification to its smoothed probability
    self.cProb = {}
    for (i, record) in self.df.iterrows():
        recordDic = record.to_dict()
        for attribute in recordDic:
            value = recordDic[attribute]
            c = recordDic["class"]
            n_c = len(self.df.loc[((self.df[attribute] == value) &
                                   (self.df["class"] == c))].index)
            n = len(self.df.loc[(self.df["class"] == c)].index)
            m = self.m_estimate
            M = len(self.structure[attribute])
            p = float(1) / M
            # m-estimate smoothing: (n_c + m * p) / (n + m)
            naiveBase = float(n_c + m * p) / (n + m)
            self.naiveBases[attribute + str(value) + c] = naiveBase
    for c in self.structure["class"]:
        self.cProb[c] = float(
            len(self.df.loc[(self.df["class"] == c)].index)) / len(self.df.index)
    tkMessageBox.showinfo("Naive Bayes Classifier - Success",
                          "Building classifier using train-set is done!")
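As a quick sanity check of the m-estimate above, here is a worked instance with made-up counts:

# Worked m-estimate example (made-up counts, not from the training data):
# n_c = 3 records share this attribute value and class, n = 10 records
# have the class, m = 2, and the attribute has M = 4 possible values.
n_c, n, m, M = 3, 10, 2, 4
p = 1.0 / M                          # uniform prior over the 4 values
estimate = (n_c + m * p) / (n + m)   # (3 + 0.5) / 12
print(round(estimate, 4))            # 0.2917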
def read_emails(self, path):
    # Get all files in the directory
    files = [f for f in listdir(path) if isfile(join(path, f))]
    try:
        del files[files.index('DS_Store')]
    except ValueError:
        pass
    reader = WordListCorpusReader(path, files)
    cleaner = Cleaner()
    emails = list()
    # Create an Email object out of each email file and append it to the list
    for file_id in reader.fileids():
        with open(path + file_id, 'r') as current_file:
            cleaned_contents = cleaner.clean_file(current_file.read())
            split_email_header, split_email_body, split_email_file_id = self.divide(
                cleaned_contents, file_id)
            emails.append(
                Email(split_email_header, split_email_body, split_email_file_id))
    # Return the list of Email objects
    return emails
def test_cleaner_bmi(self):
    clean = Cleaner()
    test_data = 'normal'
    expected_result = 'Normal'
    actual_result = clean.clean_bmi(test_data)
    self.assertEqual(actual_result, expected_result,
                     "actual_result should equal " + expected_result)

def test_cleaner_bmi_2(self):
    clean = Cleaner()
    test_data = 'UNDERWEIGHT'
    expected_result = 'Underweight'
    actual_result = clean.clean_bmi(test_data)
    self.assertEqual(actual_result, expected_result,
                     "actual_result should equal " + expected_result)

def test_cleaner_bmi_3(self):
    clean = Cleaner()
    test_data = 'overweight'
    expected_result = 'Overweight'
    actual_result = clean.clean_bmi(test_data)
    self.assertEqual(actual_result, expected_result,
                     "actual_result should equal " + expected_result)

def test_cleaner_gender_4(self):
    clean = Cleaner()
    test_data = 'f'
    expected_result = 'F'
    actual_result = clean.clean_gender(test_data)
    self.assertEqual(actual_result, expected_result,
                     "actual_result should equal " + expected_result)

def test_cleaner_empid_2(self):
    clean = Cleaner()
    test_data = 'a102'
    expected_result = 'A102'
    actual_result = clean.clean_empid(test_data)
    self.assertEqual(actual_result, expected_result,
                     "actual_result should equal " + expected_result)

def test_cleaner_bmi_4(self):
    clean = Cleaner()
    test_data = 'OBEsity'
    expected_result = 'Obesity'
    actual_result = clean.clean_bmi(test_data)
    self.assertEqual(actual_result, expected_result,
                     "actual_result should equal " + expected_result)
class Validator(object):
    clean = Cleaner()

    def val_empid(self, data):
        # A valid employee ID is one letter followed by three digits, e.g. 'A102'
        data = self.clean.clean_empid(data)
        if len(data) == 4:
            if not data[0].isalpha():
                return False
            for x in data[1:]:
                if not x.isdigit():
                    return False
            return True
        else:
            return False

    def val_gender(self, data):
        data = self.clean.clean_gender(data)
        if data == "M" or data == "F":
            return True
        else:
            return False

    def val_age(self, data):
        self.clean.clean_age(data)
        return True

    def Validate_Sales(self, Given_Sales):
        # Check that the sales figure is within range (three digits)
        pattern = re.compile(r'\d{3}')
        if pattern.match(Given_Sales):
            return True
        else:
            return False

    def val_bmi(self, data):
        data = self.clean.clean_bmi(data)
        if data in ('Normal', 'Overweight', 'Obesity', 'Underweight'):
            return True
        else:
            return False

    def Validate_Salary(self, Given_Salary):
        pattern = re.compile(r'[0-9]{2,3}')
        if pattern.match(Given_Salary):
            return True
        return False

    def val_birthday(self, data):
        self.clean.clean_birthday(data)
        return True
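A minimal usage sketch for this Validator (the sample records are hypothetical; it assumes the case-normalizing Cleaner behavior exercised in the tests above):

# Hypothetical inputs; Cleaner and Validator imports are assumed.
val = Validator()
print(val.val_empid("a102"))      # True: cleaned to 'A102', letter + 3 digits
print(val.val_gender("f"))        # True: cleaned to 'F'
print(val.val_bmi("OBEsity"))     # True: cleaned to 'Obesity'
print(val.Validate_Sales("250"))  # True: matches r'\d{3}'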
def __init__(self, url):
    self.res_dict = {
        'Title': [],
        'Content': [],
        'Title + Content': [],
        'URL': [],
        'ID': []
    }
    self.url = url
    self.sg = SearchGenerator(self.url)
    self.search_terms = np.asarray(self.sg.extract_keywords())
    self.df = ''
    self.cleaner = Cleaner()
def read_file_txt(self, all_my_employees):
    with open("test_data_txt.txt", "r") as file:
        data = file.readlines()
    clean = Cleaner()
    val = Validator()
    for line in data:
        valid = True
        emp = line.split(",")
        empid = clean.clean_empid(emp[0])
        if val.val_empid(all_my_employees, empid)[0] == False:
            valid = False
            print("empid")
        gender = clean.clean_gender(emp[1])
        if val.val_gender(gender)[0] == False:
            valid = False
            print("gender")
        age = clean.Clean_Age(emp[2])
        if val.Validate_Age(age[0])[0] == False:
            valid = False
            print("age")
        sales = emp[3]
        bmi = clean.clean_bmi(emp[4])
        if val.val_bmi(bmi)[0] == False:
            valid = False
            print("bmi")
        salary = emp[5]
        # there is an issue with the validation of the test data's birthdays
        birthday = clean.Clean_Birthday(emp[6])
        # if val.Validate_Birthday(birthday, age[0] + 1)[0]:
        #     pass
        # else:
        #     valid = False
        #     print("birthday")
        if valid:
            employee = Employee(empid, gender, age[0], sales, bmi, salary,
                                birthday[0])
            all_my_employees[empid] = employee
        else:
            print("Failed to add employee")
    return all_my_employees
def __init__(self, queryFile, queryJSON):
    # Initialize the cleaner object
    self._cleaner = Cleaner(" ", " ")
    # txt file in which all queries are stored
    self._qFile = queryFile
    # json file to store the queries after cleaning
    self._qJson = queryJSON
    # list to store raw queries
    self._queryList = list()
    # dict to store refined queries
    self._queryDict = dict()
    # stop-word list
    self._stopList = list()
    # query ID initialized to 1
    self._qID = 1
class Controller(object):
    val = Validator()
    clean = Cleaner()

    def test_empid(self):
        data = "a001"
        print(self.clean.clean_empid(data))
        print(self.val.val_empid(self.clean.clean_empid(data)))

    def test_gender(self):
        data = 'lbp'
        print(self.val.val_gender(data))

    def test_bmi(self):
        data = 'normal'
        print(self.clean.clean_bmi(data))
        print(self.val.val_bmi(self.clean.clean_bmi(data)))
def get_important_words(self, emails, path=None):
    cleaner = Cleaner()
    complete_email_text = ''
    for email in emails:
        email_header = cleaner.delete_tags(email.header)
        email_body = cleaner.delete_tags(email.body)
        topic_line = re.findall(r'Topic.*\n', email_header)[0]
        topic_line = topic_line[6:].strip()
        complete_email_text = complete_email_text + topic_line + '\n' + email_body + '\n'
    # Clean the text: collapse newlines and whitespace runs into single spaces
    complete_email_text = re.sub('\n', ' ', complete_email_text)
    complete_email_text = re.sub(r'\s', ' ', complete_email_text)
    complete_email_text = re.sub(' +', ' ', complete_email_text)
    complete_email_text = tb(complete_email_text)
    bloblist = [complete_email_text]
    words = []
    # Score every word by tf-idf and collect them in descending order
    for i, blob in enumerate(bloblist):
        scores = {word: self.tfidf(word, blob, bloblist)
                  for word in blob.words}
        sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        for word, score in sorted_words:
            words.append(word)
    # Delete stop-words
    words = self.delete_stopwords(words)
    if path is not None:
        with open(path, 'w') as current_file:
            for word in words:
                current_file.write('{}\n'.format(word))
    return words
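The self.tfidf helper is not shown in this snippet; a minimal sketch of what it presumably computes, following the common TextBlob tf-idf recipe (the exact implementation is an assumption):

import math

# Assumed tf-idf helpers: term frequency within a blob times inverse
# document frequency across the blob list.
def tf(word, blob):
    return blob.words.count(word) / len(blob.words)

def n_containing(word, bloblist):
    return sum(1 for blob in bloblist if word in blob.words)

def idf(word, bloblist):
    return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))

def tfidf(word, blob, bloblist):
    return tf(word, blob) * idf(word, bloblist)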
def XML_Reader(self, file_Location, error_File_Location, all_employees):
    # TODO: add validation
    MyCleaner = Cleaner()
    my_Employees = {}
    tree = ET.parse(file_Location)
    root = tree.getroot()
    for user in root.findall('user'):
        # Need to validate all given data
        empid = user.get('EMPID')
        gender = user.find('gender').text
        age = user.find('age').text
        sales = user.find('sales').text
        bmi = user.find('BMI').text
        salary = user.find('salary').text
        birthday = user.find('birthday').text
        # Each item in the new Employee needs to be checked for a value after being validated
        new_Employee = Employee(MyCleaner.clean_empid(empid),
                                MyCleaner.clean_gender(gender),
                                MyCleaner.Clean_Age(age)[0],
                                int(sales),
                                MyCleaner.clean_bmi(bmi),
                                int(salary.replace(',', '')),
                                MyCleaner.Clean_Birthday(birthday)[0])
        my_Employees[new_Employee.my_empid] = new_Employee
        # clean_Data = True
        # for item in [empid, gender, age, sales, bmi, salary, birthday]:
        #     if item[0] == False:
        #         clean_Data = False
        # if clean_Data:
        #     new_Employee = Employee(MyCleaner.clean_empid(empid), MyCleaner.clean_gender(gender), MyCleaner.Clean_Age(age), MyValidator.Validate_Sales(sales), MyCleaner.clean_bmi(bmi), MyValidator.Validate_Salary(salary), MyCleaner.Clean_Birthday(birthday))
        #     my_Employees[new_Employee.EMPID] = new_Employee
        # else:
        #     # write to error log file
        #     pass
    return my_Employees
def evaluate_tags(gr_email, pred_email):
    cleaner = Cleaner()
    regex = {
        'time': r'<[se]time>.*?</[se]time>',
        'speaker': r'<speaker>.*?</speaker>',
        'location': r'<location>.*?</location>',
        'sentence': r'<sentence>.*?</sentence>',
        'paragraph': r'<paragraph>.*?</paragraph>'
    }
    # Ground-truth email tags
    gr_email_header = gr_email.header
    gr_email_body = gr_email.body
    gr_email = gr_email_header + gr_email_body
    gr_email_tags = {}
    # Strip newlines
    gr_email = gr_email.replace('\n', '')
    for k in regex.keys():
        gr_email_tags[k] = re.findall(regex[k], gr_email, re.MULTILINE)
        for i in range(0, len(gr_email_tags[k])):
            gr_email_tags[k][i] = cleaner.clean_file(gr_email_tags[k][i])
    # Predicted email tags
    pred_email_header = pred_email.header
    pred_email_body = pred_email.body
    pred_email = pred_email_header + pred_email_body
    pred_email_tags = {}
    # Strip newlines
    pred_email = pred_email.replace('\n', '')
    for k in regex.keys():
        pred_email_tags[k] = re.findall(regex[k], pred_email, re.M)
        for i in range(0, len(pred_email_tags[k])):
            pred_email_tags[k][i] = cleaner.clean_file(pred_email_tags[k][i])
    tp = 0
    fp = 0
    fn = 0
    # Change gr_email_tags.keys() to ['key'] to evaluate a specific tag
    for k in gr_email_tags.keys():
        gr = gr_email_tags[k]
        pred = pred_email_tags[k]
        # Remove all punctuation and spaces from both email tag lists
        for i in range(0, len(gr)):
            gr[i] = re.sub(r'[^\w\s]', '', gr[i])
            gr[i] = re.sub(' ', '', gr[i])
        for i in range(0, len(pred)):
            pred[i] = re.sub(r'[^\w\s]', '', pred[i])
            pred[i] = re.sub(' ', '', pred[i])
        # Count TP, FP, FN: matched predictions are removed so they count once
        for t in gr:
            if t in pred:
                tp = tp + 1
                pred.remove(t)
            else:
                fn = fn + 1
        fp = fp + len(pred)
    return tp, fp, fn
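The counts returned above feed the standard precision/recall definitions; a follow-up sketch (the email_pairs iterable and the aggregation loop are assumptions, not part of the original):

# Aggregate counts over a corpus of (ground-truth, predicted) email pairs
# and derive precision, recall, and F1. `email_pairs` is hypothetical.
total_tp = total_fp = total_fn = 0
for gr_email, pred_email in email_pairs:
    tp, fp, fn = evaluate_tags(gr_email, pred_email)
    total_tp += tp
    total_fp += fp
    total_fn += fn

precision = total_tp / (total_tp + total_fp) if total_tp + total_fp else 0.0
recall = total_tp / (total_tp + total_fn) if total_tp + total_fn else 0.0
f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0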
import time

from Analyzer import Analyzer
from Cleaner import Cleaner
from Cluster import Cluster
from Crawler import Crawler
from Uploader import Uploader

this_date = time.strftime("%Y%m%d", time.localtime())

# Crawl news articles
crawler = Crawler(this_date=this_date)
crawler.crawl()

# Cluster the articles
cluster = Cluster(date=this_date)
cluster.remove_useless_articles()
cluster.load_articles()
cluster.cluster()
cluster.upload_groups_to_DB()

# Sentiment analysis
analyzer = Analyzer(date=this_date)
analyzer.analyze()

# Upload to LeanCloud
uploader = Uploader(date=this_date)
uploader.upload_new_groups()

# Delete news groups that are too old or scored too low
cleaner = Cleaner(date=this_date)
cleaner.clean()
categories = [
    "alt.atheism", "soc.religion.christian", "sci.med", "comp.graphics"
]
cate2 = [
    "comp.graphics", "comp.os.ms-windows.misc", "comp.sys.ibm.pc.hardware",
    "comp.sys.mac.hardware", "comp.windows.x"
]
twenty_train = fetch_20newsgroups(subset="train", categories=cate2, shuffle=True)
twenty_test = fetch_20newsgroups(subset="test", categories=cate2, shuffle=True)

# Cleaning the data set
truck_cleaner = Cleaner()
truck_cleaner.get_data_category_count(twenty_train)
cleaner_text = truck_cleaner.text_header_remover(twenty_train.data)

# Preparing the dataset
import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
from gensim.models import Word2Vec
from nltk.corpus import stopwords


def tokenizer_helper(cleaner_text_list):
    tokenize_sentences_list = []
    for sentence in cleaner_text_list:
        # The original snippet is truncated here; a minimal completion that
        # tokenizes each cleaned sentence into a word list:
        tokenize_sentences_list.append(nltk.word_tokenize(sentence))
    return tokenize_sentences_list
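The Word2Vec import is never exercised in this snippet; a plausible continuation, assuming gensim 4.x (where the embedding size parameter is named vector_size) and the stop-word filtering hinted at by the nltk downloads, would be:

# Hypothetical continuation: drop stop-words from the tokenized sentences
# and train Word2Vec on them (both steps are assumptions, not in the original).
stop_words = set(stopwords.words('english'))
sentences = [[w for w in s if w.lower() not in stop_words]
             for s in tokenizer_helper(cleaner_text)]
w2v_model = Word2Vec(sentences=sentences, vector_size=100, window=5,
                     min_count=2, workers=4)
print(w2v_model.wv.most_similar('graphics', topn=5))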
n_starting_triplets = 0
n_total_triplets = 0
# Counters assumed to be initialized before the loop (not shown in the original snippet)
n_irrelevant_comments = n_not_marked = n_non_latin = 0
n_before_equals_after = n_non_english = 0
n_too_long = n_too_long_after = n_multiple_rev = 0

files = [f for f in os.listdir("./processed")]
for x in range(0, len(files)):
    if x > 1:
        print("completed: ", round((x * 100) / len(files), 1), "% ", end='\r')
    file_name = os.path.join("./processed", files[x])
    try:
        df = pd.read_csv(filepath_or_buffer=file_name, index_col=0,
                         dtype=str, na_filter=False)
        # df = df.drop(["id_df"], axis=1)
        n_starting_triplets += len(df)
        cleaner = Cleaner(df, t5_tokenizer, stopwords, english_cache)
        cleaner.remove_non_marked()
        cleaner.clean_df()
        # Final cleaning: remove methods that have more than one review
        cleaner.remove_multiple_method_comments()
        n_irrelevant_comments += cleaner.irrelevant_comments
        n_not_marked += cleaner.not_marked
        n_non_latin += cleaner.non_latin
        n_before_equals_after += cleaner.before_equals_after
        n_non_english += cleaner.non_english
        n_too_long += cleaner.too_long
        n_too_long_after += cleaner.too_long_after
        n_multiple_rev += cleaner.multiple_reviews
from argparse import ArgumentParser

from Cleaner import Cleaner
from Spanbert import SpanBert
from entitycentric import entity_centric_segmentation

cleaner = Cleaner()
model = SpanBert()


def write(f, text, tar, sent):
    # Write a (text, target, sentiment) record on three consecutive lines
    f.write(text)
    f.write("\n")
    f.write(tar)
    f.write("\n")
    f.write(sent)
    f.write("\n")


def segmentation(text, tar):
    text = text.replace("$T$", tar)
    text = cleaner.clean(text)
    clusters = model.predict(text)
    tokens = model.get_tokens()
    # The call is truncated in the original snippet; closed minimally here
    return entity_centric_segmentation(clusters, tokens, tar,
                                       min_dist=15,
                                       anaphora_only=False)
def get_cleantext(self, text):
    cleaner = Cleaner()
    cleaned = cleaner.clean_text(text)
    return cleaned
def testClean():
    cleaner = Cleaner("Eli")
    # Use == for string comparison; `is` tests identity, not equality
    assert cleaner.clean == "Eli is cleaning"