def featurize(review):
    featurized_review = defaultdict(int)
    # bag_of_words(featurized_review, review)
    # bigrams(featurized_review, review)
    # stars(featurized_review, review)
    stem(featurized_review, review)
    # pos_tag(featurized_review, review)
    # preprocessed_bag_of_words(featurized_review, review)
    # service_vs_food(featurized_review, review)
    return featurized_review
def format(line):  # ------------------------------------------------------- FORMAT
    stopwords = nltk.corpus.stopwords.words('english')  # list of stopwords
    useless = ["would", "could", "in", "use"]
    real = list()
    listOfWords = []
    text = line["content"]
    text = text.str.strip('"')  # eliminate quotes
    text = text.str.split()     # cut stop words
    for word in text:
        for realWord in word:
            listOfWords.append(realWord.lower())
    real += [word for word in listOfWords
             if word not in stopwords and word not in useless]
    text = sorted(real)
    if VERSION < 4:
        line[1] = lem(line[1])
        line[1] = stem(line[1])
        text = lem(text)
    return text
def get_vocabulary():
    training_files = os.listdir(os.getcwd() + "/train/spam")
    for i in range(len(training_files)):
        training_files[i] = os.getcwd() + "/train/spam/" + training_files[i]
    training_ham = os.listdir(os.getcwd() + "/train/ham/")
    for i in range(len(training_ham)):
        training_ham[i] = os.getcwd() + "/train/ham/" + training_ham[i]
    training_files.extend(training_ham)
    v_word = list()
    for f_name in training_files:
        # print('a')
        if f_name.split('.')[-1] == "txt":
            # print('b')
            f = open(f_name, "r")
            # print(f_name)
            # print(f.read())
            lines = stem(f.read())
            words = set(lines)
            # print(words)
            for w in words:
                if w not in v_word:
                    v_word.append(w)
    return v_word
def train():
    data = {}
    total_spam = 0
    total_ham = 0
    nof_spam = 0
    nof_ham = 0
    for filename in os.listdir("./ham_train"):
        file = open('./ham_train' + '//' + filename, errors='ignore')
        wordcount = Counter(file.read().split())
        for item in wordcount.items():
            if item[0] in data:
                data[item[0]][1] += item[1]
            else:
                data[item[0]] = [0, item[1]]
            total_ham += item[1]
        nof_ham += 1
    for filename in os.listdir('./spam_train'):
        file = open('./spam_train' + '//' + filename, errors='ignore')
        wordcount = Counter(file.read().split())
        for item in wordcount.items():
            if item[0] in data:
                data[item[0]][0] += item[1]
            else:
                data[item[0]] = [item[1], 0]
            total_spam += item[1]
        nof_spam += 1
    data = stem(data)
    return data
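# Usage sketch for train() above (an addition, not part of the original module):
# it assumes ./ham_train and ./spam_train directories of plain-text mails exist,
# and that the external stem() helper keeps the {token: [spam_count, ham_count]}
# layout built above.
if __name__ == "__main__":
    counts = train()
    # each value is [occurrences in spam, occurrences in ham] for that token
    print(counts.get("money", [0, 0]))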
def test():
    data = {}
    total_spam = 0
    total_ham = 0
    nof_spam = 0
    nof_ham = 0
    for filename in os.listdir('./ham_test'):
        file = open('./ham_test' + '//' + filename, errors='ignore')
        wordcount = Counter(file.read().split())
        for item in wordcount.items():
            if item[0] in data:
                data[item[0]][1] += item[1]
            else:
                data[item[0]] = [0, item[1]]
            total_ham += item[1]
        nof_ham += 1
    for filename in os.listdir('./spam_test'):
        file = open('./spam_test' + '//' + filename, errors='ignore')
        wordcount = Counter(file.read().split())
        for item in wordcount.items():
            if item[0] in data:
                data[item[0]][0] += item[1]
            else:
                data[item[0]] = [item[1], 0]
            total_spam += item[1]
        nof_spam += 1
    # print(nof_spam)
    data = stem(data)
    data_new, ps, ph = cond.Prob(data, total_spam, total_ham, nof_spam, nof_ham)
    return data_new, ps, ph
def clean(link):
    """Apply every preprocessing operation to the article at the given link."""
    text = get_Article(link)
    tokens = tokenize(text)
    stemmed_token = stem(tokens)
    big_words = particle_removal(stemmed_token)
    print(big_words)
def stem(tokens, stemmer):
    if stemmer == 'porter':
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(i) for i in tokens]
    elif stemmer == 'porter2':
        # note: this expects a single-argument Porter2 stem() to be in scope
        # (e.g. stemming.porter2.stem); as written it would otherwise recurse
        # into this two-argument function with a missing argument
        tokens = [stem(i) for i in tokens]
    elif stemmer == 'lemma':
        lemmatiser = WordNetLemmatizer()
        tokens = [lemmatiser.lemmatize(i) for i in tokens]
    return tokens
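# Minimal usage sketch for stem() above (an addition, not from the original
# file): assumes nltk with the WordNet data is installed and that
# PorterStemmer / WordNetLemmatizer are available at module level.
if __name__ == "__main__":
    from nltk.stem import PorterStemmer, WordNetLemmatizer
    sample = ["running", "studies", "better"]
    print(stem(sample, "porter"))  # e.g. ['run', 'studi', 'better']
    print(stem(sample, "lemma"))   # e.g. ['running', 'study', 'better']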
def evaluate(m, f_name, train_flag, expected):
    # tokens = stem(open(f_name).read())
    text = stem(open(f_name, "r").read())
    tc = list()
    for p in string.punctuation:
        while p in text:
            text[text.index(p)] = ' '
    text = "".join("".join(text).split())
    summed = 0.0
    for i in range(len(m[0])):
        temp = text.count(m[0][i])
        summed = summed + (m[1][i] * temp)
        # if temp > 0.0:
        #     print(temp)
        if train_flag:
            tc.append(temp)
    summed = summed + m[1][-1]  # constant (bias) weight
    # print("Sum: " + str(summed))
    # print(model[0][0])
    # print(str(spam_score) + str(ham_score))
    if train_flag:
        tc.append(1)
    result = 0
    if summed >= 0.0:
        result = 1
    else:
        result = -1
    if train_flag:
        # perceptron-style weight update
        for i in range(len(m[1])):
            # temp = m[1][i]
            m[1][i] = m[1][i] + (learning_rate * tc[i] * (expected - result))
            # if temp - m[1][i] < -0.0001:
            #     print("UP (" + str(temp) + ", " + str(m[1][i]) + ")")
            # elif temp - m[1][i] > 0.0001:
            #     print("DOWN (" + str(temp) + ", " + str(m[1][i]) + ")")
            # else:
            #     print(summed)
    return result
def pre_process(text):
    """
    Preprocesses a given document by removing apostrophes, tokenizing
    (which includes removing punctuation), lowercasing, removing stop words,
    and stemming with the PorterStemmer.

    :param text: String
        The text to be preprocessed.
    :return: String
        The preprocessed text as a whitespace-delimited String containing
        the processed words in their original order.
    """
    text = stem(remove_stop_words(to_lower(tokenize(
        replace_apostrophe(text)))))
    return ' '.join(text)
def bag_of_words(phrase):
    sents = nltk.tokenize.sent_tokenize(phrase)
    words = []
    for sent in sents:
        words += nltk.tokenize.word_tokenize(sent)
    swords = unstop(stem(words))
    bag = {}
    for word in swords:
        if word in bag:
            bag[word] += 1
        else:
            bag[word] = 1
    return bag
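# Usage sketch for bag_of_words() above (an addition, not from the original
# source): the exact keys depend on the external stem() and unstop() helpers,
# but the result is a plain {token: count} dictionary.
if __name__ == "__main__":
    print(bag_of_words("The food was great. Great service too!"))
    # e.g. {'food': 1, 'great': 2, 'servic': 1}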
def useModel(modelPath, test):
    stemmed_no_stopwords_test = stem(removeStopwords(test))
    with open(modelPath, 'rb') as fid:
        model = pickle.load(fid)
    predicted = model.predict(stemmed_no_stopwords_test.data)
    print("My Best Configuration")
    print(metrics.classification_report(
        stemmed_no_stopwords_test.target, predicted,
        target_names=stemmed_no_stopwords_test.target_names))
    print("Macro Average F1: " + str(
        metrics.f1_score(stemmed_no_stopwords_test.target, predicted,
                         average='macro')) + "\n")
def readFiles(fileName):
    docs = {}
    f = open(fileName)
    count = 0
    for line in f.readlines():
        count += 1
        if count % 2 == 0:
            doc = nltk.word_tokenize(line.strip())
            if isLowerCase:
                doc = lowerCase(doc)
            if isStem:
                doc = stem(doc)
            if isRemoveStopWords:
                doc = removeStopWords(doc)
            if isRemovePunctuation:
                doc = removePunctuation(doc)
            if not isUnigram:
                doc = bigram(doc)
            docs[count // 2] = doc  # integer document index
    return docs
def trainModel(training, name):
    # Removed Stopwords + Stemmed Training Data
    stemmed_no_stopwords = stem(removeStopwords(training))
    print("Removed Stopwords + Porter Stemmed")
    classifier = Pipeline([('vect', CountVectorizer()),
                           ('tfidf', TfidfTransformer()),
                           ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                                 alpha=1e-3, n_iter=5,
                                                 random_state=42))])
    classifier = classifier.fit(stemmed_no_stopwords.data,
                                stemmed_no_stopwords.target)
    # Save the classifier
    with open(name, 'wb') as fid:
        pickle.dump(classifier, fid)
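# Usage sketch tying trainModel() and useModel() together (an assumption, not
# part of the original repo): both expect sklearn-style bunches with .data,
# .target and .target_names, e.g. fetch_20newsgroups, and assume the
# removeStopwords()/stem() helpers return bunches with the same attributes.
if __name__ == "__main__":
    from sklearn.datasets import fetch_20newsgroups
    train_set = fetch_20newsgroups(subset='train')
    test_set = fetch_20newsgroups(subset='test')
    trainModel(train_set, 'best_model.pkl')  # fits and pickles the pipeline
    useModel('best_model.pkl', test_set)     # prints the classification report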
def preprocess(data):
    stemmer = PorterStemmer()
    wrds = stem(stemmer, data['text'].lower().split(" "))
    pol = data['pol']
    return (wrds, pol)
def train_clean(self, X, y):
    cleaned_x = []
    cleaned_y = []
    num_utter = len(X)
    token_mismatch_before = []
    token_mismatch_after = []
    blank_utterances = []
    for id in xrange(num_utter):
        utter_x = []
        utter_y = []
        utter_labels_tokens = y[id].split()
        utter_tokens = X[id].split()
        try:
            assert (len(utter_tokens) == len(utter_labels_tokens))
        except:
            token_mismatch_before.append(id + 1)
            continue
        for i in xrange(len(utter_labels_tokens)):
            if not ((utter_labels_tokens[i] in self.kick_labels) or
                    (utter_tokens[i] in self.kick_tokens) or
                    utter_tokens[i].isdigit()):
                try:
                    utter_token = utter_tokens[i]
                    if self.stemm:
                        utter_token = stem(utter_token)
                    if self.lem:
                        wordnet_lemmatizer = WordNetLemmatizer()
                        utter_token = wordnet_lemmatizer.lemmatize(utter_token)
                    try:
                        utter_token_label = self.label_dict[utter_labels_tokens[i]]
                    except:
                        utter_token_label = self.failsafe_label
                    unicode_string = utter_token.decode("utf-8")
                    token_corrected = self.remove_accents(unicode_string)
                    utter_x.append(token_corrected)
                    utter_y.append(utter_token_label)
                except:
                    continue
        sentence_x = ' '.join(utter_x)
        sentence_x = re.sub(r'[@#;,"().*!?:\/\\-]', '', sentence_x)
        sentence_x = re.sub(r'[_\']', '', sentence_x)
        tokens_x = sentence_x.split()
        tokens_y = utter_y
        if len(utter_x) == 0:
            blank_utterances.append(id + 1)
            continue
        if len(tokens_x) != len(tokens_y):
            token_mismatch_after.append(id + 1)
            continue
        cleaned_x.append(tokens_x)
        cleaned_y.append(tokens_y)
    print "Unicode errors...corrected\n" \
          "Token mismatch errors...skipped\n" \
          "%d before; %d after" % (len(token_mismatch_before), len(token_mismatch_after))
    self.log_mismatches(token_mismatch_before, token_mismatch_after, blank_utterances)
    if self.correct_spellings:  # may have become buggy
        cleaned_x = [str(' '.join(x)) for x in cleaned_x]
        cleaned = [cleaned_x, cleaned_y]
        ets = engTextSeparate(cleaned)
        ets_cacs = ets.cacs()
        cleaned_x = ets_cacs
    return cleaned_x, cleaned_y  # list of list of tokens/labels
def ler_arquivo_clg():
    logging.info("Program started!")
    # initialize the word vs. documents array
    words_documents = []
    # read the config file with the input (LEIA) and output (ESCREVE) entries
    config = configparser.RawConfigParser(strict=False, dict_type=MultiOrderedDict)
    logging.info("Reading GLI.CFG")
    config.read(['GLI.CFG'])
    entradas = config.get("DEFAULT", "LEIA")
    saida = config.get("DEFAULT", "ESCREVE")
    stemmer_config = config.get("DEFAULT", "STEMMER")
    if stemmer_config[0] == 'true':
        stemmer = 1
    else:
        stemmer = 0
    logging.info("GLI.CFG has been read")
    logging.info("Reading cfc-2.dtd")
    # read the XML files, validating them against the DTD
    f = codecs.open(r'db\cfc-2.dtd')
    dtd = ET.DTD(f)
    logging.info("cfc-2.dtd read")
    logging.info("Starting reading xml")
    # the original called nltk.stem(word), but nltk.stem is a module, not a
    # callable; a Porter stemmer instance is assumed here
    porter = nltk.stem.PorterStemmer()
    begin_time = time.perf_counter()
    for entrada in entradas:
        # print("printing input file " + entrada)
        logging.info("Reading " + entrada + " xml file")
        root = ET.parse(entrada)
        if dtd.validate(root):
            xmldoc = minidom.parse(entrada)
            itemlist = xmldoc.getElementsByTagName('RECORD')
            for s in itemlist:
                recordnum = s.getElementsByTagName('RECORDNUM')
                recordnum = int(recordnum[0].firstChild.nodeValue)
                abstract = s.getElementsByTagName('ABSTRACT')
                if len(abstract) > 0:
                    text_to_parse = abstract[0].firstChild.nodeValue
                else:
                    extract = s.getElementsByTagName('EXTRACT')
                    if len(extract) > 0:
                        text_to_parse = extract[0].firstChild.nodeValue
                    else:
                        continue
                text_to_parse = text_to_parse.upper()
                text_to_parse = re.sub(r"[^A-Z \']+", " ", text_to_parse)
                text_words = text_to_parse.split()
                for word in text_words:
                    word_found = False
                    for wd in words_documents:
                        if stemmer == 0:
                            if wd.word == word:
                                wd.documents.append(recordnum)
                                word_found = True
                                break
                        else:
                            if wd.word == porter.stem(word):
                                wd.documents.append(recordnum)
                                word_found = True
                                break
                    if not word_found:
                        if stemmer == 0:
                            w = word_document(word)
                            w.documents.append(recordnum)
                            words_documents.append(w)
                        else:
                            w = word_document(porter.stem(word))
                            w.documents.append(recordnum)
                            words_documents.append(w)
                # print(s.attributes['RECORDNUM'].value)
        else:
            logging.info(entrada + " xml file didn't pass DTD validation")
            # print(dtd.error_log.filter_from_errors())
    end_time = time.perf_counter()
    total_time = end_time - begin_time
    logging.info("Inverted list created a list with " + str(len(words_documents)) + " words")
    logging.info("Inverted list made " + str(len(words_documents) / total_time) + " words per second")
    logging.info("Inverted list made " + str(len(entradas) / total_time) + " documents per second")
    logging.info("Writing on csv")
    with open(saida[0], 'w', newline='') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=';', quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        for wd in words_documents:
            spamwriter.writerow([wd.word, wd.documents])
    logging.info("Finished!")
# Use BeautifulSoup to parse
soup = BeautifulSoup(inf, 'html.parser')

# Get the Body
body = get_body(soup)
# print 'Body: ', body

# Get the Title
title = get_title(soup)
# print 'Title: ', title

# Get the Headers
headers = get_headers(soup)

# Get the Bold
bolds = get_bold(soup)

# Tokenize via NLTK
title_tokens = stem(tokenize(title))
if not title_tokens:
    title_tokens = list()
body_tokens = stem(tokenize(body))
if not body_tokens:
    body_tokens = list()
header_tokens = stem(tokenize(headers))
if not header_tokens:
    header_tokens = list()
bold_tokens = stem(tokenize(bolds))
if not bold_tokens:
    bold_tokens = list()

all_tokens = clean_up(remove_unwanted(title_tokens + body_tokens))
strong_tokens = clean_up(
    remove_unwanted(title_tokens + header_tokens + bold_tokens))
def getDiseaseFromSymptom(message, number):
    user_input = message
    letters_only = re.sub("[^a-zA-Z]", " ", user_input)
    lower_case = letters_only.lower()
    words = lower_case.split()
    words = [w for w in words if w not in stopwords.words("english")]
    stemmed_words = [stem(word) for word in words]
    val = getDiseaseFromLocalValues(stemmed_words, number)
    if val != "":
        return val
    symptoms_having_ids = [
        'dizzi', 'weight', 'tired', 'feel', 'heartburn', 'back', 'menstruat',
        'paralysi', 'skin', 'stomach', 'cold', 'miss', 'sleepless', 'eye',
        'droop', 'earach', 'memori', 'nervous', 'hot', 'chest', 'lip', 'nausea',
        'earli', 'headach', 'fever', 'reduc', 'itch', 'swollen', 'burn', 'weak',
        'stuffi', 'sneez', 'sore', 'hiccup', 'vomit', 'wheez', 'fast,',
        'increas', 'tremor', 'cough', 'runni', 'chill', 'palpit', 'short',
        'neck', 'sputum', 'tear', 'abdomin', 'cheek', 'dri', 'anxieti', 'sweat',
        'night', 'unconsciousness,'
    ]
    # note: 'sweat' appears twice below (139 and 138); Python keeps the later
    # value, and some entries in symptoms_having_ids (e.g. 'cold', 'hot') have
    # no id here and would raise a KeyError if matched
    symptom_to_id = {
        'dizzi': 207, 'weight': 23, 'tired': 16, 'feel': 76, 'heartburn': 45,
        'back': 104, 'menstruat': 112, 'paralysi': 140, 'skin': 124,
        'stomach': 179, 'sweat': 139, 'sleep': 52, 'eye': 33, 'droop': 244,
        'earach': 87, 'memori': 235, 'nervous': 114, 'chest': 17, 'lip': 35,
        'nausea': 44, 'earli': 92, 'headach': 9, 'fever': 11, 'appetit': 54,
        'itch': 96, 'swollen': 169, 'burn': 46, 'weak': 56, 'stuffi': 28,
        'sneez': 95, 'sore': 13, 'hiccup': 122, 'vomit': 181, 'wheez': 30,
        'thirst': 40, 'tremor': 115, 'cough': 15, 'runni': 14, 'chill': 175,
        'palpit': 37, 'neck': 136, 'sputum': 64, 'tear': 211, 'abdomin': 10,
        'cheek': 170, 'dri': 273, 'anxieti': 238, 'sweat': 138, 'night': 133,
        'unconsciousness,': 144
    }
    the_real_symptoms_with_ids = list(
        set(symptoms_having_ids).intersection(stemmed_words))
    print(the_real_symptoms_with_ids)
    ids = []
    for i in the_real_symptoms_with_ids:
        ids.append(str(symptom_to_id[i]))
    return getPotentialDiseasesFromIds(ids, number)
def tokenize_and_stem(text, stemmer="lemma", is_english_word=None):
    if is_english_word is None:
        is_english_word = load_from_dictionary("english")
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    return stem(tokenize(text, is_english_word), stemmer)
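# Usage sketch for tokenize_and_stem() above (an addition, not from the
# original file): the result depends on the external tokenize(), stem() and
# load_from_dictionary() helpers, so the shown output is only indicative.
if __name__ == "__main__":
    print(tokenize_and_stem("The cats were running quickly", stemmer="porter"))
    # e.g. ['cat', 'run', 'quickli']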
def explorationResults(training, test):
    # Process training data
    no_stopwords = removeStopwords(training)
    stemmed = stem(training)
    stemmed_no_stopwords = stem(removeStopwords(training))

    # Process test data
    no_stopwords_test = removeStopwords(test)
    stemmed_test = stem(test)
    stemmed_no_stopwords_test = stem(removeStopwords(test))

    # Unigram Baseline
    print("Unigram Baseline")
    classify(
        training, test,
        Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                                        n_iter=5, random_state=42))]),
        [])

    # Removed Stopwords Training Data
    print("Removed Stopwords")
    classify(
        no_stopwords, no_stopwords_test,
        Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                                        n_iter=5, random_state=42))]),
        [])

    # Stemmed Training Data
    print("Porter Stemmed")
    classify(
        stemmed, stemmed_test,
        Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                                        n_iter=5, random_state=42))]),
        [])

    # Removed Stopwords + Stemmed Training Data
    print("Removed Stopwords + Porter Stemmed")
    classify(
        stemmed_no_stopwords, stemmed_no_stopwords_test,
        Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                                        n_iter=5, random_state=42))]),
        [])

    # Univariate Feature Selection
    print("Univariate Feature Selection")
    classify(
        training, test,
        Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('selector', SelectPercentile(chi2, 25)),
                  ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                                        n_iter=5, random_state=42))]),
        [])

    # L2 Regularization
    print("L2 Regularization")
    t = copy.copy(training)
    classify(
        training, test,
        Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('selector', SelectFromModel(LinearSVC(penalty="l2"))),
                  ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                                        n_iter=5, random_state=42))]),
        [])

    # L2 Regularization + Univariate Feature Selection
    print("L2 Regularization + Univariate Feature Selection")
    t = copy.copy(training)
    classify(
        training, test,
        Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('univariate', SelectFromModel(LinearSVC(penalty="l2"))),
                  ('L2', SelectPercentile(chi2, 25)),
                  ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                                        n_iter=5, random_state=42))]),
        [])

    # Univariate Feature Selection + L2 Regularization
    print("Univariate Feature Selection + L2 Regularization")
    t = copy.copy(training)
    classify(
        training, test,
        Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('L2', SelectPercentile(chi2, 25)),
                  ('univariate', SelectFromModel(LinearSVC(penalty="l2"))),
                  ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                                        n_iter=5, random_state=42))]),
        [])

    # Removed Stopwords + Univariate Feature Selection
    print("Removed Stopwords + Univariate Feature Selection")
    classify(
        no_stopwords, no_stopwords_test,
        Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('selector', SelectPercentile(chi2, 25)),
                  ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                                        n_iter=5, random_state=42))]),
        [])

    # Removed Stopwords + L2 Regularization
    print("Removed Stopwords + L2 Regularization")
    classify(
        no_stopwords, no_stopwords_test,
        Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('selector', SelectFromModel(LinearSVC(penalty="l2"))),
                  ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                                        n_iter=5, random_state=42))]),
        [])

    # Stemmed Training Data + Univariate Feature Selection
    print("Porter Stemmed + Univariate Feature Selection")
    classify(
        stemmed, stemmed_test,
        Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('selector', SelectPercentile(chi2, 25)),
                  ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                                        n_iter=5, random_state=42))]),
        [])

    # Stemmed Training Data + L2 Regularization
    print("Porter Stemmed + L2 Regularization")
    classify(
        stemmed, stemmed_test,
        Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('selector', SelectFromModel(LinearSVC(penalty="l2"))),
                  ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                                        n_iter=5, random_state=42))]),
        [])

    # Stemmed Training Data + Removed Stopwords + Univariate Feature Selection
    print("Porter Stemmed + Removed Stopwords + Univariate Feature Selection")
    classify(
        stemmed_no_stopwords, stemmed_no_stopwords_test,
        Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('selector', SelectPercentile(chi2, 25)),
                  ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                                        n_iter=5, random_state=42))]),
        [])