def create_vector():
    web_pages_list = load_web_pages()
    features = []
    labels = []
    idx = 0
    for web_page in web_pages_list:
        try:
            idx += 1
            print(idx)
            with open(web_page, 'r', encoding='utf8') as f:
                # clean page with pre-processor
                formatted_page = clean_page(f.read())
            if 'non-profiles' in web_page:
                labels.append(0)  # 0 for non-profile pages
            elif 'profiles' in web_page:
                labels.append(1)  # 1 for profile pages
            features.append(formatted_page)
        except Exception as e:
            print(e)
    return features, labels
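load_web_pages and clean_page are helpers defined elsewhere in the project. A minimal sketch of what they might look like, under two assumptions: the training pages sit in 'profiles/' and 'non-profiles/' directories (consistent with the labelling above), and the pre-processor deletes header, footer, nav and script tags as described in calculate_relevance_score below:

import glob
from bs4 import BeautifulSoup

def load_web_pages():
    # assumption: the labelled training pages are stored as HTML files
    # under 'profiles/' and 'non-profiles/' directories
    return glob.glob('profiles/*.html') + glob.glob('non-profiles/*.html')

def clean_page(page):
    # strip <header>, <footer>, <nav> and <script> tags and return the
    # remaining visible text (matching the pre-processor comment below)
    soup = BeautifulSoup(page, 'lxml')
    for tag in soup.find_all(['header', 'footer', 'nav', 'script']):
        tag.decompose()
    return soup.get_text(separator=' ', strip=True)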
def calculate_relevance_score(page_text, module_name):
    keywords = [
        "research interest", "research activity", "publication",
        "award", "academic", "record", "profile"
    ]
    # delete <header>, <footer>, <nav>, <script> from the body
    formatted_body = clean_page(page_text)
    bag_of_words = re.split(r" ", formatted_body)
    bag_of_words_count = len(bag_of_words)

    # frequency of the generic profile keywords in the document
    re_keywords = re.compile('|'.join(keywords), re.I)
    keywords_count = len(re_keywords.findall(formatted_body))

    # frequency of the module name (words may be separated by any character)
    module_name_formatted = module_name.split(" ")
    re_module_name = re.compile('.'.join(module_name_formatted), re.I)
    module_name_count = len(re_module_name.findall(formatted_body))

    # frequency of the module name's synonyms from the IT ontology
    onto = ITOntologyManager()
    synonyms = onto.get_synonyms(module_name)
    re_synonyms = re.compile('|'.join(synonyms), re.I)
    synonyms_count = len(re_synonyms.findall(formatted_body))

    # weighted relevance score: module-name hits count most, then synonyms
    keywords_frequency = keywords_count / float(bag_of_words_count)
    synonyms_frequency = synonyms_count / float(bag_of_words_count)
    module_name_frequency = module_name_count / float(bag_of_words_count)
    relevance_score = (keywords_frequency
                       + 10 * synonyms_frequency
                       + 50 * module_name_frequency)
    return relevance_score
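As a quick sanity check with hypothetical numbers: a 1,000-word page containing 4 keyword hits, 2 synonym hits and 1 module-name hit scores 4/1000 + 10 × 2/1000 + 50 × 1/1000 = 0.074, so a single module-name match outweighs many generic keyword matches.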
def parse(self, response):
    # score candidate lecturer pages
    print(response.url)
    synonyms = self.onto.get_synonyms(self.subject_name)
    self.subject_name = self.subject_name.replace("_", " ")
    synonyms.append(self.subject_name)
    count = 0
    page_text = clean_page(response.text)
    page_lower = page_text.lower()
    # only consider pages that mention a research-interest section
    re_in = ['research interest', 'research topics']
    if any(x in page_lower for x in re_in):
        # count occurrences of the subject name and its synonyms
        for word in synonyms:
            word = word.lower()
            count = count + page_lower.count(word)
        if count >= 1:
            item = LecturerPageItem()
            item['url'] = response.url
            item['subject'] = self.subject_name
            yield item
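LecturerPageItem is defined in the spider's items module, which is not shown here; a minimal sketch consistent with the two fields set above:

import scrapy

class LecturerPageItem(scrapy.Item):
    # assumed item definition; only these two fields are used in parse()
    url = scrapy.Field()
    subject = scrapy.Field()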
def check_page(page, module_name):
    onto = ITOntologyManager()
    synonyms = onto.get_synonyms(module_name)
    synonyms.append(module_name)
    count = 0
    page_text = clean_page(page)
    page_lower = page_text.lower()
    # count occurrences of the module name and its synonyms
    for word in synonyms:
        count = count + page_lower.count(word.lower())
    # pattern = re.compile(r'(dr|mr|ms|mrs|prof)\.? ?((ms|mrs)\.?)? ?((\w\.)+)? ?(\w+){1,3}', re.I)
    # locate the "research interest" text node and walk up a few parent tags
    soup = BeautifulSoup(page.lower(), 'lxml')
    re_interest = soup.find(text=re.compile('research interest'))
    parent_tag = re_interest.parent if re_interest else None
    for i in range(0, 5):
        if parent_tag is not None:
            parent_tag = parent_tag.parent
    return count, parent_tag
def predict_web_page(self, page):
    formatted_page = clean_page(page)
    feature_tf = self.vectorizer.transform([formatted_page])
    # class-1 ("profile") probabilities from both classifiers
    svm_probability = self.svm_classifier.predict_proba(feature_tf.toarray())
    mnb_probability = self.mnb_classifier.predict_proba(feature_tf.toarray())
    # combine the two probabilities, weighted by each model's test accuracy
    result = self.weighted_mean_prediction(svm_probability[0, 1],
                                           mnb_probability[0, 1],
                                           svm_acc=0.8, mnb_acc=0.65)
    return result
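weighted_mean_prediction is not shown in this section; a minimal sketch of one plausible implementation, assuming an accuracy-weighted mean of the two class-1 probabilities with a 0.5 decision threshold:

def weighted_mean_prediction(self, svm_prob, mnb_prob, svm_acc, mnb_acc):
    # assumption: weight each classifier's "profile" probability by its
    # measured test accuracy, then apply a 0.5 decision threshold
    combined = (svm_acc * svm_prob + mnb_acc * mnb_prob) / (svm_acc + mnb_acc)
    return 1 if combined >= 0.5 else 0  # 1 = profile, 0 = non-profile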
def count_words():
    web_pages_list = load_web_pages()
    stop_words_set = set(stopwords.words('english'))
    words_list = []
    idx = 0
    for web_page in web_pages_list:
        try:
            idx += 1
            print(idx)
            with open(web_page, 'r', encoding='utf8') as f:
                # clean page with pre-processor
                formatted_page = clean_page(f.read())
            # tokenize words with the nltk tokenizer
            unfiltered_words = word_tokenize(formatted_page)
            # remove stop words and non-alphabetic tokens from the bag of words
            for word in unfiltered_words:
                word = word.lower()
                if word.isalpha() and word not in stop_words_set:
                    words_list.append(word)
        except Exception as e:
            print(e)
    bag_of_words = Counter(words_list)
    print('Words =', len(words_list), '| BoW =', len(bag_of_words))
    print(bag_of_words.most_common())
    # save the BoW to disk
    with open('bag_of_words.mdl', 'wb') as f:
        _pickle.dump(bag_of_words, f)
    print("Saved the BoW")
    # keep only the 5,000 most frequent words as features
    return bag_of_words.most_common(5000)
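The pickled counter can later be restored with the matching load call (a sketch; _pickle is the module already used above):

with open('bag_of_words.mdl', 'rb') as f:
    bag_of_words = _pickle.load(f)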
def create_dataset(bag_of_words):
    web_pages_list = load_web_pages()
    features_set = []
    labels = []
    # iterate over the training dataset
    idx = 0
    for web_page in web_pages_list:
        try:
            idx += 1
            print(idx)
            with open(web_page, 'r', encoding='utf8') as f:
                # clean the web page
                formatted_page = clean_page(f.read())
            # tokenize the page text
            formatted_page_lower = formatted_page.lower()
            words_list = word_tokenize(formatted_page_lower)
            word_occurrence = []
            # count how often each bag-of-words entry occurs in the page
            for word_entry in bag_of_words:
                word_occurrence.append(words_list.count(word_entry[0]))
            # append the word-occurrence vector to the feature set
            features_set.append(word_occurrence)
            if 'non-profiles' in web_page:
                labels.append(0)  # 0 for non-profile pages
            elif 'profiles' in web_page:
                labels.append(1)  # 1 for profile pages
        except Exception as e:
            print(e)
    return features_set, labels
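For context, a sketch of how the bag-of-words pipeline might be wired together (the variable names and the 80/20 split are assumptions; this path is an alternative to the vectorizer-based features used in the training script below):

from sklearn.model_selection import train_test_split

bag_of_words = count_words()                    # top 5,000 (word, count) pairs
features, labels = create_dataset(bag_of_words)
X_train, X_test, y_train, y_test = train_test_split(features, labels,
                                                    test_size=0.2)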
# linear SVM; probability=True enables the predict_proba calls used later
svm_classifier = SVC(kernel='linear', C=2**1, probability=True)
svm_classifier.fit(X_train_tf.toarray(), y_train)
X_test_tf = vectorizer.transform(X_test)
predictions = svm_classifier.predict(X_test_tf.toarray())
print(accuracy_score(y_test, predictions), "\n")

# multinomial Naive Bayes baseline on the same features
mnb_classifier = MultinomialNB()
mnb_classifier.fit(X_train_tf.toarray(), y_train)
mnb_predictions = mnb_classifier.predict(X_test_tf.toarray())
print(accuracy_score(y_test, mnb_predictions))

# interactive check: fetch a URL, clean it, and classify it with both models
while True:
    inp = input('>> ')
    if inp == "exit":
        break
    test_page = requests.get(inp)
    formatted_page = clean_page(test_page.content)
    feature_tf = vectorizer.transform([formatted_page])
    pred = svm_classifier.predict(feature_tf.toarray())
    mnb_pred = mnb_classifier.predict_proba(feature_tf.toarray())
    print(svm_classifier.predict_proba(feature_tf.toarray()), "|", mnb_pred)
    print("Result: ", ['non-profile', 'profile'][pred[0]])