Example No. 1
def create_vector():
    web_pages_list = load_web_pages()
    stop_words_set = set(stopwords.words('english'))
    features = []
    labels = []
    idx = 0
    for web_page in web_pages_list:
        try:
            idx += 1
            print(idx)
            with open(web_page, 'r', encoding='utf8') as f:
                # clean page with pre-processor
                formatted_page = clean_page(f.read())

            if 'non-profiles' in web_page:
                labels.append(0)  # 0 for non profile pages
            elif 'profiles' in web_page:
                labels.append(1)

            features.append(formatted_page)
        except Exception as e:
            print(e)

    return features, labels
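A typical next step (not shown in this snippet) is to turn the cleaned page texts returned by create_vector() into a feature matrix. The sketch below assumes scikit-learn's TfidfVectorizer; the parameter choices are placeholders, not taken from the original code.

from sklearn.feature_extraction.text import TfidfVectorizer

# Illustrative usage: vectorize the cleaned pages and keep the labels alongside.
features, labels = create_vector()
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(features)  # sparse document-term matrix
print(X.shape, len(labels))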
Example No. 2
def calculate_relevance_score(page_text, module_name):
    keywords = [
        "research interest", "research activity", "publication", "award",
        "academic", "record", "profile"
    ]

    # --Delete <header>, <footer>, <nav>, <script> from the body
    formatted_body = clean_page(page_text)
    bag_of_words = re.split(r" ", formatted_body)
    bag_of_words_count = len(bag_of_words)

    # --to calculate the keywords frequency in a given document
    re_keywords = re.compile('|'.join(keywords), re.I)
    keywords_count = len(re_keywords.findall(formatted_body))

    # --to calculate the frequency of module name and its synonyms
    module_name_formatted = module_name.split(" ")
    # joining with '.' lets any single character (space, hyphen, etc.) sit between the module-name words
    re_module_name = re.compile('.'.join(module_name_formatted), re.I)
    module_name_count = len(re_module_name.findall(formatted_body))

    onto = ITOntologyManager()
    synonyms = onto.get_synonyms(module_name)
    re_synonyms = re.compile('|'.join(synonyms), re.I)
    synonyms_count = len(re_synonyms.findall(formatted_body))

    # --calculate relevance score
    keywords_frequency = keywords_count / float(bag_of_words_count)
    synonyms_frequency = synonyms_count / float(bag_of_words_count)
    module_name_frequency = module_name_count / float(bag_of_words_count)

    relevance_score = keywords_frequency + 10 * synonyms_frequency + 50 * module_name_frequency
    return relevance_score
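As a worked example with made-up counts: on a 500-word page containing 5 keyword hits, 3 synonym hits and 2 module-name hits, the score is 5/500 + 10*(3/500) + 50*(2/500) = 0.01 + 0.06 + 0.20 = 0.27, so module-name matches dominate the ranking while the generic keywords contribute only a small correction.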
Example No. 3
    def parse(self, response):
        # lecturer links
        soup = BeautifulSoup(response.text, 'lxml')

        print(response.url)

        synonyms = self.onto.get_synonyms(self.subject_name)
        self.subject_name = self.subject_name.replace("_", " ")
        synonyms.append(self.subject_name)

        count = 0
        page_text = clean_page(response.text)
        page_lower = page_text.lower()
        re_in = ['research interest', 'research topics']

        if any(x in page_lower for x in re_in):
            for word in synonyms:
                word = word.lower()
                count = count + page_lower.count(word)

        if count >= 1:
            item = LecturerPageItem()
            item['url'] = response.url
            item['subject'] = self.subject_name
            yield item
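The parse() method above follows the Scrapy callback signature (it receives a response and yields items). A minimal spider skeleton that could host it is sketched below; the class name, start_urls and the default subject_name are placeholders, and ITOntologyManager is assumed to be importable as in the other examples.

import scrapy

class LecturerSpider(scrapy.Spider):
    # Hypothetical host class for the parse() callback shown above.
    name = 'lecturer_pages'
    start_urls = ['https://example.edu/staff']  # placeholder URL

    def __init__(self, subject_name='machine_learning', *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.subject_name = subject_name
        self.onto = ITOntologyManager()  # ontology helper used by get_synonyms()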
Example No. 4
def check_page(page, module_name):
    onto = ITOntologyManager()
    synonyms = onto.get_synonyms(module_name)
    synonyms.append(module_name)

    count = 0
    page_text = clean_page(page)
    page_lower = page_text.lower()
    for word in synonyms:
        word = word.lower()
        # count against the lower-cased page text prepared above, not the raw page
        count = count + page_lower.count(word)

    # pattern = re.compile(r'(dr|mr|ms|mrs|prof)\.? ?((ms|mrs)\.?)? ?((\w\.)+)? ?(\w+){1,3}', re.I)

    soup = BeautifulSoup(page.lower(), 'lxml')
    re_interest = soup.find(text='res')

    parent_tag = None

    for i in range(0, 5):
        pass
Example No. 5
    def predict_web_page(self, page):
        formatted_page = clean_page(page)
        feature_tf = self.vectorizer.transform([formatted_page])

        svm_probability = self.svm_classifier.predict_proba(
            feature_tf.toarray())
        mnb_probability = self.mnb_classifier.predict_proba(
            feature_tf.toarray())

        # print(svm_probability, mnb_probability)

        # predicted_result = self.mnb_classifier.predict(feature_tf.toarray())
        # result = np.asscalar(np.int32(predicted_result[0]))

        result = self.weighted_mean_prediction(svm_probability[0, 1],
                                               mnb_probability[0, 1],
                                               svm_acc=0.8,
                                               mnb_acc=0.65)

        return result
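weighted_mean_prediction() is not included in these examples; given the svm_acc/mnb_acc arguments, a plausible reading is an accuracy-weighted average of the two profile-class probabilities, thresholded at 0.5. The sketch below is an assumption, not the original helper.

    def weighted_mean_prediction(self, svm_prob, mnb_prob, svm_acc=0.8, mnb_acc=0.65):
        # Hypothetical implementation: weight each classifier's probability
        # by its measured accuracy and average the result.
        weighted = (svm_prob * svm_acc + mnb_prob * mnb_acc) / (svm_acc + mnb_acc)
        return 1 if weighted >= 0.5 else 0  # 1 = profile page, 0 = non-profile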
Example No. 6
def count_words():
    web_pages_list = load_web_pages()
    stop_words_set = set(stopwords.words('english'))
    words_list = []

    idx = 0

    for web_page in web_pages_list:
        try:
            idx += 1
            print(idx)
            with open(web_page, 'r', encoding='utf8') as f:
                # clean page with pre-processor
                formatted_page = clean_page(f.read())

            # tokenize words with nltk tokenizer
            unfiltered_words = word_tokenize(formatted_page)

            # remove stop words, and non-alphabetic words from the bag of words
            for word in unfiltered_words:
                word = word.lower()
                if word.isalpha() and word not in stop_words_set:
                    words_list.append(word)
        except Exception as e:
            print(e)

    bag_of_words = Counter(words_list)
    print('Words =', len(words_list), '| BoW =', len(bag_of_words))
    print(bag_of_words.most_common())

    # saving BoW
    with open('bag_of_words.mdl', 'wb') as f:
        _pickle.dump(bag_of_words, f)
        print("Saved the BoW")

    return bag_of_words.most_common(5000)
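The pickled Counter written above can be reloaded later and fed to create_dataset() in the next example; a minimal sketch, assuming the file was produced by count_words():

import _pickle

# Reload the saved Counter and keep the 5000 most common words,
# matching what count_words() returns.
with open('bag_of_words.mdl', 'rb') as f:
    bag_of_words = _pickle.load(f)
features_set, labels = create_dataset(bag_of_words.most_common(5000))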
Example No. 7
def create_dataset(bag_of_words):
    web_pages_list = load_web_pages()
    features_set = []
    labels = []

    # iterate the training dataset
    idx = 0
    for web_page in web_pages_list:
        try:
            idx += 1
            print(idx)
            with open(web_page, 'r', encoding='utf8') as f:
                # clean the web page
                formatted_page = clean_page(f.read())

            # tokenize the page text
            formatted_page_lower = formatted_page.lower()
            words_list = word_tokenize(formatted_page_lower)
            word_occurrence = []

            # calculate word occurrence in the page with bag of words
            for word_entry in bag_of_words:
                word_occurrence.append(words_list.count(word_entry[0]))

            # append the calculated word occurrence to features list
            features_set.append(word_occurrence)
            if 'non-profiles' in web_page:
                labels.append(0)  # 0 for non profile pages
            elif 'profiles' in web_page:
                labels.append(1)  # 1 for profile pages

        except Exception as e:
            print(e)

    return features_set, labels
Example No. 8
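This snippet starts mid-function: vectorizer, X_train_tf, X_test, y_train and y_test are created earlier in the original code and are not shown here. A plausible setup, assuming a TF-IDF vectorizer over the cleaned pages from create_vector() and a scikit-learn train/test split, might look like:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Assumed setup for the otherwise-undefined names below (not part of the original snippet).
features, labels = create_vector()
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2)
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tf = vectorizer.fit_transform(X_train)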
    svm_classifier = SVC(kernel='linear', C=2**1, probability=True)
    svm_classifier.fit(X_train_tf.toarray(), y_train)

    X_test_tf = vectorizer.transform(X_test)
    predictions = svm_classifier.predict(X_test_tf.toarray())

    print(accuracy_score(y_test, predictions), "\n")

    mnb_classifier = MultinomialNB()
    mnb_classifier.fit(X_train_tf.toarray(), y_train)

    mnb_predictions = mnb_classifier.predict(X_test_tf.toarray())
    print(accuracy_score(y_test, mnb_predictions))

    while True:
        inp = input('>> ')
        if inp == "exit":
            break

        test_page = requests.get(inp)

        formatted_page = clean_page(test_page.text)  # use the decoded text, not the raw bytes
        feature_tf = vectorizer.transform([formatted_page])

        pred = svm_classifier.predict(feature_tf.toarray())
        mnb_pred = mnb_classifier.predict_proba(feature_tf.toarray())

        print(svm_classifier.predict_proba(feature_tf.toarray()), "|",
              mnb_pred)
        print("Result: ", ['non-profile', 'profile'][pred[0]])