Пример #1
0
 def execute_query(tx):
     """Run a paginated keyword-count query and publish the results.

     Closure over `page`, `page_size`, `where_clause` and the shared
     `articles_count` dict: stores the matched Article objects under
     "articles" and the total match count under "count".
     """
     # NOTE(review): where_clause is interpolated straight into the Cypher
     # text — injection risk if it ever carries user input; prefer query
     # parameters (tx.run(query, **params)).
     skip_num = (page - 1) * page_size
     # Pagination clause only when a page is requested; page numbers are 1-based.
     page_query = f" skip {skip_num} limit {page_size}" if page else ""
     # Adjacent string literals concatenate implicitly — same query text as
     # the original '+'-chained version, minus the no-placeholder f-string.
     query = (
         "match (a:article)-[r:has_keyword]->(k:keyword)"
         f" where {where_clause}"
         " with count(distinct a) as total"
         " match (a:article)-[r:has_keyword]->(k:keyword)"
         f" where {where_clause}"
         " with a, count(distinct k.name) as num, collect(k) as keywords, total order by num DESC"
         " return a, keywords, total" + page_query)
     articles = []
     for record in tx.run(query):
         article_record = record["a"]
         # Set comprehension deduplicates names without building a throwaway list.
         unique_keywords = {keyword["name"] for keyword in record["keywords"]}
         keywords = [Keyword(name=name) for name in unique_keywords]
         articles.append(
             Article(
                 url=article_record["url"],
                 title=article_record["title"],
                 abstract=article_record["abstract"],
                 keywords=keywords,
             ))
         # Every row carries the same total; last assignment wins, which is fine.
         articles_count["count"] = record["total"]
     articles_count["articles"] = articles
Пример #2
0
 def _prepare_article_with_all_fields(self):
     """Build an Article from the initial fixture data and reset the save flag."""
     data = self.initial_article_test_data
     prepared = Article(title=data.title,
                        description=data.description,
                        page=data.page,
                        binder=data.binder,
                        tags=data.tags)
     # Creating an Article raises SAVE_NEEDED (see the creation tests);
     # clear it so callers observe only their own changes.
     SAVE_NEEDED.clear()
     return prepared
Пример #3
0
 def add_new_article(self,
                     title: str,
                     description: str = '',
                     page: str = '',
                     binder: str = '',
                     tags: List[str] = None) -> str:
     """Create an Article from the given fields and store it in the list.

     On a uuid collision (DuplicatedArticle) a fresh uuid is drawn and the
     insert retried once. Returns the uuid the article was stored under.
     """
     new_article = Article(title=title,
                           description=description,
                           page=page,
                           binder=binder,
                           tags=tags)
     try:
         self._article_list[new_article.uuid] = new_article
     except DuplicatedArticle:
         # Extremely unlikely uuid clash: regenerate and insert again.
         new_article.generate_uuid()
         self._article_list[new_article.uuid] = new_article
     return new_article.uuid
Пример #4
0
 def test_create_article_with_title_only(self):
     """A title-only Article is a valid instance and flags a pending save."""
     reference = self.valid_article_test_data
     created = Article(title=reference.title)
     self.assertIsInstance(
         created, Article,
         'Verify added article is instance of Article class')
     Verifiers().verify_article_with_title_only(
         article=created, reference_article=reference)
     self.assertTrue(SAVE_NEEDED.is_set(), 'Verify save needed is set')
Пример #5
0
 def test_create_article_no_title(self):
     """Article() with no title must raise TypeError and must not flag a save."""
     failure_msg = ('Verify exception is raised when creating Article '
                    'instance without title')
     with self.assertRaises(TypeError, msg=failure_msg):
         Article()
     self.assertFalse(SAVE_NEEDED.is_set())
Пример #6
0
    def _crawl_and_get_article_for_url(link) -> Article:
        """Download the article PDF behind *link* and build an Article.

        Keywords come from an explicit "Keywords:" line in the PDF text when
        one exists; otherwise they are derived from the title by keeping
        lemmatized nouns that survive the stop-word / ignore-list filters.
        The abstract is intentionally left empty.
        """
        full_article_url = Crawler._get_pdf_link_from_href(link)
        pdf_url = f"{full_article_url}.full.pdf"
        content = Crawler._extract_raw_text(pdf_url)
        keywords_string = re.findall("(?<=Keywords:)(.*)(?=\\n)", content)
        keywords = []
        if keywords_string:
            # The keywords line may be ';'- or ','-separated; split both ways
            # and keep whichever delimiter produced more entries.
            keywords_semicolon = [
                keyword.strip().lower()
                for keyword in keywords_string[0].split(";")
                if keyword.strip()
            ]
            keywords_colon = [
                keyword.strip().lower()
                for keyword in keywords_string[0].split(",")
                if keyword.strip()
            ]
            keywords = (
                keywords_semicolon
                if len(keywords_semicolon) > len(keywords_colon)
                else keywords_colon
            )
        title = link.find("span").text
        if not keywords:
            clean_title = title.lower()
            # Remove (HTML-escaped) tags
            clean_title = re.sub("&lt;/?.*?&gt;", " &lt;&gt; ", clean_title)
            # Remove special characters and digits
            clean_title = re.sub("(\\d|\\W)+", " ", clean_title)
            keywords = clean_title.split()
            # Remove the stop words
            keywords = [
                Crawler.lem.lemmatize(word)
                for word in keywords
                if word not in Crawler.stop_words
            ]
            # Tag the words and only take the nouns
            tokens = nltk.word_tokenize(" ".join(keywords))
            tagged = nltk.pos_tag(tokens)
            # BUG FIX: the original tested len(tag[1]) > 1 — the length of the
            # POS tag itself ("NN", always 2, so always true) — instead of the
            # word. Filter out one-character words as clearly intended.
            keywords = [tag[0] for tag in tagged if tag[1] == "NN" and len(tag[0]) > 1]
            # Ignore keywords in ignore list and the ones ending with ing
            keywords = [
                keyword
                for keyword in keywords
                if keyword not in Crawler.ignore_keywords
                and len(keyword) > 1
                and not keyword.endswith("ing")
            ]

        article = Article(
            url=full_article_url,
            title=title,
            abstract="",
            keywords=[Keyword(name=keyword) for keyword in keywords],
        )
        return article
Пример #7
0
 def test_add_article_with_the_same_uuid(self):
     """Inserting a second article under an existing uuid must fail cleanly."""
     articles = ArticleDict()
     first = Article(title='test1')
     second = Article(title='test2')
     # Force a collision by copying the first article's uuid.
     second._uuid = first.uuid
     articles[first.uuid] = first
     SAVE_NEEDED.clear()
     with self.assertRaises(
             DuplicatedArticle,
             msg='Verify exception is raised when adding article with the same uuid as existing'
     ):
         articles[second.uuid] = second
     self.assertFalse(
         SAVE_NEEDED.is_set(),
         'Verify save is not needed after exception of duplicated uuid')
     self.assertEqual(
         len(articles), 1,
         'Verify duplicated article was not added to ArticleDict')
Пример #8
0
 def test_sorting_articles(self):
     """sort_by_title() must yield articles in alphabetical title order."""
     article_dict = ArticleDict()
     for article_title in self.sorting_articles:
         article = Article(title=article_title)
         article_dict[article.uuid] = article
     article_titles = [
         a.title for a in article_dict.sort_by_title().values()
     ]
     # BUG FIX: the original called self.sorting_articles.sort(), mutating
     # the shared fixture in place and leaking sorted state into any test
     # that runs afterwards; sorted() leaves the fixture untouched.
     self.assertEqual(article_titles, sorted(self.sorting_articles),
                      'Verify article dict is sorted')
Пример #9
0
 def test_create_article_with_all_fields(self):
     """An Article built with every field populated is a valid instance."""
     reference = self.valid_article_test_data
     created = Article(title=reference.title,
                       description=reference.description,
                       page=reference.page,
                       binder=reference.binder,
                       tags=reference.tags)
     self.assertIsInstance(
         created, Article,
         'Verify added article is instance of Article class')
     Verifiers().verify_article_with_all_fields(
         article=created, reference_article=reference)
     self.assertTrue(SAVE_NEEDED.is_set(), 'Verify save needed is set')
Пример #10
0
    def import_data(self):
        """Load articles from the ';'-separated CSV at self.file_path.

        Each row becomes an Article keyed by its uuid in self.articles_list.
        Tags are stored ':'-separated in a single CSV column.
        """
        data = pandas.read_csv(self.file_path,
                               sep=';',
                               encoding='windows-1250')
        for i in data.index:
            # BUG FIX: binder was read from the 'page' column (copy-paste);
            # read it from 'binder'. NOTE(review): confirm the CSV actually
            # has a 'binder' column — every other Article call site passes a
            # distinct binder value.
            article = Article(title=data['title'][i],
                              description=data['description'][i],
                              page=data['page'][i],
                              binder=data['binder'][i],
                              tags=data['tags'][i].split(':'))

            self.articles_list[article.uuid] = article
Пример #11
0
 def execute_query(tx):
     """Full-text search over article titles/abstracts.

     Closure over `search_phrase` and the shared `articles` list; appends
     (Article, score) tuples in relevance order.
     """
     # NOTE(review): search_phrase is interpolated into the query string —
     # injection risk for user-supplied input; prefer query parameters.
     query = (
         f"CALL db.index.fulltext.queryNodes('articleTitleAndAbstract', '{search_phrase}')"
         " YIELD node, score RETURN node, score")
     for record in tx.run(query):
         node, score = record[0], record[1]
         articles.append((
             Article(
                 url=node["url"],
                 title=node["title"],
                 # BUG FIX: abstract was copied from the "url" field
                 # (copy-paste); read the node's "abstract" instead.
                 abstract=node["abstract"],
                 keywords=[],
             ),
             score,
         ))
Пример #12
0
 def test_adding_articles(self):
     """Each fixture article must land in the dict and flag a pending save."""
     article_dict = ArticleDict()
     for data in self.articles_to_add.articles:
         new_article = Article(title=data.title,
                               description=data.description,
                               page=data.page,
                               binder=data.binder,
                               tags=data.tags)
         article_dict[new_article.uuid] = new_article
         self.assertEqual(article_dict[new_article.uuid], new_article,
                          'Verify article is added')
         self._save_needed_test(
             'Verify save is needed after adding article')
Пример #13
0
 def _type_assertion_test(self,
                          title,
                          msg,
                          description='',
                          page='',
                          binder='',
                          tags=None):
     """Assert that Article(...) raises TypeError for the given field values.

     `msg` is the failure message shown when no exception is raised;
     a falsy `tags` is normalized to an empty list before the call.
     """
     effective_tags = tags if tags else []
     with self.assertRaises(TypeError, msg=msg):
         Article(title=title,
                 description=description,
                 page=page,
                 binder=binder,
                 tags=effective_tags)
Пример #14
0
 def test_add_existing_article(self):
     """An article added via add_existing_article survives a save/reload cycle."""
     reference = self.second_article
     new_article = Article(title=reference.title,
                           description=reference.description,
                           page=reference.page,
                           binder=reference.binder,
                           tags=reference.tags)
     self.article_collection.add_existing_article(new_article)
     Verifiers().verify_article_with_all_fields(
         article=self.article_collection.get_article(new_article.uuid),
         reference_article=reference)
     # Wait for the background save, then reload from disk into a fresh
     # collection and verify the article round-tripped intact.
     WaitMethods().wait_for_save_completed()
     reloaded = ArticleCollection()
     self.assertTrue(
         new_article.uuid in reloaded.articles_list.keys(),
         'Verify new article uuid is present when creating new article collection (loaded from saved file)'
     )
     Verifiers().verify_article_with_all_fields(
         article=reloaded.get_article(new_article.uuid),
         reference_article=reference)
Пример #15
0
 def _prepare_article_with_title_only(self):
     """Build a title-only Article from the fixture and reset the save flag."""
     prepared = Article(title=self.initial_article_test_data.title)
     # Article creation raises SAVE_NEEDED; clear it so the caller starts clean.
     SAVE_NEEDED.clear()
     return prepared
Пример #16
0
from lib.article import Article

########################################
#
# This is only an example, design before
# implementation, do not worry about
# optimization.
#
########################################

if __name__ == "__main__":
    # as an aside, in the end this specific article should give a very bad rating when parsed
    PVTL = Article(
        "http://www.baystreet.ca/articles/techinsider.aspx?articleid=48609")
    from bs4 import BeautifulSoup

    # FIX: name the parser explicitly. Without it, bs4 picks whichever
    # parser is installed (lxml vs html.parser), emits a warning, and can
    # produce different trees on different machines.
    soup = BeautifulSoup(PVTL.art.text, "html.parser")
    # NOTE(review): str(soup) feeds raw markup (tags and all) to the
    # sentiment analyzer; soup.get_text() would score only visible text —
    # confirm intent before changing.
    s = str(soup)
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    sid = SentimentIntensityAnalyzer()
    ss = sid.polarity_scores(s)

    for k in sorted(ss):
        print('{0}: {1}, '.format(k, ss[k]), end='')