def execute_query(tx):
    # Append a pagination clause only when a page was requested; the
    # offset is computed inside the conditional so a missing page
    # cannot raise.
    page_query = (f" skip {(page - 1) * page_size} limit {page_size}"
                  if page else "")
    # First count all matching articles, then re-match to collect the
    # keywords per article, ordered by keyword count.
    query = (
        "match (a:article)-[r:has_keyword]->(k:keyword)"
        f" where {where_clause}"
        " with count(distinct a) as total"
        " match (a:article)-[r:has_keyword]->(k:keyword)"
        f" where {where_clause}"
        " with a, count(distinct k.name) as num, collect(k) as keywords, total"
        " order by num DESC"
        " return a, keywords, total"
        + page_query)
    articles = []
    for record in tx.run(query):
        article_record = record["a"]
        unique_keywords = {keyword["name"] for keyword in record["keywords"]}
        keywords = [Keyword(name=keyword) for keyword in unique_keywords]
        articles.append(
            Article(
                url=article_record["url"],
                title=article_record["title"],
                abstract=article_record["abstract"],
                keywords=keywords,
            ))
        # "total" is identical on every record.
        articles_count["count"] = record["total"]
    articles_count["articles"] = articles
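# Note: where_clause and the pagination values above are interpolated
# straight into the Cypher string. A minimal sketch, assuming the
# standard neo4j Python driver, of passing the pagination values as
# query parameters instead ($skip and $limit are illustrative names,
# not from the original code); where_clause would still need separate
# handling, since Cypher parameters cannot stand in for whole clause
# fragments.
def execute_query_paginated(tx):
    query = (
        "match (a:article)-[r:has_keyword]->(k:keyword)"
        f" where {where_clause}"
        " return a, k"
        + (" skip $skip limit $limit" if page else ""))
    params = ({"skip": (page - 1) * page_size, "limit": page_size}
              if page else {})
    return list(tx.run(query, **params))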
def _prepare_article_with_all_fields(self):
    article = Article(title=self.initial_article_test_data.title,
                      description=self.initial_article_test_data.description,
                      page=self.initial_article_test_data.page,
                      binder=self.initial_article_test_data.binder,
                      tags=self.initial_article_test_data.tags)
    SAVE_NEEDED.clear()
    return article
def add_new_article(self,
                    title: str,
                    description: str = '',
                    page: str = '',
                    binder: str = '',
                    tags: Optional[List[str]] = None) -> str:
    article = Article(title=title,
                      description=description,
                      page=page,
                      binder=binder,
                      tags=tags)
    try:
        self._article_list[article.uuid] = article
    except DuplicatedArticle:
        # On a uuid collision, regenerate the uuid and retry once.
        article.generate_uuid()
        self._article_list[article.uuid] = article
    return article.uuid
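# ArticleDict is not shown in this section; the retry in
# add_new_article implies a __setitem__ that rejects duplicate keys.
# A minimal sketch of that contract (an assumption, not the original
# implementation), consistent with the SAVE_NEEDED checks in the
# tests below:
class ArticleDict(dict):

    def __setitem__(self, uuid, article):
        if uuid in self:
            # Raised before SAVE_NEEDED is touched, so a failed add
            # leaves the flag cleared.
            raise DuplicatedArticle(f'Article with uuid {uuid} already exists')
        super().__setitem__(uuid, article)
        SAVE_NEEDED.set()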
def test_create_article_with_title_only(self):
    article = Article(title=self.valid_article_test_data.title)
    self.assertIsInstance(
        article, Article,
        'Verify added article is instance of Article class')
    Verifiers().verify_article_with_title_only(
        article=article,
        reference_article=self.valid_article_test_data)
    self.assertTrue(SAVE_NEEDED.is_set(), 'Verify save needed is set')
def test_create_article_no_title(self):
    with self.assertRaises(
            TypeError,
            msg='Verify exception is raised when creating Article '
                'instance without title'):
        Article()
    self.assertFalse(SAVE_NEEDED.is_set())
def _crawl_and_get_article_for_url(link) -> Article:
    full_article_url = Crawler._get_pdf_link_from_href(link)
    pdf_url = f"{full_article_url}.full.pdf"
    content = Crawler._extract_raw_text(pdf_url)
    keywords_string = re.findall("(?<=Keywords:)(.*)(?=\\n)", content)
    keywords = []
    if keywords_string:
        # The keyword line may be delimited by semicolons or commas;
        # split on both and keep whichever yields more entries.
        keywords_semicolon = [
            keyword.strip().lower()
            for keyword in keywords_string[0].split(";")
            if keyword.strip()
        ]
        keywords_comma = [
            keyword.strip().lower()
            for keyword in keywords_string[0].split(",")
            if keyword.strip()
        ]
        keywords = (keywords_semicolon
                    if len(keywords_semicolon) > len(keywords_comma)
                    else keywords_comma)
    title = link.find("span").text
    if not keywords:
        # No explicit keyword line: fall back to extracting noun
        # keywords from the title.
        clean_title = title.lower()
        # Remove tags
        clean_title = re.sub("</?.*?>", " <> ", clean_title)
        # Remove special characters and digits
        clean_title = re.sub("(\\d|\\W)+", " ", clean_title)
        keywords = clean_title.split()
        # Remove the stop words
        keywords = [
            Crawler.lem.lemmatize(word) for word in keywords
            if word not in Crawler.stop_words
        ]
        # Tag the words and only take nouns longer than one character
        tokens = nltk.word_tokenize(" ".join(keywords))
        tagged = nltk.pos_tag(tokens)
        keywords = [
            tag[0] for tag in tagged if tag[1] == "NN" and len(tag[0]) > 1
        ]
        # Ignore keywords in the ignore list and the ones ending with "ing"
        keywords = [
            keyword for keyword in keywords
            if keyword not in Crawler.ignore_keywords
            and len(keyword) > 1
            and not keyword.endswith("ing")
        ]
    article = Article(
        url=full_article_url,
        title=title,
        abstract="",
        keywords=[Keyword(name=keyword) for keyword in keywords],
    )
    return article
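# A standalone illustration of the delimiter heuristic used above:
# split the raw keyword line on both ";" and "," and keep whichever
# split yields more entries. The sample line is invented for
# demonstration.
raw_line = "machine learning; neural networks; web crawling"
by_semicolon = [k.strip().lower() for k in raw_line.split(";") if k.strip()]
by_comma = [k.strip().lower() for k in raw_line.split(",") if k.strip()]
chosen = by_semicolon if len(by_semicolon) > len(by_comma) else by_comma
print(chosen)  # ['machine learning', 'neural networks', 'web crawling']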
def test_add_article_with_the_same_uuid(self):
    article_dict = ArticleDict()
    article1 = Article(title='test1')
    article2 = Article(title='test2')
    article2._uuid = article1.uuid
    article_dict[article1.uuid] = article1
    SAVE_NEEDED.clear()
    with self.assertRaises(
            DuplicatedArticle,
            msg='Verify exception is raised when adding article with '
                'the same uuid as an existing one'):
        article_dict[article2.uuid] = article2
    self.assertFalse(
        SAVE_NEEDED.is_set(),
        'Verify save is not needed after exception of duplicated uuid')
    self.assertEqual(
        len(article_dict), 1,
        'Verify duplicated article was not added to ArticleDict')
def test_sorting_articles(self):
    article_dict = ArticleDict()
    for article_title in self.sorting_articles:
        article = Article(title=article_title)
        article_dict[article.uuid] = article
    article_titles = [
        a.title for a in article_dict.sort_by_title().values()
    ]
    self.sorting_articles.sort()
    self.assertEqual(article_titles, self.sorting_articles,
                     'Verify article dict is sorted')
def test_create_article_with_all_fields(self):
    article = Article(title=self.valid_article_test_data.title,
                      description=self.valid_article_test_data.description,
                      page=self.valid_article_test_data.page,
                      binder=self.valid_article_test_data.binder,
                      tags=self.valid_article_test_data.tags)
    self.assertIsInstance(
        article, Article,
        'Verify added article is instance of Article class')
    Verifiers().verify_article_with_all_fields(
        article=article, reference_article=self.valid_article_test_data)
    self.assertTrue(SAVE_NEEDED.is_set(), 'Verify save needed is set')
def import_data(self):
    data = pandas.read_csv(self.file_path, sep=';', encoding='windows-1250')
    for i in data.index:
        article = Article(title=data['title'][i],
                          description=data['description'][i],
                          page=data['page'][i],
                          # Read binder from its own column (was
                          # mistakenly reading 'page' twice).
                          binder=data['binder'][i],
                          tags=data['tags'][i].split(':'))
        self.articles_list[article.uuid] = article
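# An invented example of the CSV layout import_data expects:
# semicolon-separated columns (matching the column names used above)
# with tags joined by colons. Values are made up for illustration.
#
#   title;description;page;binder;tags
#   Some Article;Short description;12;A;python:testing
#
# data['tags'][i].split(':') then yields ['python', 'testing'].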
def execute_query(tx):
    result = tx.run(
        f"CALL db.index.fulltext.queryNodes('articleTitleAndAbstract', '{search_phrase}')"
        " YIELD node, score"
        " RETURN node, score")
    for record in result:
        node, score = record["node"], record["score"]
        articles.append((
            Article(
                url=node["url"],
                title=node["title"],
                # Take the abstract from its own property (was
                # mistakenly copying the url).
                abstract=node["abstract"],
                keywords=[],
            ),
            score,
        ))
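# Interpolating search_phrase into the CALL string breaks on quotes in
# user input. A minimal sketch, assuming the standard neo4j Python
# driver, of passing the phrase as a Cypher parameter instead ($phrase
# is an illustrative name, not from the original code):
def execute_fulltext_query(tx):
    result = tx.run(
        "CALL db.index.fulltext.queryNodes('articleTitleAndAbstract', $phrase)"
        " YIELD node, score RETURN node, score",
        phrase=search_phrase)
    return [(record["node"], record["score"]) for record in result]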
def test_adding_articles(self):
    article_dict = ArticleDict()
    for article_data in self.articles_to_add.articles:
        article = Article(title=article_data.title,
                          description=article_data.description,
                          page=article_data.page,
                          binder=article_data.binder,
                          tags=article_data.tags)
        article_dict[article.uuid] = article
        self.assertEqual(article_dict[article.uuid], article,
                         'Verify article is added')
        self._save_needed_test('Verify save is needed after adding article')
def _type_assertion_test(self,
                         title,
                         msg,
                         description='',
                         page='',
                         binder='',
                         tags=None):
    if not tags:
        tags = []
    with self.assertRaises(TypeError, msg=msg):
        Article(title=title,
                description=description,
                page=page,
                binder=binder,
                tags=tags)
def test_add_existing_article(self):
    article = Article(title=self.second_article.title,
                      description=self.second_article.description,
                      page=self.second_article.page,
                      binder=self.second_article.binder,
                      tags=self.second_article.tags)
    self.article_collection.add_existing_article(article)
    Verifiers().verify_article_with_all_fields(
        article=self.article_collection.get_article(article.uuid),
        reference_article=self.second_article)
    WaitMethods().wait_for_save_completed()
    new_collection = ArticleCollection()
    self.assertIn(
        article.uuid, new_collection.articles_list,
        'Verify new article uuid is present when creating new article '
        'collection (loaded from saved file)')
    Verifiers().verify_article_with_all_fields(
        article=new_collection.get_article(article.uuid),
        reference_article=self.second_article)
def _prepare_article_with_title_only(self):
    article = Article(title=self.initial_article_test_data.title)
    SAVE_NEEDED.clear()
    return article
from bs4 import BeautifulSoup
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from lib.article import Article

########################################
#
# This is only an example, design before
# implementation, do not worry about
# optimization.
#
########################################

if __name__ == "__main__":
    # As an aside, in the end this specific article should give a very
    # bad rating when parsed.
    PVTL = Article(
        "http://www.baystreet.ca/articles/techinsider.aspx?articleid=48609")
    # Name the parser explicitly to avoid BeautifulSoup's default
    # parser warning.
    soup = BeautifulSoup(PVTL.art.text, "html.parser")
    s = str(soup)
    sid = SentimentIntensityAnalyzer()
    ss = sid.polarity_scores(s)
    for k in sorted(ss):
        print('{0}: {1}, '.format(k, ss[k]), end='')