def fetch_and_save(self):
    """Fetch the subject's arXiv RSS feed, parse it, and persist the papers.

    Returns:
        list: the newly created ``Paper`` instances, or an empty list when
        the feed's ``dc:date`` was already fetched (a duplicate run is still
        recorded in the fetch history with ``is_duplicated=True``).

    Raises:
        RssParseError: if the feed's ``dc:date`` element cannot be parsed.
    """
    rss_text = self.fetch()
    arxiv_xml = ArxivXml(rss_text)

    date = arxiv_xml.get_date()
    if not date:
        raise RssParseError('Could not parse dc:date.')

    if RssFetchHistory.exists(self._subject.id, date):
        # Record the duplicate fetch for auditing only; no papers are
        # created, so the returned history object is not needed.
        self._subject.rss_fetch_histories.create(
            date=date,
            is_duplicated=True,
        )
        return []

    rss_fetch_history = self._subject.rss_fetch_histories.create(date=date)

    papers = []
    for paper_item in arxiv_xml.get_paper_items():
        # Avoid the Google Translate API limit (100,000 chars / 100 sec)
        # by pausing after every 50 processed papers.
        if papers and len(papers) % 50 == 0:
            time.sleep(100)
        paper = Paper.from_xml(paper_item)
        # bulk=False makes the related manager call paper.save(), which is
        # required because the Paper instance is still unsaved here.
        rss_fetch_history.papers.add(paper, bulk=False)
        paper.add_authors(paper_item['authors'])
        papers.append(paper)
    return papers
def test_from_xml(self, _):
    """Paper.from_xml builds a Paper whose fields mirror the arXiv item."""
    item = {
        'title': 'Some title.(arXiv:1611.07078v2 [cs.AI] UPDATED)',
        'abstract': 'ABSTRACT',
        'link': 'LINK',
    }

    paper = Paper.from_xml(item)

    self.assertIsInstance(paper, Paper)
    expected = {
        'title': 'Some title.(arXiv:1611.07078v2 [cs.AI] UPDATED)',
        'abstract': 'ABSTRACT',
        'link': 'LINK',
        'subject': 'cs.AI',
        'submit_type': Paper.SUBMIT_TYPE_UPDATED,
    }
    for attr, value in expected.items():
        self.assertEqual(getattr(paper, attr), value)