def handle(self, *args, **options):
    """Scrape published papers for each year since 2007, with resumable progress.

    Loads previously saved progress from PROGRESS_FILE (a pickled
    ``{year: [publication, ...]}`` dict) if present, then removes the
    file; on any failure, ``self.abort(publications)`` is expected to
    re-save it so a later run can resume where this one stopped.
    Finally re-scrapes NIH data for every year.
    """
    # {year : publications}
    publications = {}
    if os.path.isfile(PROGRESS_FILE):
        # Load our previous progress, if any.
        with open(PROGRESS_FILE, 'rb') as progress_file:
            publications = pickle.load(progress_file)
        # Remove it for now --
        # if anything goes wrong, we'll save it again!
        os.remove(PROGRESS_FILE)

    # NOTE(review): range() excludes the current year -- confirm intended.
    for year in range(2007, date.today().year):
        if year not in publications:  # PEP 8: 'x not in y', not 'not x in y'
            self.stdout.write("GETTING PAPERS FOR YEAR " + str(year))
            try:
                publications[year] = \
                    list(scholar_data.get_published_papers(year, year))
            # Bare except is deliberate: save progress even on
            # KeyboardInterrupt/SystemExit, then propagate.
            except:
                self.abort(publications)
                raise

    self.stdout.write("ALL PAPERS GOTTEN, FILLING INFO FOR EACH")
    for year in publications:
        # Iterate over a copy: handled publications are removed from the
        # live list so they are not re-processed after a resume.
        for publication in publications[year][:]:
            try:
                self.handle_publication(publication, year)
                publications[year].remove(publication)
            # Save the remaining (unhandled) work before propagating.
            except:
                self.abort(publications)
                raise

    # Visual separator between the scraping and NIH phases.
    self.stdout.write('\n' * 4)
    self.stdout.write('#' * 10)
    self.stdout.write('\n' * 4)

    for year in range(2007, date.today().year):
        nih_data.scrape(str(year))
def test_scraping(self):
    """End-to-end check: handle_publication + nih_data.scrape persist papers.

    Feeds four dummy publications through the scrape command, then
    verifies the resulting Paper/Author rows.
    """
    p1 = DummyPublication(
        'Hats and Stuff', 63, 'Hats, Bob and Hats, Billy',
        url='example.com', abstract='About hats and stuff',
        year=1992, volume=1)
    p2 = DummyPublication(
        'Hats and Other Stuff', 36, 'Tanzi, Rudolph',
        url='example.com', abstract='More about hats and stuff',
        year=2015, volume=1)
    p3 = DummyPublication(
        'No Abstract or website: Reviewed', 0, 'Bob, Billy', year=2015)
    p4 = DummyPublication(
        'No Year: Revisited', 12, 'Bob, Billy and others',
        url='example.com', journal='Journal of Science')

    command = scrape.Command()
    command.handle_publication(p1, 1992)
    command.handle_publication(p2, 2015)
    command.handle_publication(p3, 2015)
    command.handle_publication(p4, 2014)
    nih_data.scrape('1992')
    nih_data.scrape('2015')

    # Guard against a silently-passing empty queryset: the original
    # looped over filter() results, so an empty result ran no assertions.
    query = Paper.objects.filter(citations=36)
    self.assertEqual(len(query), 1)
    for paper in query:
        self.assertEqual(paper.title, 'Hats and Other Stuff')
        self.assertEqual(paper.year, 2015)

    # p1, p2 and p4 were created with url='example.com'.
    query = Paper.objects.filter(url='example.com')
    self.assertEqual(len(query), 3)

    # 'and others' in an author string must not create an 'others' author.
    query = Author.objects.filter(name='others')
    self.assertEqual(len(query), 0)

    query = Paper.objects.filter(journal='Journal of Science')
    self.assertEqual(len(query), 1)
def handle(self, *args, **options):
    """Re-scrape NIH data for every year from 2007 up to, but not
    including, the current year."""
    first_year = 2007
    this_year = date.today().year
    for year_string in map(str, range(first_year, this_year)):
        nih_data.scrape(year_string)