def test_id_parser(self):
    # An arXiv-produced PDF should yield its identifier.
    self.identifier = arxiv_id_parser.arxiv_id_parser(FILE_SOURCE_PDF, FILE_NAME_PDF)
    self.assertEqual(self.identifier, u'1305.5767v1')
    # A non-arXiv file should raise instead of returning an identifier, so the
    # call is handed to assertRaises rather than executed directly.
    self.assertRaises(Exception, arxiv_id_parser.arxiv_id_parser,
                      FILE_SOURCE_TXT, FILE_NAME_TXT)
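# The test above expects arxiv_id_parser to return the bare identifier
# (e.g. u'1305.5767v1') for an arXiv PDF and to raise a NonArxivPdf error
# otherwise. Below is a minimal sketch of that behaviour, assuming the
# first-page text has already been extracted from the PDF; the helper name
# parse_identifier_from_text and the regex are illustrative only, not the
# project's actual implementation.
import re

ARXIV_STAMP = re.compile(r'arXiv:(\d{4}\.\d{4,5}v\d+)')

def parse_identifier_from_text(text):
    """Return the first arXiv identifier found in text, or raise."""
    match = ARXIV_STAMP.search(text)
    if match is None:
        raise Exception('NonArxivPdf')
    return match.group(1)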
import os
import re

def arxiv_pyler(path):
    """Walk path, look up metadata for every arXiv PDF found, and build an HTML index."""
    if path is None:
        return
    if not os.path.exists(path):
        print 'path does not exist.'
        return
    entry_list = []
    counter = 0
    print path
    for root, dirs, files in os.walk(path):
        for pdf in files:
            # Skip anything that is not a PDF file.
            if not re.search(r'\.pdf$', pdf):
                continue
            try:
                source = os.path.join(root, pdf)
                identifier = arxiv_id_parser(source, pdf)
                xml_content = arxiv_query(identifier)
                entry_content = xml_parser(xml_content, identifier)
                entry_list.append(entry_content)
                entry_list[counter].update({'file_source': source})
                #shutil.move(source, pdf_dir)
                counter += 1
            except Exception as inst:
                for error in inst.args:
                    print error
    html_generator(entry_list)
    print 'complete'
    return entry_list
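# A hedged usage sketch: point arxiv_pyler at a directory of downloaded PDFs
# and it returns the list of metadata entries it indexed. The directory path
# below is illustrative only, not part of the original code.
if __name__ == '__main__':
    entries = arxiv_pyler('/home/user/papers')
    if entries:
        print '%d entries indexed' % len(entries)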