def download_biographies(id): if id not in get_ids(): raise ValueError("No source with id '%s' was found" % id) for src in _repo.get_sources(): if src.id == id: break ls = biodes.parse_list(src.url) total = len(ls) skipped = 0 for index, biourl in enumerate(ls): if not biourl.startswith("http:"): # we're dealing with a fs path biourl = os.path.normpath(biourl) if not os.path.isabs(biourl): biourl = os.path.join(os.path.dirname(src.url), biourl) bio = Biography(source_id=src.id, repository=src.repository) try: bio.from_url(biourl) print "%s/%s %s" % (index + 1, total, bio.get_names()) except Exception, err: skipped += 1 print err continue try: _repo.add_biography(bio) except: from pdb import set_trace set_trace() ############################## Breakpoint ##############################
def download_biographies(self, source, limit=None): """Download all biographies from source.url and add them to the repository. Mark any biographies that we did not find (anymore), by removing the source_url property. Return the number of total and skipped biographies. arguments: source: a Source instance returns: a list of biography instances """ # at the URL given we find a list of links to biodes files # print 'Opening', source.url assert source.url, 'No URL was defined with the source "%s"' % source.id logging.info('downloading data at %s' % source.url) logging.info('parsing source url') # TODO: perhaps it would be better to check on Source.__init__ if repository argument is given if not source.repository: source.repository = self try: ls = biodes.parse_list(source.url) if limit: ls = ls[:limit] except etree.XMLSyntaxError, error: # @UndefinedVariable raise BioPortException('Error parsing data at %s -- check if this is valid XML\n%s' % (source.url, error))
def test_parse_list(self):
    """The fixture list.xml should yield exactly two parsed entries."""
    fixture_path = os.path.join(this_dir, "list.xml")
    entries = parse_list(fixture_path)
    self.assertEqual(len(entries), 2)