def test_deplumpen_failures(self):
    """Check how `utils.deplumpen` treats input it cannot convert.

    Strings in the first group are expected to come back unchanged;
    the non-string values in the second group are expected to raise
    `ValueError`.
    """
    # strings that are expected to be handed back exactly as given
    for untouched in ('asdf', '012345'):
        self.assertEqual(utils.deplumpen(untouched), untouched)

    # non-string input is expected to be rejected outright
    for rejected in (None, [], {}, (), BaseCase):
        self.assertRaises(ValueError, utils.deplumpen, rejected)
def path_count(pair):
    """Figure out the type of the given path using the suffix (if one available).

    `pair` is indexed as `pair[0]` (the request path, a string) and
    `pair[1]` (a count, passed through `int`).

    Returns a triple `(article, TYPE_MAP[suffix], count)`, where `suffix` is
    whatever follows the first `SPLITTER` match in the lowercased final path
    segment, or `None` when there is no match.

    Returns `None`, after logging a warning, when the suffix is not a key of
    `TYPE_MAP` or when an `AssertionError` escapes one of the helpers called
    here. Any other exception (for example from `int(pair[1])`) propagates
    to the caller.
    """
    try:
        path = pair[0]
        lowered = path.lower()
        if lowered.startswith('/content/early/'):
            # handles POA article variation 1 "/content/early/yyyy/mm/dd/doi/" type urls
            bits = path.split('/', 6)
            bits[-1] = utils.deplumpen(bits[-1])
        elif lowered.startswith('/content/elife/early/'):
            # handles POA article variation 2 "/content/elife/early/yyyy/mm/dd/doi/" type urls
            bits = path.split('/', 7)
            bits[-1] = utils.deplumpen(bits[-1])
        elif lowered.startswith('/content/elife/'):
            # handles valid but unsupported /content/elife/volume/id paths
            # these paths appear in PDF files I've been told
            bits = path.split('/', 4)
        else:
            # handles standard /content/volume/id/ paths
            bits = path.split('/', 3)

        art = bits[-1].lower()  # website isn't case sensitive, we are
        more_bits = re.split(SPLITTER, art, maxsplit=1)
        suffix = None
        if len(more_bits) > 1:
            art, suffix = more_bits

        # Raised explicitly rather than via `assert`: assert statements are
        # removed under `python -O`, which would turn an unknown suffix into
        # a KeyError on the lookup below instead of a logged skip.
        if suffix not in TYPE_MAP:
            raise AssertionError(
                "unknown suffix %r! received: %r split to %r" % (
                    suffix, pair, more_bits))

        return art, TYPE_MAP[suffix], int(pair[1])

    except AssertionError:
        # we have an unhandled path
        LOG.warn("skpping unhandled path %s", pair)
        return None
def path_count(pair):
    """Figure out the type of the given path using the suffix (if one available).

    `pair` is indexed as `pair[0]` (the request path, a string) and
    `pair[1]` (a count, passed through `int`).

    Returns a triple `(article, TYPE_MAP[suffix], count)`, where `suffix` is
    whatever follows the first `SPLITTER` match in the lowercased final path
    segment, or `None` when there is no match.

    Returns `None`, after logging a warning, when the suffix is not a key of
    `TYPE_MAP` or when an `AssertionError` escapes one of the helpers called
    here. Any other exception (for example from `int(pair[1])`) propagates
    to the caller.
    """
    try:
        path = pair[0]
        lowered = path.lower()
        if lowered.startswith('/content/early/'):
            # handles POA article variation 1 "/content/early/yyyy/mm/dd/doi/" type urls
            bits = path.split('/', 6)
            bits[-1] = utils.deplumpen(bits[-1])
        elif lowered.startswith('/content/elife/early/'):
            # handles POA article variation 2 "/content/elife/early/yyyy/mm/dd/doi/" type urls
            bits = path.split('/', 7)
            bits[-1] = utils.deplumpen(bits[-1])
        elif lowered.startswith('/content/elife/'):
            # handles valid but unsupported /content/elife/volume/id paths
            # these paths appear in PDF files I've been told
            bits = path.split('/', 4)
        else:
            # handles standard /content/volume/id/ paths
            bits = path.split('/', 3)

        art = bits[-1].lower()  # website isn't case sensitive, we are
        more_bits = re.split(SPLITTER, art, maxsplit=1)
        suffix = None
        if len(more_bits) > 1:
            art, suffix = more_bits

        # Raised explicitly rather than via `assert`: assert statements are
        # removed under `python -O`, which would turn an unknown suffix into
        # a KeyError on the lookup below instead of a logged skip.
        if suffix not in TYPE_MAP:
            raise AssertionError(
                "unknown suffix %r! received: %r split to %r" % (
                    suffix, pair, more_bits))

        return art, TYPE_MAP[suffix], int(pair[1])

    except AssertionError:
        # we have an unhandled path
        LOG.warn("skpping unhandled path %s", pair)
        return None
def test_deplumpen(self):
    """`utils.deplumpen` is expected to turn "eLife.01234" into "e01234"."""
    self.assertEqual("e01234", utils.deplumpen("eLife.01234"))