def test_id2ptree(self):
    for case in self.i2ptree_tests:
        if len(case) == 3:
            result = id2ptree(case[0])
        elif len(case) == 4:
            # uses custom separator
            result = id2ptree(case[0], sep=case[3])
        msg = "%s: id2ptree(%s) = %s but got %s" % \
            (case[2], case[0], case[1], result)
        self.assertEqual(result, case[1], msg=msg)
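# A minimal, hedged sketch of what id2ptree() returns -- the
# '/20/15/TE/ST/' shape comes from the create_meta_path docstring further
# down; exact handling of odd-length ids is left to the ptree package:
import ptree

print(ptree.id2ptree("2015TEST"))           # expected: '/20/15/TE/ST/'
print(ptree.id2ptree("2015TEST", sep="!"))  # same shards, '!' as separator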
def download():
    count = 0
    for item in get_items():
        count += 1
        id = item.find('./{http://tempuri.org/}MapaDigital').text
        if not id or id == "0":
            continue

        # write out metadata
        data_dir = 'data' + ptree.id2ptree(id)
        if not os.path.isdir(data_dir):
            os.makedirs(data_dir)
        et.cleanup_namespaces(item)
        xml = et.tostring(item, pretty_print=True)

        # no need to refetch
        metadata_file = os.path.join(data_dir, "%s.xml" % id)
        if os.path.isfile(metadata_file):
            continue
        open(metadata_file, "w").write(xml)

        # be nice :)
        time.sleep(1)

        # try to download shapefile
        zip_url = item.find('./{http://tempuri.org/}UrlZip').text
        if zip_url:
            r = requests.get(zip_url, headers={"User-Agent": UA})
            if r.headers['Content-Type'] == 'application/x-zip-compressed':
                zip_file = metadata_file.replace(".xml", ".zip")
                open(zip_file, "wb").write(r.content)
                print "%s %s %s %s" % (datetime.datetime.now(), count, id, zip_file)
def fetch(url):
    """
    GETs a url, extracts RDFa from it, and persists it to disk. fetch()
    will return the name of the file where the metadata was stored, or
    None if the RDF was already fetched.
    """
    dirname = "store" + ptree.id2ptree(url)
    path = os.path.join(dirname, "metadata.nt")

    # if it's there already don't bother getting it again
    if os.path.isfile(path):
        logging.info("already harvested %s as %s" % (url, path))
        return None

    # create the directory if necessary
    if not os.path.isdir(dirname):
        os.makedirs(dirname)

    # extract rdfa and save it
    try:
        graph = rdflib.Graph()
        html = urllib.urlopen(url).read()
        graph.parse(data=html, format="rdfa")
        triples = len(graph)
        graph.serialize(open(path, "w"), format="nt")
        logging.info("saved %s as %i triples in %s" % (url, triples, path))
    except:
        logging.exception("unable to extract rdfa from %s" % url)

    # be nice
    time.sleep(2)
    return path
def harvest(base_url, metadata_prefix='oai_dc', set_name=None):
    url = "%s?verb=ListRecords&metadataPrefix=%s" % (base_url, metadata_prefix)
    if set_name:
        url += "&set=%s" % set_name
    while True:
        doc = etree.parse(url)
        for record in doc.xpath('oai:ListRecords/oai:record', namespaces=ns):
            # determine the record identifier
            id = record.xpath('string(oai:header/oai:identifier)', namespaces=ns)

            # write out the record to a pair tree
            d = "data" + id2ptree(id)
            if not os.path.isdir(d):
                os.makedirs(d)
            p = os.path.join(d, "%s-%s.xml" % (id, metadata_prefix))
            open(p, "w").write(etree.tostring(record))
            print "saved %s as %s" % (id, p)

        # handle resumption token
        t = doc.xpath('string(oai:ListRecords/oai:resumptionToken)', namespaces=ns)
        if not t:
            break
        url = "%s?verb=ListRecords&resumptionToken=%s" % (base_url, t)
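# harvest() above assumes an XPath namespace mapping named `ns`; a minimal
# definition ('oai' bound to the standard OAI-PMH 2.0 namespace):
ns = {'oai': 'http://www.openarchives.org/OAI/2.0/'}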
def load_fulltext(bibcode, field_name):
    ptree_path = ptree.id2ptree(bibcode)
    # TODO: make this path a config setting again
    full_path = '/proj/ads/fulltext/extracted%s%s.txt' % (ptree_path, field_name)
    if os.path.exists(full_path):
        fo = open(full_path, 'r')
        text = fo.read()
        fo.close()
        return text.decode('utf-8')
    else:
        return u""
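# Note on the string concatenation above: id2ptree() returns its path with
# both a leading and a trailing separator, so '/proj/ads/fulltext/extracted'
# + '/20/15/TE/ST/' + field_name lines up without any explicit os.path.join.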
def display_image(bibcode, figure_id, image_format):
    """
    For a given article, figure ID and format, fetch and display the image
    """
    format2ext = {'tb': 'gif', 'lr': 'jpg', 'hr': 'png'}
    image_ext = format2ext.get(image_format, 'png')
    image_dir = config.IMAGE_PATH + ptree.id2ptree(bibcode)
    image = "%s%s_%s_%s.%s" % (image_dir, bibcode, figure_id, image_format, image_ext)
    try:
        image_data = open(image, "rb").read()
    except Exception, e:
        app.logger.error('ID %s. Unable to get image %s (format: %s) for bibcode: %s! (%s)' %
                         (g.user_cookie_id, figure_id, image_format, bibcode, e))
        return ('', 204)
def __init__(self, bibcode, ft_source, provider):
    self.bibcode = bibcode
    self.ft_source = ft_source
    self.provider = provider
    self.extract_dir = config['FULLTEXT_EXTRACT_PATH'] + ptree.id2ptree(bibcode)
    self.meta_path = os.path.join(self.extract_dir, 'meta.json')
    self.source_loaded = False
    self.source_content = None
    self.dry_run = False
    self.last_extracted = self.get_last_extracted()
    log.debug("%s last extracted: %s", self.bibcode, self.last_extracted)
def create_meta_path(dict_input, extract_path):
    """
    Converts the BibCode of the file into a pair tree path name. For
    example, 2015TEST would be converted into '20/15/TE/ST/'.

    :param dict_input: meta-data content of the article given
    :param extract_path: path to extract the full text content to
    :return: BibCodes pair tree path
    """
    ptr = ptree.id2ptree(dict_input['bibcode'])
    extract_path = extract_path + ptr + 'meta.json'
    logger.debug('extract_path: {0}'.format(extract_path))
    return extract_path
def create_meta_path(dict_input, extract_key='FULLTEXT_EXTRACT_PATH'):
    """
    Converts the BibCode of the file into a pair tree path name. For
    example, 2015TEST would be converted into '20/15/TE/ST/'.

    :param dict_input: meta-data content of the article given
    :param extract_key: config key naming the path to extract the full text content to
    :return: BibCodes pair tree path
    """
    ptr = ptree.id2ptree(dict_input[CONSTANTS['BIBCODE']])
    extract_path = config[extract_key] + ptr + 'meta.json'
    logger.debug('extract_path: {0}'.format(extract_path))
    return extract_path
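# An illustration of what the two create_meta_path variants produce,
# assuming (hypothetically) that CONSTANTS['BIBCODE'] == 'bibcode' and the
# configured extract path is '/extracted':
#
#   create_meta_path({'bibcode': '2015TEST'})
#   # -> '/extracted/20/15/TE/ST/meta.json'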
def run(self):
    while True:
        doc = self.queue.get()
        if doc is None:
            log.debug("Nothing left to do for worker %s", self.name)
            self.queue.task_done()
            break
        self.stats['processed'] += 1
        log.info("Worker %s is working on %s", self.name, doc['bibcode'])
        extract_dir = config['FULLTEXT_EXTRACT_PATH'] + ptree.id2ptree(doc['bibcode'])
        meta_path = os.path.join(extract_dir, 'meta.json')
        log.debug("meta path: %s", meta_path)

        # dry-run testing
        # self.queue.task_done()
        # continue

        if not os.path.exists(extract_dir):
            log.debug("no existing extract dir for %s", doc['bibcode'])
            self.stats['missing'] += 1
            self.queue.task_done()
            continue
        if os.path.exists(meta_path) and not self.opts.force:
            log.debug("found existing meta file for %s", doc['bibcode'])
            self.queue.task_done()
            continue
        meta = {
            'ft_source': doc['ft_source'],
            'provider': doc['ft_provider'],
            'index_date': doc['index_date']
        }
        log.debug("writing meta file for %s", doc['bibcode'])
        with open(meta_path, 'w') as f:
            json.dump(meta, f)
        mtime = time.mktime(doc['_generated'].timetuple())
        log.debug("setting mtime for %s to %s, %s", meta_path, doc['_generated'], mtime)
        os.utime(meta_path, (mtime, mtime))
        self.queue.task_done()
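# The worker above follows the usual Queue sentinel pattern: a producer
# enqueues one None per worker, and every get() -- the sentinel included --
# is matched by a task_done() call so that queue.join() can eventually
# return.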
def __init__(self, bibcode, ft_source, provider, config=False):
    if not config:
        self.config = utils.load_config()
    else:
        self.config = config
    self.bibcode = bibcode
    self.ft_source = ft_source
    self.provider = provider
    self.extract_dir = self.config['FULLTEXT_EXTRACT_PATH'] + ptree.id2ptree(bibcode)
    self.meta_path = os.path.join(self.extract_dir, 'meta.json')
    self.source_loaded = False
    self.source_content = None
    self.dry_run = False
    self.last_extracted = self.get_last_extracted()
    log.debug("%s last extracted: %s", self.bibcode, self.last_extracted)
def workout_field_value(message):
    sender = str(message.getSender())
    if sender in 'PythonTextField':
        value = message.getParam('externalVal')
        if not value:
            return
        value = str(value)
        message.threadInfo('searching for ' + value)
        vals = {}
        ret = None
        if value:
            parts = value.split('|')
            for p in parts:
                k, v = p.split(':', 1)
                if v[0] == '[' and v[-1] == ']':
                    v = v[1:-1]
                vals[k] = v
        if 'bibcode' in vals and 'src_dir' in vals:
            if vals['src_dir'] == "mongo":
                mongo = pymongo.Connection('adszee')
                docs = mongo['solr4ads']['docs']
                bib = vals['bibcode']
                doc = docs.find_one({'bibcode': bib}, {'body': 1})
                if doc:
                    message.setResults(doc['body'])
                    return
            else:
                dirs = vals['src_dir'].split(',')
                bib = vals['bibcode'].split(',')[0].strip()
                ptree_path = ptree.id2ptree(bib)
                for d in dirs:
                    full_path = d + ptree_path + 'body.txt'
                    message.threadInfo('looking for ' + full_path)
                    if os.path.exists(full_path):
                        fo = open(full_path, 'r')
                        ret = fo.read()
                        message.setResults(ret.decode('utf-8'))
                        return
def get_path_to_htid(self, htid):
    """
    Returns the path to the pairtree directory for this htid.

    Args should include the id namespace, eg:
        dul1.ark:/13960/t00z7x54f
        uc2.ark:/13960/t9p26rn3h
        etc.

    Returns a tuple - (path, postfix)
    """
    ns, post = htid.split('.')
    posttree = ptree.id2ptree(post)
    posttree = posttree.strip('/')  # / at front of string breaks path join
    post = self.encode(post)  # replace :, /, etc.
    l = [self.cpath, ns, 'pairtree_root', posttree, post]
    fullpath = os.path.join(*l)
    if not os.path.exists(fullpath):
        raise ValueError("Is id {} in the collection? Path {} not found."
                         .format(htid, fullpath))
    return fullpath, post
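# Shape of the path assembled above, read directly off the os.path.join
# call:
#
#   <cpath>/<ns>/pairtree_root/<sharded id>/<encoded id>
#
# For 'dul1.ark:/13960/t00z7x54f' the namespace component is 'dul1' and the
# final component is self.encode('ark:/13960/t00z7x54f'); the exact shards
# in the middle depend on how ptree escapes ':' and '/' in the identifier.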
def get_response():
    accept_formats = os.environ.get('HTTP_ACCEPT', [])
    if accept_formats:
        accept_formats = accept_formats.split(',')
    form = cgi.FieldStorage()

    #print 'Content-Type: text/plain\n'
    #print form
    #print 'Content-Type: text/html\n'
    #cgi.print_environ()

    if os.environ['REQUEST_METHOD'] == 'GET':
        identity = None
        filename = 'meta.json'
        if 'id' in form:
            identity = form['id'].value
        if 'f' in form:
            filename = form['f'].value
        if identity:
            ppath = ptree.id2ptree(identity)
            name = os.path.basename(filename)
            url = 'http://data.free103point9.org/r%s%s' % (ppath, name)
            return '''Content-Type: text/plain
Location: %(url)s

%(url)s''' % {'url': url}

    if os.environ['REQUEST_METHOD'] == 'POST':
        if 'id' in form and 'file' in form:
            identity = form['id'].value
            # A nested FieldStorage instance holds the file
            fileitem = form['file']
            if identity and fileitem.filename:
                ppath = ptree.id2ptree(identity)
                home = '../r%s' % ppath
                try:
                    os.makedirs(home)
                except OSError:
                    pass
                # strip leading path from file name to avoid directory traversal attacks
                name = os.path.basename(fileitem.filename)
                f = open('../r%s%s' % (ppath, name), 'wb', 10000)
                # Read the file in chunks
                for chunk in fbuffer(fileitem.file):
                    f.write(chunk)
                f.close()
                log(datetime.now().isoformat() + ' POST ' + identity + ' ' + name)
                message = "The file %s was uploaded successfully" % name
                if 'text/html' in accept_formats:
                    return html_response('<p>%s</p>' % message)
                else:
                    return 'Content-Type: text/plain\n\n%s' % message

    if os.environ['REQUEST_METHOD'] == 'DELETE':
        if 'id' in form and 'filename' in form:
            identity = form['id'].value
            filename = form['filename'].value
            if identity and filename:
                ppath = ptree.id2ptree(identity)
                dir = '../r%s' % ppath
                name = os.path.basename(filename)
                os.remove(dir + name)
                try:
                    os.removedirs(dir)  # remove parent directories if empty
                except OSError:
                    pass
                log(datetime.now().isoformat() + ' DELETE ' + identity + ' ' + name)
                message = "The file %s was deleted successfully" % name
                if 'text/html' in accept_formats:
                    return html_response('<p></p>')
                else:
                    return 'Content-Type: text/plain\n\n%s' % message

    if 'text/html' in accept_formats:
        return html_response("""
            <form enctype="multipart/form-data" method="post">
            <input type="text" name="id">
            <input type="file" name="file">
            <input type="submit">
            </form>""")
    else:
        return 'Content-Type: text/plain\n\ndata.free103point9.org'
#!/usr/bin/env python

import os
import json

import ptree
from internetarchive import search_items, Item

total_bytes = 0

for result in search_items('collection:usda-nurseryandseedcatalog'):
    id = result['identifier']
    item = Item(id)
    metadata = item.get_metadata()
    item_dir = os.path.join('items', ptree.id2ptree(id).lstrip("/"))
    if not os.path.isdir(item_dir):
        os.makedirs(item_dir)
    with open(os.path.join(item_dir, 'metadata.json'), 'w') as fh:
        fh.write(json.dumps(metadata, indent=2))
    total_bytes += sum([f.size for f in item.iter_files()])
    print item_dir

print total_bytes
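# Note the .lstrip("/") above: id2ptree() returns a path with a leading
# separator, and os.path.join('items', '/20/15/...') would treat the second
# argument as absolute and drop the 'items' prefix entirely.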
"""
Check for the existence of the fulltext body and meta.json files

AA 11/1/16
"""
import json
import ptree
import fileinput
import os

from settings import config

if __name__ == '__main__':
    for line in fileinput.input():
        bibcode, fname, provider = line.strip().split()
        f = config['FULLTEXT_EXTRACT_PATH'] + ptree.id2ptree(bibcode)
        if not os.path.exists(f):
            print "{0}: missing_dir {1}".format(bibcode, f)
        meta = f + 'meta.json'
        full = f + 'fulltext.txt'
        if not os.path.exists(meta):
            print "{0} : missing_meta {1}".format(bibcode, meta)
            continue
        if not os.path.exists(full):
            print "{0} : missing_ft {1}".format(bibcode, full)
        try:
            d = json.load(open(meta))
            ts = d['index_date']
        except KeyError:
            print "{0}: missing_date {1}".format(bibcode, meta)