def add_path_entries(file_name, file_path, src_id, uname):
    """ add paths to file_path table """
    try:
        existing = DBSession.query(Filedbentity).filter(
            Filedbentity.display_name == file_name).one_or_none()
        if not existing:
            logging.error('error with ' + file_name)
        path = DBSession.query(Path).filter_by(path=file_path).one_or_none()
        if path is None:
            logging.warning('Could not find path ' + file_path)
        elif existing:
            existing_filepath = DBSession.query(FilePath).filter(
                and_(FilePath.file_id == existing.dbentity_id,
                     FilePath.path_id == path.path_id)).one_or_none()
            if not existing_filepath:
                new_filepath = FilePath(file_id=existing.dbentity_id,
                                        path_id=path.path_id,
                                        source_id=src_id,
                                        created_by=uname)
                DBSession.add(new_filepath)
    except Exception:
        logging.error('Exception occurred', exc_info=True)
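# A minimal usage sketch for add_path_entries, assuming DBSession and the
# Filedbentity/Path/FilePath models come from src.models as in the rest of
# these scripts; the display name and path below are hypothetical examples.
#
#     add_path_entries(file_name='gene_association.sgd.20171204.gz',
#                      file_path='/reports/function',
#                      src_id=SGD_SOURCE_ID,
#                      uname=CREATED_BY)
#     transaction.commit()  # the function only adds to the session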
def create_and_upload_file(obj, row_num):
    try:
        # find the file on the local system
        if DATA_DIR:
            remote_file_path = DATA_DIR + '/' + obj['bun_path']
        else:
            remote_file_path = obj['bun_path']
        remote_file_path = remote_file_path + obj['display_name']
        remote_file = open(remote_file_path, 'rb')
    except IOError:
        logging.error('error opening file ' + str(row_num))
        traceback.print_exc()
        return
    try:
        temp_engine = create_engine(NEX2_URI)
        session_factory = sessionmaker(bind=temp_engine,
                                       extension=ZopeTransactionExtension(),
                                       expire_on_commit=False)
        db_session = scoped_session(session_factory)
        # get README location
        readme_file_id = None
        if len(obj['readme_name']):
            readme = db_session.query(Filedbentity).filter(
                Filedbentity.display_name == obj['readme_name']).one_or_none()
            if readme is None:
                logging.warning('unable to find README ' + obj['readme_name'])
            else:
                readme_file_id = readme.dbentity_id
        # see if the file already exists; if not, create it
        existing = db_session.query(Filedbentity).filter(
            Filedbentity.display_name == obj['display_name']).one_or_none()
        source_id = db_session.query(Source.source_id).filter(
            Source.display_name == obj['source']).one_or_none()[0]
        if not existing:
            try:
                data_id = db_session.query(Edam.edam_id).filter(
                    Edam.edamid == obj['data_edam_id']).one_or_none()[0]
                format_id = db_session.query(Edam.edam_id).filter(
                    Edam.edamid == obj['format_edam_id']).one_or_none()[0]
                topic_id = db_session.query(Edam.edam_id).filter(
                    Edam.edamid == obj['topic_edam_id']).one_or_none()[0]
            except TypeError:
                logging.error('invalid EDAM id or source in row ' +
                              str(row_num) + ' val in ' +
                              obj['data_edam_id'] + ', ' +
                              obj['format_edam_id'] + ', ' +
                              obj['topic_edam_id'])
                return
            logging.debug('remote_file=' + str(remote_file))
            upload_file(CREATED_BY, remote_file,
                        filename=obj['display_name'],
                        file_extension=obj['file_extension'],
                        description=obj['description'].replace('"', ''),
                        display_name=obj['display_name'],
                        data_id=data_id,
                        format_id=format_id,
                        status=obj['status'],
                        topic_id=topic_id,
                        is_public=obj['is_public'],
                        is_in_spell=obj['is_in_spell'],
                        is_in_browser=obj['is_in_browser'],
                        file_date=obj['file_date'],
                        readme_file_id=readme_file_id,
                        source_id=source_id)
            db_session.flush()
        else:
            existing.display_name = obj['display_name']
            existing.description = obj['description']
            existing.status = obj['status']
            existing.is_public = obj['is_public']
            existing.is_in_spell = obj['is_in_spell']
            existing.is_in_browser = obj['is_in_browser']
            existing.source_id = source_id
            # update file size
            if not existing.file_size and existing.s3_url:
                remote_file.seek(0, os.SEEK_END)
                file_size = remote_file.tell()
                remote_file.seek(0)
                existing.file_size = file_size
            if obj['file_date']:
                existing.file_date = obj['file_date']
                existing.year = obj['file_date'].year
            existing.readme_file_id = readme_file_id
            remote_file.seek(0)
        transaction.commit()
        existing = db_session.query(Filedbentity).filter(
            Filedbentity.display_name == obj['display_name']).one_or_none()
        # only upload the S3 file if not already defined
        if existing.s3_url is None:
            existing.upload_file_to_s3(remote_file, obj['display_name'])
            db_session.flush()
        # add path entries
        existing = db_session.query(Filedbentity).filter(
            Filedbentity.display_name == obj['display_name']).one_or_none()
        if not existing:
            logging.error('error with ' + obj['display_name'] + ' in row ' +
                          str(row_num))
            return
        path = db_session.query(Path).filter_by(
            path=obj['new_path']).one_or_none()
        if path is None:
            logging.warning('Could not find path ' + obj['new_path'] +
                            ' in row ' + str(row_num))
            return
        existing_filepath = db_session.query(FilePath).filter(
            and_(FilePath.file_id == existing.dbentity_id,
                 FilePath.path_id == path.path_id)).one_or_none()
        if not existing_filepath:
            new_filepath = FilePath(file_id=existing.dbentity_id,
                                    path_id=path.path_id,
                                    source_id=SGD_SOURCE_ID,
                                    created_by=CREATED_BY)
            db_session.add(new_filepath)
            transaction.commit()
            db_session.flush()
        # maybe add PMIDs
        if len(obj['pmids']):
            existing = db_session.query(Filedbentity).filter(
                Filedbentity.display_name == obj['display_name']).one_or_none()
            pmids = obj['pmids'].split('|')
            for x in pmids:
                x = int(x.strip())
                existing_ref_file = db_session.query(ReferenceFile).filter(
                    ReferenceFile.file_id == existing.dbentity_id).one_or_none()
                ref = db_session.query(Referencedbentity).filter(
                    Referencedbentity.pmid == x).one_or_none()
                if ref and not existing_ref_file:
                    new_ref_file = ReferenceFile(
                        created_by=CREATED_BY,
                        file_id=existing.dbentity_id,
                        reference_id=ref.dbentity_id,
                        source_id=SGD_SOURCE_ID)
                    db_session.add(new_ref_file)
                    transaction.commit()
                    db_session.flush()
        # maybe add keywords
        if len(obj['keywords']):
            existing = db_session.query(Filedbentity).filter(
                Filedbentity.display_name == obj['display_name']).one_or_none()
            keywords = obj['keywords'].split('|')
            for x in keywords:
                x = x.strip()
                keyword = db_session.query(Keyword).filter(
                    Keyword.display_name == x).one_or_none()
                existing_file_keyword = db_session.query(FileKeyword).filter(
                    and_(FileKeyword.file_id == existing.dbentity_id,
                         FileKeyword.keyword_id == keyword.keyword_id)).one_or_none()
                if not existing_file_keyword:
                    new_file_keyword = FileKeyword(
                        created_by=CREATED_BY,
                        file_id=existing.dbentity_id,
                        keyword_id=keyword.keyword_id,
                        source_id=SGD_SOURCE_ID)
                    db_session.add(new_file_keyword)
                    transaction.commit()
                    db_session.flush()
        remote_file.close()
        logging.info('finished ' + obj['display_name'] + ', line ' +
                     str(row_num))
    except Exception:
        logging.error('error with ' + obj['display_name'] + ' in row ' +
                      str(row_num))
        traceback.print_exc()
        db_session.rollback()
        db_session.close()
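# create_and_upload_file consumes one row of upload metadata as a dict. The
# keys below are the ones the function actually reads; the values are
# illustrative only:
#
#     obj = {
#         'bun_path': 'sequence/',              # joined with DATA_DIR and display_name
#         'display_name': 'example_file.tsv',   # also the Filedbentity lookup key
#         'file_extension': 'tsv',
#         'description': 'An example description',
#         'status': 'Active',
#         'is_public': '1',
#         'is_in_spell': '0',
#         'is_in_browser': '0',
#         'file_date': datetime(2017, 12, 4),
#         'source': 'SGD',                      # Source.display_name
#         'data_edam_id': 'EDAM:2048',
#         'format_edam_id': 'EDAM:3475',
#         'topic_edam_id': 'EDAM:0085',
#         'readme_name': '',                    # optional README display_name
#         'new_path': '/reports/function',      # Path.path to link via FilePath
#         'pmids': '',                          # optional '|'-delimited PMIDs
#         'keywords': '',                       # optional '|'-delimited keywords
#     }
#     create_and_upload_file(obj, row_num=2)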
def update_database_load_file_to_s3(nex_session, gzip_file, source_to_id, edam_to_id):
    import hashlib

    local_file = open(gzip_file, mode='rb')
    # md5 of the gzip contents; skip the load if this exact file already exists
    file_md5sum = hashlib.md5(local_file.read()).hexdigest()
    local_file.seek(0)
    row = nex_session.query(Filedbentity).filter_by(
        md5sum=file_md5sum).one_or_none()
    if row is not None:
        return

    # archive the previous generation of this file type
    if "tbl" in gzip_file:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('ncbi_tbl_files.%.tar.gz')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'}, synchronize_session='fetch')
    elif "sqn" in gzip_file:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('ncbi_sqn_files.%.tar.gz')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'}, synchronize_session='fetch')
    else:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('ncbi_gbf_files.%.tar.gz')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'}, synchronize_session='fetch')
    nex_session.commit()

    data_id = edam_to_id.get('EDAM:3671')    ## data:3671    Text
    topic_id = edam_to_id.get('EDAM:0085')   ## topic:0085   Functional genomics
    format_id = edam_to_id.get('EDAM:3507')  ## format:3507  Document format

    if "tbl" in gzip_file:
        from sqlalchemy import create_engine
        from src.models import DBSession
        engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
        DBSession.configure(bind=engine)

    # readme = nex_session.query(Dbentity).filter_by(display_name="ncbi_tab_files.README", dbentity_status='Active').one_or_none()
    # if readme is None:
    #     log.info("ncbi_tbl_files.README is not in the database.")
    #     return
    # readme_file_id = readme.dbentity_id
    readme_file_id = None

    # path.path = /reports/function
    upload_file(CREATED_BY, local_file,
                filename=gzip_file,
                file_extension='gz',
                description='All yeast features in tbl file format',
                display_name=gzip_file,
                data_id=data_id,
                format_id=format_id,
                topic_id=topic_id,
                status='Active',
                readme_file_id=readme_file_id,
                is_public='1',
                is_in_spell='0',
                is_in_browser='0',
                file_date=datetime.now(),
                source_id=source_to_id['SGD'],
                md5sum=file_md5sum)

    file = nex_session.query(Dbentity).filter_by(
        display_name=gzip_file, dbentity_status='Active').one_or_none()
    if file is None:
        log.info("The " + gzip_file + " is not in the database.")
        return
    file_id = file.dbentity_id
    path = nex_session.query(Path).filter_by(
        path="/reports/function").one_or_none()
    if path is None:
        log.info("The path /reports/function is not in the database.")
        return
    path_id = path.path_id
    x = FilePath(file_id=file_id,
                 path_id=path_id,
                 source_id=source_to_id['SGD'],
                 created_by=CREATED_BY)
    nex_session.add(x)
    nex_session.commit()
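# The source_to_id and edam_to_id arguments taken by the loaders above and
# below are plain lookup dicts. A minimal sketch of how they can be built,
# using only columns these scripts already reference (Source.display_name,
# Source.source_id, Edam.edamid, Edam.edam_id):
#
#     source_to_id = dict(nex_session.query(Source.display_name, Source.source_id).all())
#     edam_to_id = dict(nex_session.query(Edam.edamid, Edam.edam_id).all())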
def update_database_load_file_to_s3(nex_session, gaf_file, is_public, source_to_id, edam_to_id, datestamp):
    # gene_association.sgd.20171204.gz
    # gene_association.sgd-yeastmine.20171204.gz
    # datestamp = str(datetime.now()).split(" ")[0].replace("-", "")
    gzip_file = gaf_file + "." + datestamp + ".gz"

    import gzip
    import shutil
    with open(gaf_file, 'rb') as f_in, gzip.open(gzip_file, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

    local_file = open(gzip_file, mode='rb')

    import hashlib
    gaf_md5sum = hashlib.md5(local_file.read()).hexdigest()
    local_file.seek(0)
    row = nex_session.query(Filedbentity).filter_by(
        md5sum=gaf_md5sum).one_or_none()
    if row is not None:
        return

    gzip_file = gzip_file.replace("scripts/dumping/curation/data/", "")

    # nex_session.query(Dbentity).filter_by(display_name=gzip_file, dbentity_status='Active').update({"dbentity_status": 'Archived'})
    if is_public == 1:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('gene_association.sgd%')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'}, synchronize_session='fetch')
        nex_session.commit()

    data_id = edam_to_id.get('EDAM:2048')    ## data:2048    Report
    topic_id = edam_to_id.get('EDAM:0085')   ## topic:0085   Functional genomics
    format_id = edam_to_id.get('EDAM:3475')  ## format:3475  TSV

    readme_file_id = None
    if "yeastmine" not in gaf_file:
        from sqlalchemy import create_engine
        from src.models import DBSession
        engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
        DBSession.configure(bind=engine)
        readme = nex_session.query(Dbentity).filter_by(
            display_name="gene_association.README",
            dbentity_status='Active').one_or_none()
        if readme is None:
            log.info("gene_association.README is not in the database.")
            return
        readme_file_id = readme.dbentity_id

    # path.path = /reports/function
    upload_file(CREATED_BY, local_file,
                filename=gzip_file,
                file_extension='gz',
                description='All GO annotations for yeast genes (protein and RNA) in GAF file format',
                display_name=gzip_file,
                data_id=data_id,
                format_id=format_id,
                topic_id=topic_id,
                status='Active',
                readme_file_id=readme_file_id,
                is_public=is_public,
                is_in_spell='0',
                is_in_browser='0',
                file_date=datetime.now(),
                source_id=source_to_id['SGD'])

    gaf = nex_session.query(Dbentity).filter_by(
        display_name=gzip_file, dbentity_status='Active').one_or_none()
    if gaf is None:
        log.info("The " + gzip_file + " is not in the database.")
        return
    file_id = gaf.dbentity_id
    path = nex_session.query(Path).filter_by(
        path="/reports/function").one_or_none()
    if path is None:
        log.info("The path /reports/function is not in the database.")
        return
    path_id = path.path_id
    x = FilePath(file_id=file_id,
                 path_id=path_id,
                 source_id=source_to_id['SGD'],
                 created_by=CREATED_BY)
    nex_session.add(x)
    nex_session.commit()
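# A hypothetical call to the GAF loader above, reusing the datestamp recipe
# from the commented-out line inside the function:
#
#     datestamp = str(datetime.now()).split(" ")[0].replace("-", "")
#     update_database_load_file_to_s3(nex_session,
#                                     'scripts/dumping/curation/data/gene_association.sgd',
#                                     1,  # is_public
#                                     source_to_id, edam_to_id, datestamp)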
def update_database_load_file_to_s3(nex_session, gff_file, gzip_file, source_to_id, edam_to_id):
    local_file = open(gzip_file, mode='rb')

    ### upload a current GFF file to S3 with a static URL for Go Community ###
    upload_gff_to_s3(local_file, "latest/saccharomyces_cerevisiae.gff.gz")
    ##########################################################################

    import hashlib
    # md5 of the gzip contents; rewind first since upload_gff_to_s3 may have
    # consumed the handle, and skip the load if this exact file already exists
    local_file.seek(0)
    gff_md5sum = hashlib.md5(local_file.read()).hexdigest()
    local_file.seek(0)
    row = nex_session.query(Filedbentity).filter_by(
        md5sum=gff_md5sum).one_or_none()
    if row is not None:
        return

    gzip_file = gzip_file.replace("scripts/dumping/curation/data/", "")

    nex_session.query(Dbentity).filter(
        Dbentity.display_name.like('saccharomyces_cerevisiae.%.gff.gz')).filter(
            Dbentity.dbentity_status == 'Active').update(
                {"dbentity_status": 'Archived'}, synchronize_session='fetch')
    nex_session.commit()

    data_id = edam_to_id.get('EDAM:3671')    ## data:3671    Text
    topic_id = edam_to_id.get('EDAM:3068')   ## topic:3068   Literature and language
    format_id = edam_to_id.get('EDAM:3507')  ## format:3507  Document format

    from sqlalchemy import create_engine
    from src.models import DBSession
    engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
    DBSession.configure(bind=engine)

    readme = nex_session.query(Dbentity).filter_by(
        display_name="saccharomyces_cerevisiae_gff.README",
        dbentity_status='Active').one_or_none()
    if readme is None:
        log.info("saccharomyces_cerevisiae_gff.README is not in the database.")
        return
    readme_file_id = readme.dbentity_id

    # path.path = /reports/chromosomal-features
    upload_file(CREATED_BY, local_file,
                filename=gzip_file,
                file_extension='gz',
                description='GFF file for yeast genes (protein and RNA)',
                display_name=gzip_file,
                data_id=data_id,
                format_id=format_id,
                topic_id=topic_id,
                status='Active',
                readme_file_id=readme_file_id,
                is_public='1',
                is_in_spell='0',
                is_in_browser='0',
                file_date=datetime.now(),
                source_id=source_to_id['SGD'],
                md5sum=gff_md5sum)

    gff = nex_session.query(Dbentity).filter_by(
        display_name=gzip_file, dbentity_status='Active').one_or_none()
    if gff is None:
        log.info("The " + gzip_file + " is not in the database.")
        return
    file_id = gff.dbentity_id
    path = nex_session.query(Path).filter_by(
        path="/reports/chromosomal-features").one_or_none()
    if path is None:
        log.info("The path: /reports/chromosomal-features is not in the database.")
        return
    path_id = path.path_id
    x = FilePath(file_id=file_id,
                 path_id=path_id,
                 source_id=source_to_id['SGD'],
                 created_by=CREATED_BY)
    nex_session.add(x)
    nex_session.commit()
    log.info("Done uploading " + gff_file)
def update_database_load_file_to_s3(nex_session, data_file, gzip_file, source_to_id, edam_to_id):
    local_file = open(gzip_file, mode='rb')

    import hashlib
    # md5 of the gzip contents; skip the load if this exact file already exists
    gff_md5sum = hashlib.md5(local_file.read()).hexdigest()
    local_file.seek(0)
    row = nex_session.query(Filedbentity).filter_by(
        md5sum=gff_md5sum).one_or_none()
    if row is not None:
        return

    gzip_file = gzip_file.replace("scripts/dumping/ncbi/data/", "")

    nex_session.query(Dbentity).filter(
        Dbentity.display_name.like('RNAcentral.%.json.gz')).filter(
            Dbentity.dbentity_status == 'Active').update(
                {"dbentity_status": 'Archived'}, synchronize_session='fetch')
    nex_session.commit()

    data_id = edam_to_id.get('EDAM:3495')    # data:3495   RNA sequence
    topic_id = edam_to_id.get('EDAM:0099')   # topic:0099  RNA
    format_id = edam_to_id.get('EDAM:3464')  # format:3464 JSON format

    from sqlalchemy import create_engine
    from src.models import DBSession
    engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
    DBSession.configure(bind=engine)

    upload_file(CREATED_BY, local_file,
                filename=gzip_file,
                file_extension='gz',
                description='JSON file for yeast RNA genes',
                display_name=gzip_file,
                data_id=data_id,
                format_id=format_id,
                topic_id=topic_id,
                status='Active',
                readme_file_id=None,
                is_public='1',
                is_in_spell='0',
                is_in_browser='0',
                file_date=datetime.now(),
                source_id=source_to_id['SGD'],
                md5sum=gff_md5sum)

    rnaFile = nex_session.query(Dbentity).filter_by(
        display_name=gzip_file, dbentity_status='Active').one_or_none()
    if rnaFile is None:
        log.info("The " + gzip_file + " is not in the database.")
        return
    file_id = rnaFile.dbentity_id
    path = nex_session.query(Path).filter_by(
        path="/reports/chromosomal-features").one_or_none()
    if path is None:
        log.info("The path: /reports/chromosomal-features is not in the database.")
        return
    path_id = path.path_id
    x = FilePath(file_id=file_id,
                 path_id=path_id,
                 source_id=source_to_id['SGD'],
                 created_by=CREATED_BY)
    nex_session.add(x)
    nex_session.commit()
    log.info("Done uploading " + data_file)
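# A hypothetical invocation of the RNAcentral loader above, assuming the JSON
# dump already exists on disk and is gzipped the same way as in the GAF
# loader; the paths and datestamp are illustrative:
#
#     import gzip, shutil
#     data_file = 'scripts/dumping/ncbi/data/RNAcentral.json'
#     gzip_file = 'scripts/dumping/ncbi/data/RNAcentral.' + datestamp + '.json.gz'
#     with open(data_file, 'rb') as f_in, gzip.open(gzip_file, 'wb') as f_out:
#         shutil.copyfileobj(f_in, f_out)
#     update_database_load_file_to_s3(nex_session, data_file, gzip_file,
#                                     source_to_id, edam_to_id)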