def upload_file_helper(CREATED_BY, remote_file, obj, full_file_path):
    """
    Upload a file to S3 and update the database with its s3_url.

    Thin wrapper around upload_file() that unpacks the metadata dict.
    Failures are logged (with traceback) and swallowed -- callers are not
    notified, so a batch run continues past a bad file.

    Parameters
    ----------
    CREATED_BY: str
    remote_file: file obj
    obj: dict
        Metadata for the file; keys used: display_name, file_extension,
        description, data_id, format_id, status, topic_id, is_public,
        is_in_spell, is_in_browser, file_date, readme_file_id, source_id.
    full_file_path: str
    """
    try:
        upload_file(CREATED_BY, remote_file,
                    filename=obj['display_name'],
                    file_extension=obj['file_extension'],
                    description=obj['description'],
                    display_name=obj['display_name'],
                    data_id=obj['data_id'],
                    format_id=obj['format_id'],
                    status=obj['status'],
                    topic_id=obj['topic_id'],
                    is_public=obj['is_public'],
                    is_in_spell=obj['is_in_spell'],
                    is_in_browser=obj['is_in_browser'],
                    file_date=obj['file_date'],
                    readme_file_id=obj['readme_file_id'],
                    source_id=obj['source_id'],
                    full_file_path=full_file_path)
    except Exception as e:
        # Deliberate best-effort: log with traceback, do not re-raise.
        logging.error("Exception occurred", exc_info=True)
def update_database_load_file_to_s3(nex_session, ontology_file, source_to_id, edam_to_id):
    """Gzip the ontology file, upload it to S3 and register it in the database.

    Skips the upload when a Filedbentity row with the same content md5sum
    already exists; otherwise archives the previously Active copy first.

    Parameters
    ----------
    nex_session : DB session used for Filedbentity/Dbentity queries
    ontology_file : str
        Path of the uncompressed ontology file to load.
    source_to_id : dict mapping source display_name -> source_id
    edam_to_id : dict mapping 'EDAM:NNNN' -> edam_id
    """
    gzip_file = ontology_file + ".gz"
    import gzip
    import shutil
    with open(ontology_file, 'rb') as f_in, gzip.open(gzip_file, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    local_file = open(gzip_file, mode='rb')
    import hashlib
    # BUG FIX: hash the file *contents*, not the file *name*
    # (was hashlib.md5(ontology_file.encode())).  A name-based digest never
    # changes, so after the first load every newer version of the ontology
    # would be skipped as "already in the database".  The uncompressed file
    # is hashed because gzip output embeds a timestamp and is not stable.
    with open(ontology_file, 'rb') as f_in:
        go_md5sum = hashlib.md5(f_in.read()).hexdigest()
    go_row = nex_session.query(Filedbentity).filter_by(
        md5sum=go_md5sum).one_or_none()
    if go_row is not None:
        # Identical content already loaded; nothing to do.
        return
    # Archive the previously active copy before uploading the new one.
    nex_session.query(Dbentity).filter_by(
        display_name=gzip_file, dbentity_status='Active').update(
            {"dbentity_status": 'Archived'})
    nex_session.commit()

    data_id = edam_to_id.get('EDAM:2353')    ## data:2353 Ontology data
    topic_id = edam_to_id.get('EDAM:0089')   ## topic:0089 Ontology and terminology
    format_id = edam_to_id.get('EDAM:3262')  ## format:3262 OWL/XML

    from sqlalchemy import create_engine
    from src.models import DBSession
    engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
    DBSession.configure(bind=engine)

    upload_file(CREATED_BY, local_file,
                filename=gzip_file,
                file_extension='gz',
                description='Core Gene Ontology in OWL RDF/XML format',
                display_name=gzip_file,
                data_id=data_id,
                format_id=format_id,
                topic_id=topic_id,
                status='Active',
                is_public='0',
                is_in_spell='0',
                is_in_browser='0',
                file_date=datetime.now(),
                source_id=source_to_id['SGD'],
                md5sum=go_md5sum)
def update_database_load_file_to_s3(nex_session, data_file, sgd_source_id, edam_to_id):
    """Upload the gene2accession subset file to S3 and register it in the database.

    Skips the upload when a Filedbentity row with the same content md5sum
    already exists; otherwise archives the previously Active copy first.

    Parameters
    ----------
    nex_session : DB session used for Filedbentity/Dbentity queries
    data_file : str
        Path of the TSV file to load; only its basename is stored.
    sgd_source_id : int
    edam_to_id : dict mapping 'EDAM:NNNN' -> edam_id
    """
    # BUG FIX: open in binary mode -- hashlib.md5() requires bytes; in text
    # mode read() returns str and md5() raises TypeError under Python 3.
    local_file = open(data_file, mode='rb')
    import hashlib
    dx_md5sum = hashlib.md5(local_file.read()).hexdigest()
    # BUG FIX: rewind after the md5 read, otherwise upload_file() below
    # receives an exhausted handle and uploads an empty file.
    local_file.seek(0)
    dx_row = nex_session.query(Filedbentity).filter_by(
        md5sum=dx_md5sum).one_or_none()
    if dx_row is not None:
        # Identical content already loaded; nothing to do.
        return
    log.info("Uploading the file to S3...")
    data_file = data_file.split('/').pop()  # keep only the basename
    nex_session.query(Dbentity).filter_by(
        display_name=data_file, dbentity_status='Active').update(
            {"dbentity_status": 'Archived'})
    nex_session.commit()

    data_id = edam_to_id.get('EDAM:2872')    ## data:2872 ID list
    topic_id = edam_to_id.get('EDAM:3345')   ## topic:3345 Data identity and mapping
    format_id = edam_to_id.get('EDAM:3475')  ## format:3475 TSV

    from sqlalchemy import create_engine
    from src.models import DBSession
    engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
    DBSession.configure(bind=engine)

    upload_file(
        CREATED_BY, local_file,
        filename=data_file,
        file_extension='txt',
        description='subset of NCBI gene2accession file for taxon ID 559292',
        display_name=data_file,
        data_id=data_id,
        format_id=format_id,
        topic_id=topic_id,
        status='Active',
        is_public='0',
        is_in_spell='0',
        is_in_browser='0',
        file_date=datetime.now(),
        source_id=sgd_source_id)
def create_and_upload_file(obj, row_num):
    """Create or update one Filedbentity from a parsed metadata row, upload
    the file to S3, then attach its path, PMID and keyword associations.

    NOTE(review): commit/flush ordering and the repeated re-fetches of
    `existing` after each transaction.commit() are load-bearing (commits
    expire/detach ORM instances) -- do not reorder.

    Parameters
    ----------
    obj : dict
        One parsed row; keys used: bun_path, display_name, readme_name,
        source, data_edam_id, format_edam_id, topic_edam_id, file_extension,
        description, status, is_public, is_in_spell, is_in_browser,
        file_date, new_path, pmids, keywords.
    row_num : int
        Source row number, used only for log messages.
    """
    try:
        # find on local system
        if DATA_DIR:
            remote_file_path = DATA_DIR + '/' + obj['bun_path']
        else:
            remote_file_path = obj['bun_path']
        remote_file_path = remote_file_path + obj['display_name']
        remote_file = open(remote_file_path)
    except IOError:
        logging.error('error opening file ' + str(row_num))
        traceback.print_exc()
        return
    try:
        # Fresh engine/session per file so one failure does not poison others.
        temp_engine = create_engine(NEX2_URI)
        session_factory = sessionmaker(bind=temp_engine,
                                       extension=ZopeTransactionExtension(),
                                       expire_on_commit=False)
        db_session = scoped_session(session_factory)
        # get README location
        readme_file_id = None
        if len(obj['readme_name']):
            readme = db_session.query(Filedbentity).filter(
                Filedbentity.display_name == obj['readme_name']).one_or_none()
            if readme is None:
                logging.warning('unable to find README ' + obj['readme_name'])
            else:
                readme_file_id = readme.dbentity_id
        # see if already exists, if not create
        existing = db_session.query(Filedbentity).filter(
            Filedbentity.display_name == obj['display_name']).one_or_none()
        source_id = db_session.query(Source.source_id).filter(
            Source.display_name == obj['source']).one_or_none()[0]
        if not existing:
            try:
                data_id = db_session.query(Edam.edam_id).filter(
                    Edam.edamid == obj['data_edam_id']).one_or_none()[0]
                format_id = db_session.query(Edam.edam_id).filter(
                    Edam.edamid == obj['format_edam_id']).one_or_none()[0]
                topic_id = db_session.query(Edam.edam_id).filter(
                    Edam.edamid == obj['topic_edam_id']).one_or_none()[0]
            except TypeError:
                # one_or_none() returned None (unknown EDAM id), so the [0]
                # subscript raised TypeError.
                logging.error('invalid EDAM id or source in row ' +
                              str(row_num) + ' val in ' +
                              obj['data_edam_id'] + ', ' +
                              obj['format_edam_id'] + ', ' +
                              obj['topic_edam_id'])
                return
            print("remote_file=", remote_file)
            upload_file(CREATED_BY, remote_file,
                        filename=obj['display_name'],
                        file_extension=obj['file_extension'],
                        description=obj['description'].replace('"', ''),
                        display_name=obj['display_name'],
                        data_id=data_id,
                        format_id=format_id,
                        status=obj['status'],
                        topic_id=topic_id,
                        is_public=obj['is_public'],
                        is_in_spell=obj['is_in_spell'],
                        is_in_browser=obj['is_in_browser'],
                        file_date=obj['file_date'],
                        readme_file_id=readme_file_id,
                        source_id=source_id)
            db_session.flush()
        else:
            # Update the metadata of the already-registered file in place.
            existing.display_name = obj['display_name']
            existing.description = obj['description']
            existing.status = obj['status']
            existing.is_public = obj['is_public']
            existing.is_in_spell = obj['is_in_spell']
            existing.is_in_browser = obj['is_in_browser']
            existing.source_id = source_id
            # update file size
            if not existing.file_size and existing.s3_url:
                remote_file.seek(0, os.SEEK_END)
                file_size = remote_file.tell()
                remote_file.seek(0)
                existing.file_size = file_size
            if obj['file_date']:
                existing.file_date = obj['file_date']
                existing.year = obj['file_date'].year
            existing.readme_file_id = readme_file_id
            remote_file.seek(0, os.SEEK_END)
            transaction.commit()
            # Re-fetch: the commit detaches/expires the previous instance.
            existing = db_session.query(Filedbentity).filter(
                Filedbentity.display_name == obj['display_name']).one_or_none()
            # only upload s3 file if not defined
            if existing.s3_url is None:
                existing.upload_file_to_s3(remote_file, obj['display_name'])
            db_session.flush()
        # add path entries
        existing = db_session.query(Filedbentity).filter(
            Filedbentity.display_name == obj['display_name']).one_or_none()
        if not existing:
            logging.error('error with ' + obj['display_name'] + ' in row ' +
                          str(row_num))
            return
        path = db_session.query(Path).filter_by(
            path=obj['new_path']).one_or_none()
        if path is None:
            logging.warning('Could not find path ' + obj['new_path'] +
                            ' in row ' + str(row_num))
            return
        existing_filepath = db_session.query(FilePath).filter(
            and_(FilePath.file_id == existing.dbentity_id,
                 FilePath.path_id == path.path_id)).one_or_none()
        if not existing_filepath:
            new_filepath = FilePath(file_id=existing.dbentity_id,
                                    path_id=path.path_id,
                                    source_id=SGD_SOURCE_ID,
                                    created_by=CREATED_BY)
            db_session.add(new_filepath)
            transaction.commit()
            db_session.flush()
        # maybe add PMIDs
        if len(obj['pmids']):
            existing = db_session.query(Filedbentity).filter(
                Filedbentity.display_name == obj['display_name']).one_or_none()
            pmids = obj['pmids'].split('|')
            for x in pmids:
                x = int(x.strip())
                existing_ref_file = db_session.query(ReferenceFile).filter(
                    ReferenceFile.file_id ==
                    existing.dbentity_id).one_or_none()
                ref = db_session.query(Referencedbentity).filter(
                    Referencedbentity.pmid == x).one_or_none()
                if ref and not existing_ref_file:
                    new_ref_file = ReferenceFile(
                        created_by=CREATED_BY,
                        file_id=existing.dbentity_id,
                        reference_id=ref.dbentity_id,
                        source_id=SGD_SOURCE_ID)
                    db_session.add(new_ref_file)
                    transaction.commit()
                    db_session.flush()
        # maybe add keywords
        if len(obj['keywords']):
            existing = db_session.query(Filedbentity).filter(
                Filedbentity.display_name == obj['display_name']).one_or_none()
            keywords = obj['keywords'].split('|')
            for x in keywords:
                x = x.strip()
                keyword = db_session.query(Keyword).filter(
                    Keyword.display_name == x).one_or_none()
                existing_file_keyword = db_session.query(FileKeyword).filter(
                    and_(FileKeyword.file_id == existing.dbentity_id,
                         FileKeyword.keyword_id ==
                         keyword.keyword_id)).one_or_none()
                if not existing_file_keyword:
                    new_file_keyword = FileKeyword(
                        created_by=CREATED_BY,
                        file_id=existing.dbentity_id,
                        keyword_id=keyword.keyword_id,
                        source_id=SGD_SOURCE_ID)
                    db_session.add(new_file_keyword)
                    transaction.commit()
                    db_session.flush()
        remote_file.close()
        logging.info('finished ' + obj['display_name'] + ', line ' +
                     str(row_num))
    except:
        logging.error('error with ' + obj['display_name'] + ' in row ' +
                      str(row_num))
        traceback.print_exc()
        db_session.rollback()
        db_session.close()
def update_database_load_file_to_s3(nex_session, gzip_file, source_to_id, edam_to_id):
    """Upload an NCBI tbl/sqn/gbf tarball to S3 and register it in the database.

    Archives the previously Active tarball of the same family
    (ncbi_tbl_files / ncbi_sqn_files / ncbi_gbf_files); only tbl tarballs are
    then uploaded and linked to the /reports/function path.

    Parameters
    ----------
    nex_session : DB session used for Filedbentity/Dbentity/Path queries
    gzip_file : str
        Path of the tarball to load; its name selects the tbl/sqn/gbf branch.
    source_to_id : dict mapping source display_name -> source_id
    edam_to_id : dict mapping 'EDAM:NNNN' -> edam_id
    """
    local_file = open(gzip_file, mode='rb')
    import hashlib
    file_md5sum = hashlib.md5(local_file.read()).hexdigest()
    # BUG FIX: rewind after the md5 read; otherwise upload_file() below
    # receives an exhausted handle and uploads an empty file.
    local_file.seek(0)
    row = nex_session.query(Filedbentity).filter_by(
        md5sum=file_md5sum).one_or_none()
    if row is not None:
        # Identical content already loaded; nothing to do.
        return
    # Archive whichever family of tarballs this file belongs to.
    if "tbl" in gzip_file:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('ncbi_tbl_files.%.tar.gz')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'},
                    synchronize_session='fetch')
    elif "sqn" in gzip_file:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('ncbi_sqn_files.%.tar.gz')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'},
                    synchronize_session='fetch')
    else:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('ncbi_gbf_files.%.tar.gz')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'},
                    synchronize_session='fetch')
    nex_session.commit()

    data_id = edam_to_id.get('EDAM:3671')    ## data:3671 Text
    topic_id = edam_to_id.get('EDAM:0085')   ## topic:0085 Functional genomics
    format_id = edam_to_id.get('EDAM:3507')  ## format:3507 Document format

    if "tbl" in gzip_file:
        from sqlalchemy import create_engine
        from src.models import DBSession
        engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
        DBSession.configure(bind=engine)
        # readme = nex_session.query(Dbentity).filter_by(display_name="ncbi_tab_files.README", dbentity_status='Active').one_or_none()
        # if readme is None:
        #     log.info("ncbi_tbl_files.README is not in the database.")
        #     return
        # readme_file_id = readme.dbentity_id
        readme_file_id = None
        # path.path = /reports/function
        upload_file(CREATED_BY, local_file,
                    filename=gzip_file,
                    file_extension='gz',
                    description='All yeast features in tbl file format',
                    display_name=gzip_file,
                    data_id=data_id,
                    format_id=format_id,
                    topic_id=topic_id,
                    status='Active',
                    readme_file_id=readme_file_id,
                    is_public='1',
                    is_in_spell='0',
                    is_in_browser='0',
                    file_date=datetime.now(),
                    source_id=source_to_id['SGD'],
                    md5sum=file_md5sum)
        # Renamed from `file` to avoid shadowing the builtin.
        file_row = nex_session.query(Dbentity).filter_by(
            display_name=gzip_file, dbentity_status='Active').one_or_none()
        if file_row is None:
            log.info("The " + gzip_file + " is not in the database.")
            return
        file_id = file_row.dbentity_id
        path = nex_session.query(Path).filter_by(
            path="/reports/function").one_or_none()
        if path is None:
            log.info("The path /reports/function is not in the database.")
            return
        path_id = path.path_id
        x = FilePath(file_id=file_id,
                     path_id=path_id,
                     source_id=source_to_id['SGD'],
                     created_by=CREATED_BY)
        nex_session.add(x)
        nex_session.commit()
def update_database_load_file_to_s3(nex_session, gaf_file, is_public, source_to_id, edam_to_id, datestamp):
    """Gzip a GAF file, upload it to S3 and register it in the database.

    Produces names such as gene_association.sgd.20171204.gz or
    gene_association.sgd-yeastmine.20171204.gz.  Only the non-yeastmine file
    is uploaded and linked to the /reports/function path.

    Parameters
    ----------
    nex_session : DB session used for Filedbentity/Dbentity/Path queries
    gaf_file : str
        Path of the uncompressed GAF file.
    is_public : int
        1 archives the previously Active gene_association.sgd% copies.
    source_to_id : dict mapping source display_name -> source_id
    edam_to_id : dict mapping 'EDAM:NNNN' -> edam_id
    datestamp : str, e.g. "20171204"
    """
    # gene_association.sgd.20171204.gz
    # gene_association.sgd-yeastmine.20171204.gz
    gzip_file = gaf_file + "." + datestamp + ".gz"
    import gzip
    import shutil
    with open(gaf_file, 'rb') as f_in, gzip.open(gzip_file, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    # BUG FIX: open the gzipped file in *binary* mode.  Text mode raises
    # UnicodeDecodeError on the compressed bytes, and hashlib.md5() needs
    # bytes in any case.
    local_file = open(gzip_file, mode='rb')
    import hashlib
    gaf_md5sum = hashlib.md5(local_file.read()).hexdigest()
    # BUG FIX: rewind after the md5 read so upload_file() sees the whole file.
    local_file.seek(0)
    row = nex_session.query(Filedbentity).filter_by(
        md5sum=gaf_md5sum).one_or_none()
    if row is not None:
        # Identical content already loaded; nothing to do.
        return
    gzip_file = gzip_file.replace("scripts/dumping/curation/data/", "")
    # nex_session.query(Dbentity).filter_by(display_name=gzip_file, dbentity_status='Active').update({"dbentity_status": 'Archived'})
    if is_public == 1:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('gene_association.sgd%')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'},
                    synchronize_session='fetch')
        nex_session.commit()

    data_id = edam_to_id.get('EDAM:2048')    ## data:2048 Report
    topic_id = edam_to_id.get('EDAM:0085')   ## topic:0085 Functional genomics
    format_id = edam_to_id.get('EDAM:3475')  ## format:3475 TSV

    if "yeastmine" not in gaf_file:
        from sqlalchemy import create_engine
        from src.models import DBSession
        engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
        DBSession.configure(bind=engine)
        readme = nex_session.query(Dbentity).filter_by(
            display_name="gene_association.README",
            dbentity_status='Active').one_or_none()
        if readme is None:
            log.info("gene_association.README is not in the database.")
            return
        readme_file_id = readme.dbentity_id
        # path.path = /reports/function
        upload_file(
            CREATED_BY, local_file,
            filename=gzip_file,
            file_extension='gz',
            description=
            'All GO annotations for yeast genes (protein and RNA) in GAF file format',
            display_name=gzip_file,
            data_id=data_id,
            format_id=format_id,
            topic_id=topic_id,
            status='Active',
            readme_file_id=readme_file_id,
            is_public=is_public,
            is_in_spell='0',
            is_in_browser='0',
            file_date=datetime.now(),
            source_id=source_to_id['SGD'])
        gaf = nex_session.query(Dbentity).filter_by(
            display_name=gzip_file, dbentity_status='Active').one_or_none()
        if gaf is None:
            log.info("The " + gzip_file + " is not in the database.")
            return
        file_id = gaf.dbentity_id
        path = nex_session.query(Path).filter_by(
            path="/reports/function").one_or_none()
        if path is None:
            log.info("The path /reports/function is not in the database.")
            return
        path_id = path.path_id
        x = FilePath(file_id=file_id,
                     path_id=path_id,
                     source_id=source_to_id['SGD'],
                     created_by=CREATED_BY)
        nex_session.add(x)
        nex_session.commit()
def update_database_load_file_to_s3(nex_session, gpad_file, gpi_file, source_to_id, edam_to_id):
    """Upload the GPAD and GPI files to S3 and register them in the database.

    Each file is skipped individually when a Filedbentity row with the same
    content md5sum already exists; otherwise the previously Active copy is
    archived before the new upload.

    Parameters
    ----------
    nex_session : DB session used for Filedbentity/Dbentity queries
    gpad_file : str  -- path of the GPAD file
    gpi_file : str   -- path of the GPI file
    source_to_id : dict mapping source display_name -> source_id
    edam_to_id : dict mapping 'EDAM:NNNN' -> edam_id
    """
    import hashlib
    gpad_local_file = open(gpad_file, mode='rb')
    gpi_local_file = open(gpi_file, mode='rb')
    # BUG FIX: hash the file *contents*, not the file *name* strings
    # (was hashlib.md5(gpad_file.encode())).  A name-based digest never
    # changes, so updated files would always be skipped as already loaded.
    gpad_md5sum = hashlib.md5(gpad_local_file.read()).hexdigest()
    gpi_md5sum = hashlib.md5(gpi_local_file.read()).hexdigest()
    # Rewind both handles so upload_file() below sees the full content.
    gpad_local_file.seek(0)
    gpi_local_file.seek(0)
    gpad_row = nex_session.query(Filedbentity).filter_by(
        md5sum=gpad_md5sum).one_or_none()
    gpi_row = nex_session.query(Filedbentity).filter_by(
        md5sum=gpi_md5sum).one_or_none()
    if gpad_row is not None and gpi_row is not None:
        # Both files already loaded with identical content; nothing to do.
        return
    if gpad_row is None:
        nex_session.query(Dbentity).filter_by(
            display_name=gpad_file,
            dbentity_status='Active').update({"dbentity_status": 'Archived'})
        nex_session.commit()
    if gpi_row is None:
        nex_session.query(Dbentity).filter_by(
            display_name=gpi_file,
            dbentity_status='Active').update({"dbentity_status": 'Archived'})
        nex_session.commit()

    data_id = edam_to_id.get('EDAM:2353')    ## data:2353 Ontology data
    topic_id = edam_to_id.get('EDAM:0089')   ## topic:0089 Ontology and terminology
    format_id = edam_to_id.get('EDAM:3475')  ## format:3475 TSV

    from sqlalchemy import create_engine
    from src.models import DBSession
    engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
    DBSession.configure(bind=engine)

    if gpad_row is None:
        upload_file(CREATED_BY, gpad_local_file,
                    filename=gpad_file,
                    # CONSISTENCY FIX: was '.gz'; normalized to 'gz' to match
                    # the GPI call below and every sibling loader.
                    file_extension='gz',
                    description='Gene Product Association Data (GPAD)',
                    display_name=gpad_file,
                    data_id=data_id,
                    format_id=format_id,
                    topic_id=topic_id,
                    status='Active',
                    is_public='0',
                    is_in_spell='0',
                    is_in_browser='0',
                    file_date=datetime.now(),
                    source_id=source_to_id['SGD'],
                    md5sum=gpad_md5sum)
    if gpi_row is None:
        upload_file(CREATED_BY, gpi_local_file,
                    filename=gpi_file,
                    file_extension='gz',
                    description='Gene Product Information (GPI)',
                    display_name=gpi_file,
                    data_id=data_id,
                    format_id=format_id,
                    topic_id=topic_id,
                    status='Active',
                    is_public='0',
                    is_in_spell='0',
                    is_in_browser='0',
                    file_date=datetime.now(),
                    source_id=source_to_id['SGD'],
                    md5sum=gpi_md5sum)
def update_database_load_file_to_s3(nex_session, gff_file, gzip_file, source_to_id, edam_to_id):
    """Upload the gzipped GFF file to S3 and register it in the database.

    Also pushes a copy to the static "latest" S3 key for the GO community,
    archives previously Active saccharomyces_cerevisiae.*.gff.gz copies, and
    links the new file to the /reports/chromosomal-features path.

    Parameters
    ----------
    nex_session : DB session used for Filedbentity/Dbentity/Path queries
    gff_file : str  -- original GFF path (used only in the final log line)
    gzip_file : str -- path of the gzipped GFF to load
    source_to_id : dict mapping source display_name -> source_id
    edam_to_id : dict mapping 'EDAM:NNNN' -> edam_id
    """
    local_file = open(gzip_file, mode='rb')

    ### upload a current GFF file to S3 with a static URL for Go Community ###
    upload_gff_to_s3(local_file, "latest/saccharomyces_cerevisiae.gff.gz")
    ##########################################################################

    import hashlib
    # BUG FIX: hash the file *contents*, not the path string
    # (was hashlib.md5(gzip_file.encode()) -- a name-based digest never
    # changes, so new GFF releases would be skipped as already loaded).
    # Rewind first: upload_gff_to_s3() above may have consumed the handle.
    local_file.seek(0)
    gff_md5sum = hashlib.md5(local_file.read()).hexdigest()
    # BUG FIX: rewind again so upload_file() below sees the whole file.
    local_file.seek(0)
    row = nex_session.query(Filedbentity).filter_by(
        md5sum=gff_md5sum).one_or_none()
    if row is not None:
        # Identical content already loaded; nothing to do.
        return
    gzip_file = gzip_file.replace("scripts/dumping/curation/data/", "")
    nex_session.query(Dbentity).filter(
        Dbentity.display_name.like('saccharomyces_cerevisiae.%.gff.gz')).filter(
            Dbentity.dbentity_status == 'Active').update(
                {"dbentity_status": 'Archived'}, synchronize_session='fetch')
    nex_session.commit()

    data_id = edam_to_id.get('EDAM:3671')    ## data:3671 Text
    topic_id = edam_to_id.get('EDAM:3068')   ## topic:3068 Literature and language
    format_id = edam_to_id.get('EDAM:3507')  ## format:3507 Document format

    from sqlalchemy import create_engine
    from src.models import DBSession
    engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
    DBSession.configure(bind=engine)

    readme = nex_session.query(Dbentity).filter_by(
        display_name="saccharomyces_cerevisiae_gff.README",
        dbentity_status='Active').one_or_none()
    if readme is None:
        log.info("saccharomyces_cerevisiae_gff.README is not in the database.")
        return
    readme_file_id = readme.dbentity_id

    # path.path = /reports/chromosomal-features
    upload_file(CREATED_BY, local_file,
                filename=gzip_file,
                file_extension='gz',
                description='GFF file for yeast genes (protein and RNA)',
                display_name=gzip_file,
                data_id=data_id,
                format_id=format_id,
                topic_id=topic_id,
                status='Active',
                readme_file_id=readme_file_id,
                is_public='1',
                is_in_spell='0',
                is_in_browser='0',
                file_date=datetime.now(),
                source_id=source_to_id['SGD'],
                md5sum=gff_md5sum)

    gff = nex_session.query(Dbentity).filter_by(
        display_name=gzip_file, dbentity_status='Active').one_or_none()
    if gff is None:
        log.info("The " + gzip_file + " is not in the database.")
        return
    file_id = gff.dbentity_id
    path = nex_session.query(Path).filter_by(
        path="/reports/chromosomal-features").one_or_none()
    if path is None:
        log.info("The path: /reports/chromosomal-features is not in the database.")
        return
    path_id = path.path_id
    x = FilePath(file_id=file_id,
                 path_id=path_id,
                 source_id=source_to_id['SGD'],
                 created_by=CREATED_BY)
    nex_session.add(x)
    nex_session.commit()
    log.info("Done uploading " + gff_file)
def update_database_load_file_to_s3(nex_session, go_file, source_to_id, edam_to_id, ENGINE_CREATED):
    """Upload a GPAD/GPI/noctua GO file to S3 and register it in the database.

    Skips the upload when a Filedbentity row with the same content md5sum
    already exists; otherwise archives the previously Active copy of the
    matching family (gp_association / gp_information / noctua_sgd.gpad).

    Parameters
    ----------
    nex_session : DB session used for Filedbentity/Dbentity queries
    go_file : str
        Path of the file; its name selects the description and archive branch.
    source_to_id : dict mapping source display_name -> source_id
    edam_to_id : dict mapping 'EDAM:NNNN' -> edam_id
    ENGINE_CREATED : unused here (the engine is configured unconditionally;
        see the commented-out guard below)
    """
    import hashlib
    desc = "Gene Product Association Data (GPAD)"
    if "gp_information" in go_file:
        desc = "Gene Product Information (GPI)"
    go_local_file = open(go_file, mode='rb')
    # BUG FIX: hash the file *contents*, not the file *name*
    # (was hashlib.md5(go_file.encode())).  A name-based digest never
    # changes, so updated files would always be skipped as already loaded.
    go_md5sum = hashlib.md5(go_local_file.read()).hexdigest()
    # Rewind so upload_file() below sees the whole file.
    go_local_file.seek(0)
    go_row = nex_session.query(Filedbentity).filter_by(
        md5sum=go_md5sum).one_or_none()
    if go_row is not None:
        log.info("The current version of " + go_file +
                 " is already in the database.\n")
        return
    log.info("Adding " + go_file + " to the database.\n")
    if "gp_association" in go_file:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('gp_association.559292_sgd%')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'},
                    synchronize_session='fetch')
    elif "gp_information" in go_file:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('gp_information.559292_sgd%')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'},
                    synchronize_session='fetch')
    elif "noctua_sgd.gpad" in go_file:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('noctua_sgd.gpad%')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'},
                    synchronize_session='fetch')
    nex_session.commit()

    data_id = edam_to_id.get('EDAM:2353')    ## data:2353 Ontology data
    topic_id = edam_to_id.get('EDAM:0089')   ## topic:0089 Ontology and terminology
    format_id = edam_to_id.get('EDAM:3475')  ## format:3475 TSV

    # if ENGINE_CREATED == 0:
    from sqlalchemy import create_engine
    from src.models import DBSession
    engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
    DBSession.configure(bind=engine)

    # go_row is always None here (we returned above otherwise); the guard is
    # kept from the original for safety.
    if go_row is None:
        upload_file(CREATED_BY, go_local_file,
                    filename=go_file,
                    # CONSISTENCY FIX: was '.gz'; normalized to 'gz' to match
                    # every sibling loader in this file.
                    file_extension='gz',
                    description=desc,
                    display_name=go_file,
                    data_id=data_id,
                    format_id=format_id,
                    topic_id=topic_id,
                    status='Active',
                    is_public='1',
                    is_in_spell='0',
                    is_in_browser='0',
                    file_date=datetime.now(),
                    source_id=source_to_id['SGD'],
                    md5sum=go_md5sum)
def update_database_load_file_to_s3(nex_session, data_file, gzip_file, source_to_id, edam_to_id):
    """Upload the gzipped RNAcentral JSON file to S3 and register it in the
    database, linking it to the /reports/chromosomal-features path.

    Skips the upload when a Filedbentity row with the same content md5sum
    already exists; otherwise archives previously Active
    RNAcentral.*.json.gz copies first.

    Parameters
    ----------
    nex_session : DB session used for Filedbentity/Dbentity/Path queries
    data_file : str  -- original JSON path (used only in the final log line)
    gzip_file : str  -- path of the gzipped JSON to load
    source_to_id : dict mapping source display_name -> source_id
    edam_to_id : dict mapping 'EDAM:NNNN' -> edam_id
    """
    local_file = open(gzip_file, mode='rb')
    import hashlib
    # BUG FIX: hash the file *contents*, not the path string
    # (was hashlib.md5(gzip_file.encode()) -- a name-based digest never
    # changes, so new releases would be skipped as already loaded).
    # Local renamed from gff_md5sum: this loader handles JSON, not GFF.
    file_md5sum = hashlib.md5(local_file.read()).hexdigest()
    # Rewind so upload_file() below sees the whole file.
    local_file.seek(0)
    row = nex_session.query(Filedbentity).filter_by(
        md5sum=file_md5sum).one_or_none()
    if row is not None:
        # Identical content already loaded; nothing to do.
        return
    gzip_file = gzip_file.replace("scripts/dumping/ncbi/data/", "")
    nex_session.query(Dbentity).filter(
        Dbentity.display_name.like('RNAcentral.%.json.gz')).filter(
            Dbentity.dbentity_status == 'Active').update(
                {"dbentity_status": 'Archived'}, synchronize_session='fetch')
    nex_session.commit()

    data_id = edam_to_id.get('EDAM:3495')    # data:3495 RNA sequence
    topic_id = edam_to_id.get('EDAM:0099')   # topic:0099 RNA
    format_id = edam_to_id.get('EDAM:3464')  # format:3464 JSON format

    from sqlalchemy import create_engine
    from src.models import DBSession
    engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
    DBSession.configure(bind=engine)

    upload_file(CREATED_BY, local_file,
                filename=gzip_file,
                file_extension='gz',
                description='JSON file for yeast RNA genes',
                display_name=gzip_file,
                data_id=data_id,
                format_id=format_id,
                topic_id=topic_id,
                status='Active',
                readme_file_id=None,
                is_public='1',
                is_in_spell='0',
                is_in_browser='0',
                file_date=datetime.now(),
                source_id=source_to_id['SGD'],
                md5sum=file_md5sum)

    rnaFile = nex_session.query(Dbentity).filter_by(
        display_name=gzip_file, dbentity_status='Active').one_or_none()
    if rnaFile is None:
        log.info("The " + gzip_file + " is not in the database.")
        return
    file_id = rnaFile.dbentity_id
    path = nex_session.query(Path).filter_by(
        path="/reports/chromosomal-features").one_or_none()
    if path is None:
        log.info(
            "The path: /reports/chromosomal-features is not in the database.")
        return
    path_id = path.path_id
    x = FilePath(file_id=file_id,
                 path_id=path_id,
                 source_id=source_to_id['SGD'],
                 created_by=CREATED_BY)
    nex_session.add(x)
    nex_session.commit()
    log.info("Done uploading " + data_file)