예제 #1
0
def add_path_entries(file_name, file_path, src_id, uname):
    """ add paths to file_path table """

    try:
        existing = DBSession.query(Filedbentity).filter(
            Filedbentity.display_name == file_name).one_or_none()
        if not existing:
            logging.error('error with ' + file_name)
        path = DBSession.query(Path).filter_by(path=file_path).one_or_none()

        if path is None:
            logging.warning('Could not find path ')
        else:
            if existing:
                existing_filepath = DBSession.query(FilePath).filter(
                    and_(FilePath.file_id == existing.dbentity_id,
                         FilePath.path_id == path.path_id)).one_or_none()

                if not existing_filepath:
                    new_filepath = FilePath(file_id=existing.dbentity_id,
                                            path_id=path.path_id,
                                            source_id=src_id,
                                            created_by=uname)
                    DBSession.add(new_filepath)

    except Exception as e:
        logging.error("Exception occurred", exc_info=True)
def create_and_upload_file(obj, row_num):
    try:
        # find on local system
        if DATA_DIR:
            remote_file_path = DATA_DIR + '/' + obj['bun_path']
        else:
            remote_file_path = obj['bun_path']
        remote_file_path = remote_file_path + obj['display_name']
        remote_file = open(remote_file_path)
    except IOError:
        logging.error('error opening file ' + str(row_num))
        traceback.print_exc()
        return
    try:
        temp_engine = create_engine(NEX2_URI)
        session_factory = sessionmaker(bind=temp_engine,
                                       extension=ZopeTransactionExtension(),
                                       expire_on_commit=False)
        db_session = scoped_session(session_factory)
        # get README location
        readme_file_id = None
        if len(obj['readme_name']):
            readme = db_session.query(Filedbentity).filter(
                Filedbentity.display_name == obj['readme_name']).one_or_none()
            if readme is None:
                logging.warning('unable to find README ' + obj['readme_name'])
            else:
                readme_file_id = readme.dbentity_id

        # see if already exists, if not create
        existing = db_session.query(Filedbentity).filter(
            Filedbentity.display_name == obj['display_name']).one_or_none()

        source_id = db_session.query(Source.source_id).filter(
            Source.display_name == obj['source']).one_or_none()[0]

        if not existing:
            try:
                data_id = db_session.query(Edam.edam_id).filter(
                    Edam.edamid == obj['data_edam_id']).one_or_none()[0]
                format_id = db_session.query(Edam.edam_id).filter(
                    Edam.edamid == obj['format_edam_id']).one_or_none()[0]
                topic_id = db_session.query(Edam.edam_id).filter(
                    Edam.edamid == obj['topic_edam_id']).one_or_none()[0]
            except TypeError:
                logging.error('invalid EDAM id or source in row ' +
                              str(row_num) + ' val in ' + obj['data_edam_id'] +
                              ', ' + obj['format_edam_id'] + ', ' +
                              obj['topic_edam_id'])
                return

            print("remote_file=", remote_file)

            upload_file(CREATED_BY,
                        remote_file,
                        filename=obj['display_name'],
                        file_extension=obj['file_extension'],
                        description=obj['description'].replace('"', ''),
                        display_name=obj['display_name'],
                        data_id=data_id,
                        format_id=format_id,
                        status=obj['status'],
                        topic_id=topic_id,
                        is_public=obj['is_public'],
                        is_in_spell=obj['is_in_spell'],
                        is_in_browser=obj['is_in_browser'],
                        file_date=obj['file_date'],
                        readme_file_id=readme_file_id,
                        source_id=source_id)
            db_session.flush()
        else:
            existing.display_name = obj['display_name']
            existing.description = obj['description']
            existing.status = obj['status']
            existing.is_public = obj['is_public']
            existing.is_in_spell = obj['is_in_spell']
            existing.is_in_browser = obj['is_in_browser']
            existing.source_id = source_id
            # update file size
            if not existing.file_size and existing.s3_url:
                remote_file.seek(0, os.SEEK_END)
                file_size = remote_file.tell()
                remote_file.seek(0)
                existing.file_size = file_size
            if obj['file_date']:
                existing.file_date = obj['file_date']
                existing.year = obj['file_date'].year
            existing.readme_file_id = readme_file_id
            remote_file.seek(0, os.SEEK_END)
            transaction.commit()
            existing = db_session.query(Filedbentity).filter(
                Filedbentity.display_name ==
                obj['display_name']).one_or_none()
            # only upload s3 file if not defined
            if existing.s3_url is None:
                existing.upload_file_to_s3(remote_file, obj['display_name'])
            db_session.flush()
        # add path entries
        existing = db_session.query(Filedbentity).filter(
            Filedbentity.display_name == obj['display_name']).one_or_none()
        if not existing:
            logging.error('error with ' + obj['display_name'] + ' in row ' +
                          str(row_num))
            return
        path = db_session.query(Path).filter_by(
            path=obj['new_path']).one_or_none()
        if path is None:
            logging.warning('Could not find path ' + obj['new_path'] +
                            ' in row ' + str(row_num))
            return
        existing_filepath = db_session.query(FilePath).filter(
            and_(FilePath.file_id == existing.dbentity_id,
                 FilePath.path_id == path.path_id)).one_or_none()
        if not existing_filepath:
            new_filepath = FilePath(file_id=existing.dbentity_id,
                                    path_id=path.path_id,
                                    source_id=SGD_SOURCE_ID,
                                    created_by=CREATED_BY)
            db_session.add(new_filepath)
            transaction.commit()
            db_session.flush()
        # maybe add PMIDs
        if len(obj['pmids']):
            existing = db_session.query(Filedbentity).filter(
                Filedbentity.display_name ==
                obj['display_name']).one_or_none()
            pmids = obj['pmids'].split('|')
            for x in pmids:
                x = int(x.strip())
                existing_ref_file = db_session.query(ReferenceFile).filter(
                    ReferenceFile.file_id ==
                    existing.dbentity_id).one_or_none()
                ref = db_session.query(Referencedbentity).filter(
                    Referencedbentity.pmid == x).one_or_none()
                if ref and not existing_ref_file:
                    new_ref_file = ReferenceFile(created_by=CREATED_BY,
                                                 file_id=existing.dbentity_id,
                                                 reference_id=ref.dbentity_id,
                                                 source_id=SGD_SOURCE_ID)
                    db_session.add(new_ref_file)
                transaction.commit()
                db_session.flush()
        # maybe add keywords
        if len(obj['keywords']):
            existing = db_session.query(Filedbentity).filter(
                Filedbentity.display_name ==
                obj['display_name']).one_or_none()
            keywords = obj['keywords'].split('|')
            for x in keywords:
                x = x.strip()
                keyword = db_session.query(Keyword).filter(
                    Keyword.display_name == x).one_or_none()
                existing_file_keyword = db_session.query(FileKeyword).filter(
                    and_(FileKeyword.file_id == existing.dbentity_id,
                         FileKeyword.keyword_id ==
                         keyword.keyword_id)).one_or_none()
                if not existing_file_keyword:
                    new_file_keyword = FileKeyword(
                        created_by=CREATED_BY,
                        file_id=existing.dbentity_id,
                        keyword_id=keyword.keyword_id,
                        source_id=SGD_SOURCE_ID)
                    db_session.add(new_file_keyword)
                transaction.commit()
                db_session.flush()
        remote_file.close()
        logging.info('finished ' + obj['display_name'] + ', line ' +
                     str(row_num))
    except:
        logging.error('error with ' + obj['display_name'] + ' in row ' +
                      str(row_num))
        traceback.print_exc()
        db_session.rollback()
        db_session.close()
예제 #3
0
def update_database_load_file_to_s3(nex_session, gzip_file, source_to_id,
                                    edam_to_id):

    local_file = open(gzip_file, mode='rb')

    import hashlib
    file_md5sum = hashlib.md5(local_file.read()).hexdigest()
    row = nex_session.query(Filedbentity).filter_by(
        md5sum=file_md5sum).one_or_none()

    if row is not None:
        return

    if "tbl" in gzip_file:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('ncbi_tbl_files.%.tar.gz')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'},
                    synchronize_session='fetch')
    elif "sqn" in gzip_file:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('ncbi_sqn_files.%.tar.gz')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'},
                    synchronize_session='fetch')
    else:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('ncbi_gbf_files.%.tar.gz')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'},
                    synchronize_session='fetch')

    nex_session.commit()

    data_id = edam_to_id.get('EDAM:3671')  ## data:3671 Text
    topic_id = edam_to_id.get('EDAM:0085')  ## topic:0085 Functional genomics
    format_id = edam_to_id.get('EDAM:3507')  ## format:3507 Document format

    if "tbl" in gzip_file:
        from sqlalchemy import create_engine
        from src.models import DBSession
        engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
        DBSession.configure(bind=engine)

    # readme = nex_session.query(Dbentity).filter_by(display_name="ncbi_tab_files.README", dbentity_status='Active').one_or_none()
    # if readme is None:
    #    log.info("ncbi_tbl_files.README is not in the database.")
    #    return
    # readme_file_id = readme.dbentity_id

    readme_file_id = None

    # path.path = /reports/function

    upload_file(CREATED_BY,
                local_file,
                filename=gzip_file,
                file_extension='gz',
                description='All yeast features in tbl file format',
                display_name=gzip_file,
                data_id=data_id,
                format_id=format_id,
                topic_id=topic_id,
                status='Active',
                readme_file_id=readme_file_id,
                is_public='1',
                is_in_spell='0',
                is_in_browser='0',
                file_date=datetime.now(),
                source_id=source_to_id['SGD'],
                md5sum=file_md5sum)

    file = nex_session.query(Dbentity).filter_by(
        display_name=gzip_file, dbentity_status='Active').one_or_none()
    if file is None:
        log.info("The " + gzip_file + " is not in the database.")
        return
    file_id = file.dbentity_id

    path = nex_session.query(Path).filter_by(
        path="/reports/function").one_or_none()
    if path is None:
        log.info("The path /reports/function is not in the database.")
        return
    path_id = path.path_id

    x = FilePath(file_id=file_id,
                 path_id=path_id,
                 source_id=source_to_id['SGD'],
                 created_by=CREATED_BY)

    nex_session.add(x)
    nex_session.commit()
예제 #4
0
def update_database_load_file_to_s3(nex_session, gaf_file, is_public,
                                    source_to_id, edam_to_id, datestamp):

    # gene_association.sgd.20171204.gz
    # gene_association.sgd-yeastmine.20171204.gz

    # datestamp = str(datetime.now()).split(" ")[0].replace("-", "")
    gzip_file = gaf_file + "." + datestamp + ".gz"
    import gzip
    import shutil
    with open(gaf_file, 'rb') as f_in, gzip.open(gzip_file, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

    local_file = open(gzip_file)

    import hashlib
    gaf_md5sum = hashlib.md5(local_file.read()).hexdigest()
    row = nex_session.query(Filedbentity).filter_by(
        md5sum=gaf_md5sum).one_or_none()

    if row is not None:
        return

    gzip_file = gzip_file.replace("scripts/dumping/curation/data/", "")

    # nex_session.query(Dbentity).filter_by(display_name=gzip_file, dbentity_status='Active').update({"dbentity_status": 'Archived'})

    if is_public == 1:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('gene_association.sgd%')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'},
                    synchronize_session='fetch')
        nex_session.commit()

    data_id = edam_to_id.get('EDAM:2048')  ## data:2048 Report
    topic_id = edam_to_id.get('EDAM:0085')  ## topic:0085 Functional genomics
    format_id = edam_to_id.get('EDAM:3475')  ## format:3475 TSV

    if "yeastmine" not in gaf_file:
        from sqlalchemy import create_engine
        from src.models import DBSession
        engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
        DBSession.configure(bind=engine)

    readme = nex_session.query(Dbentity).filter_by(
        display_name="gene_association.README",
        dbentity_status='Active').one_or_none()
    if readme is None:
        log.info("gene_association.README is not in the database.")
        return
    readme_file_id = readme.dbentity_id

    # path.path = /reports/function

    upload_file(
        CREATED_BY,
        local_file,
        filename=gzip_file,
        file_extension='gz',
        description=
        'All GO annotations for yeast genes (protein and RNA) in GAF file format',
        display_name=gzip_file,
        data_id=data_id,
        format_id=format_id,
        topic_id=topic_id,
        status='Active',
        readme_file_id=readme_file_id,
        is_public=is_public,
        is_in_spell='0',
        is_in_browser='0',
        file_date=datetime.now(),
        source_id=source_to_id['SGD'])

    gaf = nex_session.query(Dbentity).filter_by(
        display_name=gzip_file, dbentity_status='Active').one_or_none()
    if gaf is None:
        log.info("The " + gzip_file + " is not in the database.")
        return
    file_id = gaf.dbentity_id

    path = nex_session.query(Path).filter_by(
        path="/reports/function").one_or_none()
    if path is None:
        log.info("The path /reports/function is not in the database.")
        return
    path_id = path.path_id

    x = FilePath(file_id=file_id,
                 path_id=path_id,
                 source_id=source_to_id['SGD'],
                 created_by=CREATED_BY)

    nex_session.add(x)
    nex_session.commit()
예제 #5
0
def update_database_load_file_to_s3(nex_session, gff_file, gzip_file, source_to_id, edam_to_id):

    local_file = open(gzip_file, mode='rb')
    
    ### upload a current GFF file to S3 with a static URL for Go Community ###
    upload_gff_to_s3(local_file, "latest/saccharomyces_cerevisiae.gff.gz")
    ##########################################################################

    import hashlib
    gff_md5sum = hashlib.md5(gzip_file.encode()).hexdigest()
    row = nex_session.query(Filedbentity).filter_by(md5sum = gff_md5sum).one_or_none()

    if row is not None:
        return

    gzip_file = gzip_file.replace("scripts/dumping/curation/data/", "")

    nex_session.query(Dbentity).filter(Dbentity.display_name.like('saccharomyces_cerevisiae.%.gff.gz')).filter(Dbentity.dbentity_status=='Active').update({"dbentity_status":'Archived'}, synchronize_session='fetch')
    nex_session.commit()

    data_id = edam_to_id.get('EDAM:3671')   ## data:3671    Text
    topic_id = edam_to_id.get('EDAM:3068')  ## topic:3068   Literature and language
    format_id = edam_to_id.get('EDAM:3507') ## format:3507  Document format

    from sqlalchemy import create_engine
    from src.models import DBSession
    engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
    DBSession.configure(bind=engine)
    
    readme = nex_session.query(Dbentity).filter_by(display_name="saccharomyces_cerevisiae_gff.README", dbentity_status='Active').one_or_none()
    if readme is None:
        log.info("saccharomyces_cerevisiae_gff.README is not in the database.")
        return
    readme_file_id = readme.dbentity_id

    # path.path = /reports/chromosomal-features

    upload_file(CREATED_BY, local_file,
                filename=gzip_file,
                file_extension='gz',
                description='GFF file for yeast genes (protein and RNA)',
                display_name=gzip_file,
                data_id=data_id,
                format_id=format_id,
                topic_id=topic_id,
                status='Active',
                readme_file_id=readme_file_id,
                is_public='1',
                is_in_spell='0',
                is_in_browser='0',
                file_date=datetime.now(),
                source_id=source_to_id['SGD'],
                md5sum=gff_md5sum)

    gff = nex_session.query(Dbentity).filter_by(display_name=gzip_file, dbentity_status='Active').one_or_none()

    if gff is None:
        log.info("The " + gzip_file + " is not in the database.")
        return
    file_id = gff.dbentity_id

    path = nex_session.query(Path).filter_by(path="/reports/chromosomal-features").one_or_none()
    if path is None:
        log.info("The path: /reports/chromosomal-features is not in the database.")
        return
    path_id = path.path_id

    x = FilePath(file_id = file_id,
                 path_id = path_id,
                 source_id = source_to_id['SGD'],
                 created_by = CREATED_BY)

    nex_session.add(x)
    nex_session.commit()

    log.info("Done uploading " + gff_file)
예제 #6
0
def update_database_load_file_to_s3(nex_session, data_file, gzip_file,
                                    source_to_id, edam_to_id):

    local_file = open(gzip_file, mode='rb')

    import hashlib
    gff_md5sum = hashlib.md5(gzip_file.encode()).hexdigest()
    row = nex_session.query(Filedbentity).filter_by(
        md5sum=gff_md5sum).one_or_none()

    if row is not None:
        return

    gzip_file = gzip_file.replace("scripts/dumping/ncbi/data/", "")

    nex_session.query(Dbentity).filter(
        Dbentity.display_name.like('RNAcentral.%.json.gz')).filter(
            Dbentity.dbentity_status == 'Active').update(
                {"dbentity_status": 'Archived'}, synchronize_session='fetch')
    nex_session.commit()

    data_id = edam_to_id.get('EDAM:3495')  # data:3495    RNA sequence
    topic_id = edam_to_id.get('EDAM:0099')  # topic:0099   RNA
    format_id = edam_to_id.get('EDAM:3464')  # format:3464  JSON format

    from sqlalchemy import create_engine
    from src.models import DBSession
    engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
    DBSession.configure(bind=engine)

    upload_file(CREATED_BY,
                local_file,
                filename=gzip_file,
                file_extension='gz',
                description='JSON file for yeast RNA genes',
                display_name=gzip_file,
                data_id=data_id,
                format_id=format_id,
                topic_id=topic_id,
                status='Active',
                readme_file_id=None,
                is_public='1',
                is_in_spell='0',
                is_in_browser='0',
                file_date=datetime.now(),
                source_id=source_to_id['SGD'],
                md5sum=gff_md5sum)

    rnaFile = nex_session.query(Dbentity).filter_by(
        display_name=gzip_file, dbentity_status='Active').one_or_none()

    if rnaFile is None:
        log.info("The " + gzip_file + " is not in the database.")
        return

    file_id = rnaFile.dbentity_id

    path = nex_session.query(Path).filter_by(
        path="/reports/chromosomal-features").one_or_none()
    if path is None:
        log.info(
            "The path: /reports/chromosomal-features is not in the database.")
        return
    path_id = path.path_id

    x = FilePath(file_id=file_id,
                 path_id=path_id,
                 source_id=source_to_id['SGD'],
                 created_by=CREATED_BY)

    nex_session.add(x)
    nex_session.commit()

    log.info("Done uploading " + data_file)