def convert_qualifier_evidence(old_session_maker, new_session_maker): from model_new_schema.bioentity import Qualifierevidence as NewQualifierevidence, Bioentity as NewBioentity from model_old_schema.feature import Feature as OldFeature log = logging.getLogger('convert.bioentity_in_depth.qualifier_evidence') log.info('begin') output_creator = OutputCreator(log) try: #Grab all current objects new_session = new_session_maker() current_objs = new_session.query(NewQualifierevidence).all() id_to_current_obj = dict([(x.id, x) for x in current_objs]) key_to_current_obj = dict([(x.unique_key(), x) for x in current_objs]) #Values to check values_to_check = ['reference_id', 'experiment_id', 'strain', 'source', 'date_created', 'created_by', 'bioentity_id', 'qualifier'] untouched_obj_ids = set(id_to_current_obj.keys()) #Grab cached dictionaries id_to_bioentity = dict([(x.id, x) for x in new_session.query(NewBioentity).all()]) #Grab old objects old_session = old_session_maker() old_objs = old_session.query(OldFeature).options(joinedload('annotation')) for old_obj in old_objs: #Convert old objects into new ones newly_created_objs = create_qualifier_evidence(old_obj, id_to_bioentity) if newly_created_objs is not None: #Edit or add new objects for newly_created_obj in newly_created_objs: current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id] current_obj_by_key = None if newly_created_obj.unique_key() not in key_to_current_obj else key_to_current_obj[newly_created_obj.unique_key()] create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator) if current_obj_by_id is not None and current_obj_by_id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_id.id) if current_obj_by_key is not None and current_obj_by_key in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_key.id) #Delete untouched objs for untouched_obj_id in untouched_obj_ids: new_session.delete(id_to_current_obj[untouched_obj_id]) output_creator.removed() #Commit output_creator.finished() new_session.commit() except Exception: log.exception('Unexpected error:' + str(sys.exc_info()[0])) finally: new_session.close() old_session.close() log.info('complete')
def convert_reftype(old_session_maker, new_session_maker): from model_new_schema.reference import Reference as NewReference, Reftype as NewReftype from model_old_schema.reference import RefReftype as OldRefReftype log = logging.getLogger('convert.reference_in_depth.reftype') log.info('begin') output_creator = OutputCreator(log) try: #Grab all current objects new_session = new_session_maker() current_objs = new_session.query(NewReftype).all() id_to_current_obj = dict([(x.id, x) for x in current_objs]) key_to_current_obj = dict([(x.unique_key(), x) for x in current_objs]) #Grab old objects old_session = old_session_maker() old_objs = old_session.query(OldRefReftype).options(joinedload('reftype')).all() #Values to check values_to_check = ['source'] untouched_obj_ids = set(id_to_current_obj.keys()) #Grab cached dictionaries reference_ids = set([x.id for x in new_session.query(NewReference).all()]) for old_obj in old_objs: #Convert old objects into new ones newly_created_objs = create_reftype(old_obj, reference_ids) if newly_created_objs is not None: #Edit or add new objects for newly_created_obj in newly_created_objs: current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id] current_obj_by_key = None if newly_created_obj.unique_key() not in key_to_current_obj else key_to_current_obj[newly_created_obj.unique_key()] create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator) if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_id.id) if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_key.id) #Delete untouched objs for untouched_obj_id in untouched_obj_ids: new_session.delete(id_to_current_obj[untouched_obj_id]) output_creator.removed() #Commit output_creator.finished() new_session.commit() except Exception: log.exception('Unexpected error:' + str(sys.exc_info()[0])) finally: new_session.close() old_session.close() log.info('complete')
def convert_locus(old_session_maker, new_session_maker): from model_new_schema.bioentity import Locus as NewLocus from model_old_schema.feature import Feature as OldFeature log = logging.getLogger('convert.bioentity.locus') log.info('begin') output_creator = OutputCreator(log) try: #Grab all current objects new_session = new_session_maker() current_objs = new_session.query(NewLocus).all() id_to_current_obj = dict([(x.id, x) for x in current_objs]) key_to_current_obj = dict([(x.unique_key(), x) for x in current_objs]) #Values to check values_to_check = ['display_name', 'link', 'source', 'status', 'date_created', 'created_by', 'attribute', 'name_description', 'headline', 'description', 'dbxref', 'genetic_position', 'locus_type'] untouched_obj_ids = set(id_to_current_obj.keys()) #Grab old objects old_session = old_session_maker() old_objs = old_session.query(OldFeature).options(joinedload('annotation')).all() for old_obj in old_objs: #Convert old objects into new ones newly_created_objs = create_locus(old_obj) if newly_created_objs is not None: #Edit or add new objects for newly_created_obj in newly_created_objs: current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id] current_obj_by_key = None if newly_created_obj.unique_key() not in key_to_current_obj else key_to_current_obj[newly_created_obj.unique_key()] create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator) if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_id.id) if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_key.id) #Delete untouched objs for untouched_obj_id in untouched_obj_ids: new_session.delete(id_to_current_obj[untouched_obj_id]) output_creator.removed() #Commit output_creator.finished() new_session.commit() except Exception: log.exception('Unexpected error:' + str(sys.exc_info()[0])) finally: new_session.close() old_session.close() log.info('complete')
def convert_paragraph(new_session_maker): from model_new_schema.bioentity import Bioentity, Paragraph log = logging.getLogger('convert.regulation.paragraph') log.info('begin') output_creator = OutputCreator(log) try: new_session = new_session_maker() #Values to check values_to_check = ['text', 'date_created', 'created_by'] #Grab cached dictionaries key_to_bioentity = dict([(x.unique_key(), x) for x in new_session.query(Bioentity).all()]) #Grab all current objects current_objs = new_session.query(Paragraph).filter(Paragraph.class_type == 'REGULATION').all() id_to_current_obj = dict([(x.id, x) for x in current_objs]) key_to_current_obj = dict([(x.unique_key(), x) for x in current_objs]) untouched_obj_ids = set(id_to_current_obj.keys()) old_objs = break_up_file('/Users/kpaskov/final/Reg_Summary_Paragraphs04282013.txt') for old_obj in old_objs: #Convert old objects into new ones newly_created_objs = create_paragraph(old_obj, key_to_bioentity) if newly_created_objs is not None: #Edit or add new objects for newly_created_obj in newly_created_objs: unique_key = newly_created_obj.unique_key() current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id] current_obj_by_key = None if unique_key not in key_to_current_obj else key_to_current_obj[unique_key] create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator) if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_id.id) if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_key.id) #Delete untouched objs for untouched_obj_id in untouched_obj_ids: new_session.delete(id_to_current_obj[untouched_obj_id]) output_creator.removed() #Commit output_creator.finished() new_session.commit() except Exception: log.exception('Unexpected error:' + str(sys.exc_info()[0])) finally: new_session.close() log.info('complete')
def convert_strain(old_session_maker, new_session_maker): from model_new_schema.evelement import Strain as NewStrain from model_old_schema.cv import CVTerm as OldCVTerm log = logging.getLogger('convert.evelements.strain') log.info('begin') output_creator = OutputCreator(log) try: #Grab all current objects new_session = new_session_maker() current_objs = new_session.query(NewStrain).all() id_to_current_obj = dict([(x.id, x) for x in current_objs]) key_to_current_obj = dict([(x.unique_key(), x) for x in current_objs]) #Values to check values_to_check = ['display_name', 'link', 'description', 'date_created', 'created_by'] untouched_obj_ids = set(id_to_current_obj.keys()) #Grab old objects old_session = old_session_maker() old_objs = old_session.query(OldCVTerm).filter(OldCVTerm.cv_no==10).all() for old_obj in old_objs: #Convert old objects into new ones newly_created_objs = create_strain(old_obj) if newly_created_objs is not None: #Edit or add new objects for newly_created_obj in newly_created_objs: current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id] current_obj_by_key = None if newly_created_obj.unique_key() not in key_to_current_obj else key_to_current_obj[newly_created_obj.unique_key()] create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator) if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_id.id) if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_key.id) #Delete untouched objs for untouched_obj_id in untouched_obj_ids: new_session.delete(id_to_current_obj[untouched_obj_id]) output_creator.removed() #Commit output_creator.finished() new_session.commit() except Exception: log.exception('Unexpected error:' + str(sys.exc_info()[0])) finally: new_session.close() old_session.close() log.info('complete')
def convert_book(old_session_maker, new_session_maker): from model_new_schema.reference import Book as NewBook from model_old_schema.reference import Book as OldBook log = logging.getLogger('convert.reference.book') log.info('begin') output_creator = OutputCreator(log) try: #Grab all current objects new_session = new_session_maker() current_objs = new_session.query(NewBook).all() id_to_current_obj = dict([(x.id, x) for x in current_objs]) key_to_current_obj = dict([(x.unique_key(), x) for x in current_objs]) #Values to check values_to_check = ['isbn', 'total_pages', 'publisher', 'publisher_location', 'created_by', 'date_created'] untouched_obj_ids = set(id_to_current_obj.keys()) #Grab old objects old_session = old_session_maker() old_objs = old_session.query(OldBook).all() for old_obj in old_objs: #Convert old objects into new ones newly_created_objs = create_book(old_obj) if newly_created_objs is not None: #Edit or add new objects for newly_created_obj in newly_created_objs: current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id] current_obj_by_key = None if newly_created_obj.unique_key() not in key_to_current_obj else key_to_current_obj[newly_created_obj.unique_key()] create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator) if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_id.id) if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_key.id) #Delete untouched objs for untouched_obj_id in untouched_obj_ids: new_session.delete(id_to_current_obj[untouched_obj_id]) output_creator.removed() #Commit output_creator.finished() new_session.commit() except Exception: log.exception('Unexpected error:' + str(sys.exc_info()[0])) finally: new_session.close() old_session.close() log.info('complete')
def convert_bioentitytabs(new_session_maker): from model_new_schema.bioentity import Locus, Bioentitytabs log = logging.getLogger('convert.bioentity_in_depth.bioentitytabs') log.info('begin') output_creator = OutputCreator(log) try: #Grab all current objects new_session = new_session_maker() current_objs = new_session.query(Bioentitytabs).all() id_to_current_obj = dict([(x.id, x) for x in current_objs]) key_to_current_obj = dict([(x.unique_key(), x) for x in current_objs]) #Values to check values_to_check = ['summary', 'history', 'literature', 'go', 'phenotype', 'interactions', 'expression', 'regulation', 'protein', 'wiki'] untouched_obj_ids = set(id_to_current_obj.keys()) #Grab old objects new_session = new_session_maker() old_objs = new_session.query(Locus).all() for old_obj in old_objs: #Convert old objects into new ones newly_created_objs = create_bioentitytabs(old_obj) if newly_created_objs is not None: #Edit or add new objects for newly_created_obj in newly_created_objs: current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id] current_obj_by_key = None if newly_created_obj.unique_key() not in key_to_current_obj else key_to_current_obj[newly_created_obj.unique_key()] create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator) if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_id.id) if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_key.id) #Delete untouched objs for untouched_obj_id in untouched_obj_ids: new_session.delete(id_to_current_obj[untouched_obj_id]) output_creator.removed() #Commit output_creator.finished() new_session.commit() except Exception: log.exception('Unexpected error:' + str(sys.exc_info()[0])) finally: new_session.close() log.info('complete')
def convert_author_reference(old_session_maker, new_session_maker, chunk_size): from model_new_schema.reference import Author as NewAuthor, Reference as NewReference, AuthorReference as NewAuthorReference from model_old_schema.reference import AuthorReference as OldAuthorReference, Author as OldAuthor log = logging.getLogger('convert.reference_in_depth.author_reference') log.info('begin') output_creator = OutputCreator(log) try: new_session = new_session_maker() old_session = old_session_maker() #Values to check values_to_check = ['author_type'] #Grab cached dictionaries reference_ids = set([x.id for x in new_session.query(NewReference).all()]) #Simplify author conversion old_id_to_key = dict([(x.id, create_format_name(x.name)) for x in old_session.query(OldAuthor).all()]) new_key_to_id = dict([(x.unique_key(), x.id) for x in new_session.query(NewAuthor).all()]) old_id_to_new_id_author = dict([(x, new_key_to_id[y]) for x, y in old_id_to_key.iteritems()]) used_unique_keys = set() count = old_session.query(func.max(OldAuthorReference.id)).first()[0] num_chunks = ceil(1.0*count/chunk_size) min_id = 0 for i in range(0, num_chunks): #Grab all current objects current_objs = new_session.query(NewAuthorReference).filter(NewAuthorReference.id >= min_id).filter(NewAuthorReference.id < min_id+chunk_size).all() id_to_current_obj = dict([(x.id, x) for x in current_objs]) key_to_current_obj = dict([(x.unique_key(), x) for x in current_objs]) untouched_obj_ids = set(id_to_current_obj.keys()) #Grab old objects old_objs = old_session.query(OldAuthorReference).filter( OldAuthorReference.id >= min_id).filter( OldAuthorReference.id < min_id+chunk_size).all() for old_obj in old_objs: #Convert old objects into new ones newly_created_objs = create_author_reference(old_obj, old_id_to_new_id_author, reference_ids) if newly_created_objs is not None: #Edit or add new objects for newly_created_obj in newly_created_objs: unique_key = newly_created_obj.unique_key() if unique_key not in used_unique_keys: current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id] current_obj_by_key = None if unique_key not in key_to_current_obj else key_to_current_obj[unique_key] create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator) if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_id.id) if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_key.id) used_unique_keys.add(unique_key) #Delete untouched objs for untouched_obj_id in untouched_obj_ids: new_session.delete(id_to_current_obj[untouched_obj_id]) output_creator.removed() output_creator.finished(str(i+1) + "/" + str(int(num_chunks))) new_session.commit() min_id = min_id + chunk_size except Exception: log.exception('Unexpected error:' + str(sys.exc_info()[0])) finally: new_session.close() old_session.close() log.info('complete')
def convert_evidence(new_session_maker, chunk_size): from model_new_schema.regulation import Regulationevidence from model_new_schema.evelement import Experiment from model_new_schema.bioentity import Bioentity from model_new_schema.reference import Reference log = logging.getLogger('convert.regulation.evidence') log.info('begin') output_creator = OutputCreator(log) try: new_session = new_session_maker() #Values to check values_to_check = ['experiment_id', 'reference_id', 'strain_id', 'source', 'conditions', 'bioentity1_id', 'bioentity2_id', 'date_created', 'created_by'] #Grab cached dictionaries key_to_experiment = dict([(x.unique_key(), x) for x in new_session.query(Experiment).all()]) key_to_bioent = dict([(x.unique_key(), x) for x in new_session.query(Bioentity).all()]) pubmed_to_reference_id = dict([(x.pubmed_id, x.id) for x in new_session.query(Reference).all()]) #Grab old objects data = break_up_file('/Users/kpaskov/final/yeastmine_regulation.tsv') count = len(data) num_chunks = ceil(1.0*count/chunk_size) min_id = 0 j = 0 for i in range(0, num_chunks): #Grab all current objects current_objs = new_session.query(Regulationevidence).filter(Regulationevidence.id >= create_evidence_id(min_id)).filter(Regulationevidence.id < create_evidence_id(min_id+chunk_size)).all() id_to_current_obj = dict([(x.id, x) for x in current_objs]) key_to_current_obj = dict([(x.unique_key(), x) for x in current_objs]) untouched_obj_ids = set(id_to_current_obj.keys()) old_objs = data[min_id:min_id+chunk_size] for old_obj in old_objs: #Convert old objects into new ones newly_created_objs = create_evidence(old_obj, j, key_to_experiment, key_to_bioent, pubmed_to_reference_id) if newly_created_objs is not None: #Edit or add new objects for newly_created_obj in newly_created_objs: unique_key = newly_created_obj.unique_key() current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id] current_obj_by_key = None if unique_key not in key_to_current_obj else key_to_current_obj[unique_key] create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator) if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_id.id) if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_key.id) j = j + 1 #Delete untouched objs for untouched_obj_id in untouched_obj_ids: new_session.delete(id_to_current_obj[untouched_obj_id]) output_creator.removed() output_creator.finished(str(i+1) + "/" + str(int(num_chunks))) new_session.commit() min_id = min_id+chunk_size #Commit output_creator.finished() new_session.commit() except Exception: log.exception('Unexpected error:' + str(sys.exc_info()[0])) finally: new_session.close() log.info('complete')
def convert_experiment(old_session_maker, new_session_maker): from model_new_schema.evelement import Experiment as NewExperiment from model_old_schema.cv import CVTerm as OldCVTerm log = logging.getLogger('convert.evelements.experiment') log.info('begin') output_creator = OutputCreator(log) try: #Grab all current objects new_session = new_session_maker() current_objs = new_session.query(NewExperiment).all() id_to_current_obj = dict([(x.id, x) for x in current_objs]) key_to_current_obj = dict([(x.unique_key(), x) for x in current_objs]) #Values to check values_to_check = ['display_name', 'link', 'description', 'date_created', 'created_by', 'eco_id'] untouched_obj_ids = set(id_to_current_obj.keys()) #Grab old objects old_session = old_session_maker() old_objs = old_session.query(OldCVTerm).filter(OldCVTerm.cv_no==7).all() for old_obj in old_objs: #Convert old objects into new ones newly_created_objs = create_experiment(old_obj) if newly_created_objs is not None: #Edit or add new objects for newly_created_obj in newly_created_objs: current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id] current_obj_by_key = None if newly_created_obj.unique_key() not in key_to_current_obj else key_to_current_obj[newly_created_obj.unique_key()] create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator) if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_id.id) if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_key.id) #Get experiments from regulation files experiment_names = set() rows = break_up_file('/Users/kpaskov/final/yeastmine_regulation.tsv') experiment_names.update([(row[4], row[5]) for row in rows]) i=0 for experiment_name, eco_id in experiment_names: newly_created_objs = create_experiment_from_reg_row(experiment_name, eco_id, i) for newly_created_obj in newly_created_objs: current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id] current_obj_by_key = None if newly_created_obj.unique_key() not in key_to_current_obj else key_to_current_obj[newly_created_obj.unique_key()] create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator) if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_id.id) if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_key.id) i = i+1 experiment_names = set() #Add experiments from binding files rows = break_up_file('/Users/kpaskov/final/yetfasco_data.txt', delimeter=';') for row in rows: if len(row) < 10: print row experiment_names.update([row[9][1:-1] for row in rows]) i=0 for experiment_name in experiment_names: newly_created_objs = create_experiment_from_binding_row(experiment_name, i) for newly_created_obj in newly_created_objs: current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id] current_obj_by_key = None if newly_created_obj.unique_key() not in key_to_current_obj else key_to_current_obj[newly_created_obj.unique_key()] create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator) if current_obj_by_id is not None and current_obj_by_id.id in 
untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_id.id) if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_key.id) i = i+1 #Delete untouched objs for untouched_obj_id in untouched_obj_ids: new_session.delete(id_to_current_obj[untouched_obj_id]) print 'Removed at end' output_creator.removed() #Commit output_creator.finished() new_session.commit() except Exception: log.exception('Unexpected error:' + str(sys.exc_info()[0])) finally: new_session.close() old_session.close() log.info('complete')
def convert_protein(old_session_maker, new_session_maker): from model_new_schema.bioentity import Bioentity as NewBioentity from model_new_schema.protein import Protein as NewProtein from model_old_schema.sequence import ProteinInfo as OldProteinInfo log = logging.getLogger('convert.bioentity.protein') log.info('begin') output_creator = OutputCreator(log) try: #Grab all current objects new_session = new_session_maker() current_objs = new_session.query(NewProtein).all() id_to_current_obj = dict([(x.id, x) for x in current_objs]) key_to_current_obj = dict([(x.unique_key(), x) for x in current_objs]) #Values to check values_to_check = ['display_name', 'link', 'source', 'status', 'date_created', 'created_by', 'link', 'locus_id', 'length', 'n_term_seq', 'c_term_seq'] untouched_obj_ids = set(id_to_current_obj.keys()) #Grab old objects old_session = old_session_maker() old_objs = old_session.query(OldProteinInfo).all() #Grab cached dictionaries id_to_bioentity = dict([(x.id, x) for x in new_session.query(NewBioentity).all()]) for old_obj in old_objs: #Convert old objects into new ones newly_created_objs = create_protein(old_obj, id_to_bioentity) if newly_created_objs is not None: #Edit or add new objects for newly_created_obj in newly_created_objs: current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id] current_obj_by_key = None if newly_created_obj.unique_key() not in key_to_current_obj else key_to_current_obj[newly_created_obj.unique_key()] create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator) if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_id.id) if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_key.id) #Delete untouched objs for untouched_obj_id in untouched_obj_ids: new_session.delete(id_to_current_obj[untouched_obj_id]) output_creator.removed() #Commit output_creator.finished() new_session.commit() except Exception: log.exception('Unexpected error:' + str(sys.exc_info()[0])) finally: new_session.close() old_session.close() log.info('complete')
def convert_interaction(new_session_maker, evidence_class, class_type, label, chunk_size, directed): from model_new_schema.auxiliary import Interaction from model_new_schema.bioentity import Bioentity log = logging.getLogger(label) log.info('begin') output_creator = OutputCreator(log) try: new_session = new_session_maker() #Values to check values_to_check = ['display_name', 'bioentity1_id', 'bioentity2_id', 'evidence_count'] #Grab all current objects current_objs = new_session.query(Interaction).filter(Interaction.class_type == class_type).all() id_to_current_obj = dict([(x.id, x) for x in current_objs]) key_to_current_obj = dict([(x.unique_key(), x) for x in current_objs]) #Grab cached dictionaries id_to_bioent = dict([(x.id, x) for x in new_session.query(Bioentity).all()]) untouched_obj_ids = set(id_to_current_obj.keys()) used_unique_keys = set() #Precomp evidence count format_name_to_evidence_count = {} min_id = new_session.query(func.min(evidence_class.id)).first()[0] count = new_session.query(func.max(evidence_class.id)).first()[0] - min_id num_chunks = ceil(1.0*count/chunk_size) for i in range(0, num_chunks): more_old_objs = new_session.query(evidence_class).filter(evidence_class.id >= min_id).filter(evidence_class.id < min_id+chunk_size).all() interaction_precomp(format_name_to_evidence_count, more_old_objs, id_to_bioent, directed) min_id = min_id + chunk_size #Create interactions min_id = new_session.query(func.min(evidence_class.id)).first()[0] count = new_session.query(func.max(evidence_class.id)).first()[0] - min_id num_chunks = ceil(1.0*count/chunk_size) for i in range(0, num_chunks): old_objs = new_session.query(evidence_class).filter(evidence_class.id >= min_id).filter(evidence_class.id < min_id+chunk_size).all() for old_obj in old_objs: #Convert old objects into new ones if directed: format_name = create_directed_key(old_obj) else: format_name = create_undirected_interaction_format_name(old_obj, id_to_bioent) evidence_count = format_name_to_evidence_count[format_name] newly_created_objs = create_interaction(old_obj, evidence_count, id_to_bioent, directed) if newly_created_objs is not None: #Edit or add new objects for newly_created_obj in newly_created_objs: unique_key = newly_created_obj.unique_key() if unique_key not in used_unique_keys: current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id] current_obj_by_key = None if unique_key not in key_to_current_obj else key_to_current_obj[unique_key] create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator) used_unique_keys.add(unique_key) if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_id.id) if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_key.id) output_creator.finished(str(i+1) + "/" + str(int(num_chunks))) new_session.commit() min_id = min_id+chunk_size #Delete untouched objs for untouched_obj_id in untouched_obj_ids: new_session.delete(id_to_current_obj[untouched_obj_id]) output_creator.removed() #Commit output_creator.finished() new_session.commit() except Exception: log.exception('Unexpected error:' + str(sys.exc_info()[0])) finally: new_session.close() log.info('complete')
def convert_url(new_session_maker, chunk_size): from model_new_schema.reference import Reference, Referenceurl as NewReferenceurl log = logging.getLogger('convert.reference_in_depth.reference_url') log.info('begin') output_creator = OutputCreator(log) try: #Grab all current objects new_session = new_session_maker() #Values to check values_to_check = ['display_name', 'category', 'source', 'date_created', 'created_by', 'reference_id', 'url_type'] count = new_session.query(func.max(Reference.id)).first()[0] num_chunks = ceil(1.0*count/chunk_size) min_id = 0 for i in range(0, num_chunks): #Grab all current objects current_objs = new_session.query(NewReferenceurl).filter(NewReferenceurl.reference_id >= min_id).filter(NewReferenceurl.reference_id <= min_id+chunk_size).all() id_to_current_obj = dict([(x.id, x) for x in current_objs]) key_to_current_obj = dict([(x.unique_key(), x) for x in current_objs]) untouched_obj_ids = set(id_to_current_obj.keys()) #Grab old objects old_objs = new_session.query(Reference).filter( Reference.id >= min_id).filter( Reference.id <= min_id+chunk_size).all() for old_obj in old_objs: #Convert old objects into new ones newly_created_objs = create_url(old_obj) if newly_created_objs is not None: #Edit or add new objects for newly_created_obj in newly_created_objs: current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id] current_obj_by_key = None if newly_created_obj.unique_key() not in key_to_current_obj else key_to_current_obj[newly_created_obj.unique_key()] create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator) if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_id.id) if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_key.id) output_creator.finished(str(i+1) + "/" + str(int(num_chunks))) new_session.commit() min_id = min_id + chunk_size + 1 #Delete untouched objs for untouched_obj_id in untouched_obj_ids: new_session.delete(id_to_current_obj[untouched_obj_id]) output_creator.removed() #Commit output_creator.finished() new_session.commit() except Exception: log.exception('Unexpected error:' + str(sys.exc_info()[0])) finally: new_session.close() log.info('complete')
def convert_evidence(old_session_maker, new_session_maker, chunk_size): from model_new_schema.go import Goevidence as NewGoevidence from model_new_schema.reference import Reference as NewReference from model_new_schema.bioentity import Bioentity as NewBioentity from model_new_schema.go import Go as NewGo from model_old_schema.go import GoRef as OldGoRef log = logging.getLogger('convert.go.evidence') log.info('begin') output_creator = OutputCreator(log) try: new_session = new_session_maker() old_session = old_session_maker() #Values to check values_to_check = ['experiment_id', 'reference_id', 'strain_id', 'source', 'go_evidence', 'annotation_type', 'date_last_reviewed', 'qualifier', 'bioentity_id', 'bioconcept_id', 'date_created', 'created_by'] #Grab cached dictionaries bioent_ids = set([x.id for x in new_session.query(NewBioentity).all()]) reference_ids = set([x.id for x in new_session.query(NewReference).all()]) key_to_go = dict([(x.unique_key(), x) for x in new_session.query(NewGo).all()]) already_used_keys = set() min_id = old_session.query(func.min(OldGoRef.id)).first()[0] count = old_session.query(func.max(OldGoRef.id)).first()[0] - min_id num_chunks = ceil(1.0*count/chunk_size) for i in range(0, num_chunks): #Grab all current objects current_objs = new_session.query(NewGoevidence).filter(NewGoevidence.id >= create_evidence_id(min_id)).filter(NewGoevidence.id < create_evidence_id(min_id+chunk_size)).all() id_to_current_obj = dict([(x.id, x) for x in current_objs]) key_to_current_obj = dict([(x.unique_key(), x) for x in current_objs]) untouched_obj_ids = set(id_to_current_obj.keys()) #Grab old objects old_objs = old_session.query(OldGoRef).filter( OldGoRef.id >= min_id).filter( OldGoRef.id < min_id+chunk_size).options( joinedload('go_annotation')).all() for old_obj in old_objs: #Convert old objects into new ones newly_created_objs = create_evidence(old_obj, key_to_go, reference_ids, bioent_ids) if newly_created_objs is not None: #Edit or add new objects for newly_created_obj in newly_created_objs: key = newly_created_obj.unique_key() if key not in already_used_keys: current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id] current_obj_by_key = None if key not in key_to_current_obj else key_to_current_obj[key] create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator) already_used_keys.add(key) if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_id.id) if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_key.id) #Delete untouched objs for untouched_obj_id in untouched_obj_ids: new_session.delete(id_to_current_obj[untouched_obj_id]) output_creator.removed() #Commit output_creator.finished(str(i+1) + "/" + str(int(num_chunks))) new_session.commit() min_id = min_id+chunk_size except Exception: log.exception('Unexpected error:' + str(sys.exc_info()[0])) finally: new_session.close() old_session.close() log.info('complete')
def convert_bibentry(new_session_maker, chunk_size): from model_new_schema.reference import Reference as NewReference, Bibentry as NewBibentry, \ Journal as NewJournal, Book as NewBook, Abstract as NewAbstract, \ Reftype as NewReftype, Author as NewAuthor, AuthorReference as NewAuthorReference log = logging.getLogger('convert.reference_in_depth.bibentry') log.info('begin') output_creator = OutputCreator(log) try: new_session = new_session_maker() #Values to check values_to_check = ['text'] #Grab cached dictionaries id_to_journal = dict([(x.id, x) for x in new_session.query(NewJournal).all()]) id_to_book = dict([(x.id, x) for x in new_session.query(NewBook).all()]) id_to_abstract = dict([(x.id, x.text) for x in new_session.query(NewAbstract).all()]) id_to_authors = {} id_to_author = dict([(x.id, x) for x in new_session.query(NewAuthor).all()]) for ar in new_session.query(NewAuthorReference).all(): reference_id = ar.reference_id author_name = id_to_author[ar.author_id].display_name if reference_id in id_to_authors: id_to_authors[reference_id].add(author_name) else: id_to_authors[reference_id] = set([author_name]) id_to_reftypes = {} reftypes = new_session.query(NewReftype).all() for reftype in reftypes: reference_id = reftype.reference_id reftype_name = reftype.name if reference_id in id_to_reftypes: id_to_reftypes[reference_id].add(reftype_name) else: id_to_reftypes[reference_id] = set([author_name]) count = new_session.query(func.max(NewReference.id)).first()[0] num_chunks = ceil(1.0*count/chunk_size) min_id = 0 for i in range(0, num_chunks): #Grab all current objects current_objs = new_session.query(NewBibentry).filter(NewBibentry.id >= min_id).filter(NewBibentry.id <= min_id+chunk_size).all() id_to_current_obj = dict([(x.id, x) for x in current_objs]) key_to_current_obj = dict([(x.unique_key(), x) for x in current_objs]) untouched_obj_ids = set(id_to_current_obj.keys()) #Grab old objects old_objs = new_session.query(NewReference).filter( NewReference.id >= min_id).filter( NewReference.id <= min_id+chunk_size).options(joinedload('author_references')).all() for old_obj in old_objs: #Convert old objects into new ones newly_created_objs = create_bibentry(old_obj, id_to_journal, id_to_book, id_to_abstract, id_to_reftypes, id_to_authors) if newly_created_objs is not None: #Edit or add new objects for newly_created_obj in newly_created_objs: current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id] current_obj_by_key = None if newly_created_obj.unique_key() not in key_to_current_obj else key_to_current_obj[newly_created_obj.unique_key()] create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator) if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_id.id) if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_key.id) output_creator.finished(str(i+1) + "/" + str(int(num_chunks))) new_session.commit() min_id = min_id + chunk_size + 1 #Delete untouched objs for untouched_obj_id in untouched_obj_ids: new_session.delete(id_to_current_obj[untouched_obj_id]) output_creator.removed() #Commit output_creator.finished() new_session.commit() except Exception: log.exception('Unexpected error:' + str(sys.exc_info()[0])) finally: new_session.close() log.info('complete')
def convert_evidence(old_session_maker, new_session_maker, chunk_size): from model_new_schema.phenotype import Phenotypeevidence as NewPhenotypeevidence from model_new_schema.reference import Reference as NewReference from model_new_schema.evelement import Experiment as NewExperiment, Strain as NewStrain from model_new_schema.bioentity import Bioentity as NewBioentity from model_new_schema.misc import Allele as NewAllele from model_new_schema.phenotype import Phenotype as NewPhenotype from model_old_schema.reference import Reflink as OldReflink from model_old_schema.phenotype import PhenotypeFeature as OldPhenotypeFeature log = logging.getLogger('convert.phenotype.evidence') log.info('begin') output_creator = OutputCreator(log) try: new_session = new_session_maker() old_session = old_session_maker() #Values to check values_to_check = ['experiment_id', 'reference_id', 'strain_id', 'source', 'bioentity_id', 'bioconcept_id', 'date_created', 'created_by', 'reporter', 'reporter_desc', 'strain_details', 'conditions', 'details', 'experiment_details', 'allele_info', 'allele_id'] #Grab cached dictionaries key_to_experiment = dict([(x.unique_key(), x) for x in new_session.query(NewExperiment).all()]) key_to_phenotype = dict([(x.unique_key(), x) for x in new_session.query(NewPhenotype).all()]) key_to_strain = dict([(x.unique_key(), x) for x in new_session.query(NewStrain).all()]) key_to_allele = dict([(x.unique_key(), x) for x in new_session.query(NewAllele).all()]) bioent_ids = set([x.id for x in new_session.query(NewBioentity).all()]) reference_ids = set([x.id for x in new_session.query(NewReference).all()]) old_reflinks = old_session.query(OldReflink).all() key_to_reflink = dict([((x.col_name, x.primary_key), x) for x in old_reflinks]) min_id = old_session.query(func.min(OldPhenotypeFeature.id)).first()[0] count = old_session.query(func.max(OldPhenotypeFeature.id)).first()[0] - min_id num_chunks = ceil(1.0*count/chunk_size) for i in range(0, num_chunks): #Grab all current objects current_objs = new_session.query(NewPhenotypeevidence).filter(NewPhenotypeevidence.id >= create_evidence_id(min_id)).filter(NewPhenotypeevidence.id < create_evidence_id(min_id+chunk_size)).all() id_to_current_obj = dict([(x.id, x) for x in current_objs]) key_to_current_obj = dict([(x.unique_key(), x) for x in current_objs]) untouched_obj_ids = set(id_to_current_obj.keys()) #Grab old objects old_objs = old_session.query(OldPhenotypeFeature).filter( OldPhenotypeFeature.id >= min_id).filter( OldPhenotypeFeature.id < min_id+chunk_size).options( joinedload('experiment')).all() for old_obj in old_objs: #Convert old objects into new ones newly_created_objs = create_evidence(old_obj, key_to_reflink, key_to_phenotype, reference_ids, bioent_ids, key_to_strain, key_to_experiment, key_to_allele) if newly_created_objs is not None: #Edit or add new objects for newly_created_obj in newly_created_objs: current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id] current_obj_by_key = None if newly_created_obj.unique_key() not in key_to_current_obj else key_to_current_obj[newly_created_obj.unique_key()] create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator) if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_id.id) if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_key.id) #Delete 
untouched objs for untouched_obj_id in untouched_obj_ids: new_session.delete(id_to_current_obj[untouched_obj_id]) output_creator.removed() #Commit output_creator.finished(str(i+1) + "/" + str(int(num_chunks))) new_session.commit() min_id = min_id+chunk_size except Exception: log.exception('Unexpected error:' + str(sys.exc_info()[0])) finally: new_session.close() old_session.close() log.info('complete')
def convert_litevidence(old_session_maker, new_session_maker, chunk_size): from model_new_schema.literature import Literatureevidence as NewLiteratureevidence from model_new_schema.reference import Reference as NewReference from model_new_schema.bioentity import Bioentity as NewBioentity from model_old_schema.reference import LitguideFeat as OldLitguideFeat log = logging.getLogger('convert.literature.evidence') log.info('begin') output_creator = OutputCreator(log) try: new_session = new_session_maker() old_session = old_session_maker() #Values to check values_to_check = ['experiment_id', 'reference_id', 'class_type', 'strain_id', 'source', 'topic', 'bioentity_id', 'date_created', 'created_by'] #Grab cached dictionaries bioent_ids = set([x.id for x in new_session.query(NewBioentity).all()]) reference_ids = set([x.id for x in new_session.query(NewReference).all()]) min_id = old_session.query(func.min(OldLitguideFeat.id)).first()[0] count = old_session.query(func.max(OldLitguideFeat.id)).first()[0] - min_id num_chunks = ceil(1.0*count/chunk_size) for i in range(0, num_chunks): #Grab all current objects current_objs = new_session.query(NewLiteratureevidence).filter(NewLiteratureevidence.id >= create_litevidence_id(min_id)).filter(NewLiteratureevidence.id < create_litevidence_id(min_id+chunk_size)).all() id_to_current_obj = dict([(x.id, x) for x in current_objs]) key_to_current_obj = dict([(x.unique_key(), x) for x in current_objs]) untouched_obj_ids = set(id_to_current_obj.keys()) #Grab old objects old_objs = old_session.query(OldLitguideFeat).filter( OldLitguideFeat.id >= min_id).filter( OldLitguideFeat.id < min_id+chunk_size).filter( or_(OldLitguideFeat.topic=='Additional Literature', OldLitguideFeat.topic=='Primary Literature', OldLitguideFeat.topic=='Omics', OldLitguideFeat.topic=='Reviews')).options( joinedload('litguide')).all() for old_obj in old_objs: #Convert old objects into new ones newly_created_objs = create_litevidence(old_obj, reference_ids, bioent_ids) if newly_created_objs is not None: #Edit or add new objects for newly_created_obj in newly_created_objs: current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id] current_obj_by_key = None if newly_created_obj.unique_key() not in key_to_current_obj else key_to_current_obj[newly_created_obj.unique_key()] create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator) if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_id.id) if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_key.id) #Delete untouched objs for untouched_obj_id in untouched_obj_ids: new_session.delete(id_to_current_obj[untouched_obj_id]) output_creator.removed() #Commit output_creator.finished(str(i+1) + "/" + str(int(num_chunks))) new_session.commit() min_id = min_id+chunk_size except Exception: log.exception('Unexpected error:' + str(sys.exc_info()[0])) finally: new_session.close() old_session.close() log.info('complete')
def convert_domain(new_session_maker, chunk_size): from model_new_schema.protein import Domain as Domain log = logging.getLogger('convert.protein.domain') log.info('begin') output_creator = OutputCreator(log) try: #Grab all current objects new_session = new_session_maker() current_objs = new_session.query(Domain).all() id_to_current_obj = dict([(x.id, x) for x in current_objs]) key_to_current_obj = dict([(x.unique_key(), x) for x in current_objs]) #Values to check values_to_check = ['display_name', 'description', 'interpro_id', 'interpro_description', 'link'] untouched_obj_ids = set(id_to_current_obj.keys()) #Grab old objects data = break_up_file('/Users/kpaskov/final/yeastmine_protein_domains.tsv') used_unique_keys = set() min_id = 0 count = len(data) num_chunks = ceil(1.0*count/chunk_size) for i in range(0, num_chunks): old_objs = data[min_id:min_id+chunk_size] for old_obj in old_objs: #Convert old objects into new ones newly_created_objs = create_domain(old_obj) if newly_created_objs is not None: #Edit or add new objects for newly_created_obj in newly_created_objs: unique_key = newly_created_obj.unique_key() if unique_key not in used_unique_keys: current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id] current_obj_by_key = None if unique_key not in key_to_current_obj else key_to_current_obj[unique_key] create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator) used_unique_keys.add(unique_key) if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_id.id) if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_key.id) output_creator.finished(str(i+1) + "/" + str(int(num_chunks))) new_session.commit() min_id = min_id+chunk_size #Grab JASPAR domains from file old_objs = break_up_file('/Users/kpaskov/final/TF_family_class_accession04302013.txt') for old_obj in old_objs: #Convert old objects into new ones newly_created_objs = create_domain_from_tf_file(old_obj) if newly_created_objs is not None: #Edit or add new objects for newly_created_obj in newly_created_objs: unique_key = newly_created_obj.unique_key() current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id] current_obj_by_key = None if unique_key not in key_to_current_obj else key_to_current_obj[unique_key] create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator) used_unique_keys.add(unique_key) if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_id.id) if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_key.id) output_creator.finished("1/1") new_session.commit() #Delete untouched objs for untouched_obj_id in untouched_obj_ids: new_session.delete(id_to_current_obj[untouched_obj_id]) output_creator.removed() #Commit output_creator.finished() new_session.commit() except Exception: log.exception('Unexpected error:' + str(sys.exc_info()[0])) finally: new_session.close() log.info('complete')
def convert_domain_evidence(new_session_maker, chunk_size): from model_new_schema.protein import Domain, Domainevidence from model_new_schema.bioentity import Bioentity from model_new_schema.reference import Reference log = logging.getLogger('convert.protein.domain_evidence') log.info('begin') output_creator = OutputCreator(log) try: #Grab all current objects new_session = new_session_maker() current_objs = new_session.query(Domainevidence).all() id_to_current_obj = dict([(x.id, x) for x in current_objs]) key_to_current_obj = dict([(x.unique_key(), x) for x in current_objs]) #Values to check values_to_check = ['reference_id', 'strain_id', 'source', 'date_created', 'created_by', 'start', 'end', 'evalue', 'status', 'date_of_run', 'protein_id', 'domain_id'] #Grab cached dictionaries key_to_bioentity = dict([(x.unique_key(), x) for x in new_session.query(Bioentity).all()]) key_to_domain = dict([(x.unique_key(), x) for x in new_session.query(Domain).all()]) pubmed_id_to_reference_id = dict([(x.pubmed_id, x.id) for x in new_session.query(Reference).all()]) untouched_obj_ids = set(id_to_current_obj.keys()) #Grab old objects data = break_up_file('/Users/kpaskov/final/yeastmine_protein_domains.tsv') used_unique_keys = set() j=0 min_id = 0 count = len(data) num_chunks = ceil(1.0*count/chunk_size) for i in range(0, num_chunks): old_objs = data[min_id:min_id+chunk_size] for old_obj in old_objs: #Convert old objects into new ones newly_created_objs = create_domain_evidence(old_obj, j, key_to_bioentity, key_to_domain) if newly_created_objs is not None: #Edit or add new objects for newly_created_obj in newly_created_objs: unique_key = (newly_created_obj.protein_id, newly_created_obj.domain_id, newly_created_obj.start, newly_created_obj.end, newly_created_obj.evalue) if unique_key not in used_unique_keys: current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id] current_obj_by_key = None if newly_created_obj.id not in key_to_current_obj else key_to_current_obj[newly_created_obj.id] create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator) used_unique_keys.add(unique_key) if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_id.id) if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_key.id) j = j+1 output_creator.finished(str(i+1) + "/" + str(int(num_chunks))) new_session.commit() min_id = min_id+chunk_size #Grab JASPAR evidence from file old_objs = break_up_file('/Users/kpaskov/final/TF_family_class_accession04302013.txt') for old_obj in old_objs: #Convert old objects into new ones newly_created_objs = create_domain_evidence_from_tf_file(old_obj, j, key_to_bioentity, key_to_domain, pubmed_id_to_reference_id) if newly_created_objs is not None: #Edit or add new objects for newly_created_obj in newly_created_objs: unique_key = newly_created_obj.unique_key() if unique_key not in used_unique_keys: current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id] current_obj_by_key = None if unique_key not in key_to_current_obj else key_to_current_obj[unique_key] create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator) used_unique_keys.add(unique_key) if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids: 
untouched_obj_ids.remove(current_obj_by_id.id) if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_key.id) j = j+1 output_creator.finished("1/1") new_session.commit() #Delete untouched objs for untouched_obj_id in untouched_obj_ids: new_session.delete(id_to_current_obj[untouched_obj_id]) output_creator.removed() #Commit output_creator.finished() new_session.commit() except Exception: log.exception('Unexpected error:' + str(sys.exc_info()[0])) finally: new_session.close() log.info('complete')
def convert_bioentity_reference(new_session_maker, evidence_class, class_type, label, chunk_size, get_bioent_ids_f, filter_f=None): from model_new_schema.auxiliary import BioentityReference from model_new_schema.bioentity import Paragraph log = logging.getLogger(label) log.info('begin') output_creator = OutputCreator(log) try: new_session = new_session_maker() #Values to check values_to_check = [] #Grab all current objects current_objs = new_session.query(BioentityReference).filter(BioentityReference.class_type == class_type).all() id_to_current_obj = dict([(x.id, x) for x in current_objs]) key_to_current_obj = dict([(x.unique_key(), x) for x in current_objs]) untouched_obj_ids = set(id_to_current_obj.keys()) used_unique_keys = set() min_id = new_session.query(func.min(evidence_class.id)).first()[0] count = new_session.query(func.max(evidence_class.id)).first()[0] - min_id num_chunks = ceil(1.0*count/chunk_size) for i in range(0, num_chunks): old_objs = new_session.query(evidence_class).filter(evidence_class.id >= min_id, evidence_class.id <= min_id+chunk_size).all() for old_obj in old_objs: if filter_f is None or filter_f(old_obj): #Convert old objects into new ones newly_created_objs = create_bioentity_reference(old_obj, get_bioent_ids_f, class_type) if newly_created_objs is not None: #Edit or add new objects for newly_created_obj in newly_created_objs: unique_key = newly_created_obj.unique_key() if unique_key not in used_unique_keys: current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id] current_obj_by_key = None if unique_key not in key_to_current_obj else key_to_current_obj[unique_key] create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator) used_unique_keys.add(unique_key) if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_id.id) if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_key.id) output_creator.finished(str(i+1) + "/" + str(int(num_chunks))) new_session.commit() min_id = min_id+chunk_size #Add paragraph-related bioent_references. 
old_objs = new_session.query(Paragraph).filter(Paragraph.class_type == class_type).options(joinedload('paragraph_references')).all() for old_obj in old_objs: if filter_f is None or filter_f(old_obj): #Convert old objects into new ones newly_created_objs = create_bioentity_reference_from_paragraph(old_obj, class_type) if newly_created_objs is not None: #Edit or add new objects for newly_created_obj in newly_created_objs: unique_key = newly_created_obj.unique_key() if unique_key not in used_unique_keys: current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id] current_obj_by_key = None if unique_key not in key_to_current_obj else key_to_current_obj[unique_key] create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator) used_unique_keys.add(unique_key) if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_id.id) if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids: untouched_obj_ids.remove(current_obj_by_key.id) #Delete untouched objs for untouched_obj_id in untouched_obj_ids: new_session.delete(id_to_current_obj[untouched_obj_id]) output_creator.removed() #Commit output_creator.finished() new_session.commit() except Exception: log.exception('Unexpected error:' + str(sys.exc_info()[0])) finally: new_session.close() log.info('complete')
def convert_abstract(old_session_maker, new_session_maker, chunk_size):
    from model_new_schema.reference import Reference as NewReference, Abstract as NewAbstract
    from model_old_schema.reference import Reference as OldReference

    log = logging.getLogger('convert.reference_in_depth.abstract')
    log.info('begin')
    output_creator = OutputCreator(log)

    try:
        new_session = new_session_maker()
        old_session = old_session_maker()

        #Values to check
        values_to_check = ['text']

        #Grab cached dictionaries
        reference_ids = set([x.id for x in new_session.query(NewReference).all()])

        count = old_session.query(func.max(OldReference.id)).first()[0]
        num_chunks = int(ceil(1.0*count/chunk_size))
        min_id = 0
        for i in range(0, num_chunks):
            #Grab all current objects
            current_objs = new_session.query(NewAbstract).filter(NewAbstract.id >= min_id).filter(NewAbstract.id <= min_id+chunk_size).all()
            id_to_current_obj = dict([(x.id, x) for x in current_objs])
            key_to_current_obj = dict([(x.unique_key(), x) for x in current_objs])

            untouched_obj_ids = set(id_to_current_obj.keys())

            #Grab old objects
            old_objs = old_session.query(OldReference).filter(
                                OldReference.id >= min_id).filter(
                                OldReference.id <= min_id+chunk_size).options(
                                joinedload('abst')).all()

            for old_obj in old_objs:
                #Convert old objects into new ones
                newly_created_objs = create_abstract(old_obj, reference_ids)

                if newly_created_objs is not None:
                    #Edit or add new objects
                    for newly_created_obj in newly_created_objs:
                        current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id]
                        current_obj_by_key = None if newly_created_obj.unique_key() not in key_to_current_obj else key_to_current_obj[newly_created_obj.unique_key()]
                        create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator)

                        if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids:
                            untouched_obj_ids.remove(current_obj_by_id.id)
                        if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids:
                            untouched_obj_ids.remove(current_obj_by_key.id)

            output_creator.finished(str(i+1) + "/" + str(num_chunks))
            new_session.commit()
            min_id = min_id + chunk_size + 1

        #Delete untouched objs (note: id_to_current_obj and untouched_obj_ids are rebuilt per chunk,
        #so only leftovers from the final chunk reach this point)
        for untouched_obj_id in untouched_obj_ids:
            new_session.delete(id_to_current_obj[untouched_obj_id])
            output_creator.removed()

        #Commit
        output_creator.finished()
        new_session.commit()

    except Exception:
        log.exception('Unexpected error:' + str(sys.exc_info()[0]))
    finally:
        new_session.close()
        old_session.close()
    log.info('complete')

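# convert_abstract walks ids in inclusive windows [min_id, min_id + chunk_size] and then advances
# min_id by chunk_size + 1, so consecutive windows do not overlap. A minimal standalone sketch of
# that window arithmetic, for reference only (it is not called by the converters above):
def _inclusive_id_windows(max_id, chunk_size):
    """Yield non-overlapping (low, high) inclusive id windows covering 0..max_id."""
    min_id = 0
    num_chunks = int(ceil(1.0 * max_id / chunk_size))
    for _ in range(num_chunks):
        yield min_id, min_id + chunk_size
        min_id = min_id + chunk_size + 1
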
def convert_reference(old_session_maker, new_session_maker, chunk_size):
    from model_new_schema.reference import Reference as NewReference, Book as NewBook, Journal as NewJournal
    from model_old_schema.reference import Reference as OldReference

    log = logging.getLogger('convert.reference.reference')
    log.info('begin')
    output_creator = OutputCreator(log)

    try:
        #Grab all current objects
        new_session = new_session_maker()

        #Values to check
        values_to_check = ['display_name', 'format_name', 'link', 'source', 'status', 'pubmed_id', 'pubmed_central_id',
                           'pdf_status', 'year', 'date_published', 'date_revised', 'issue', 'page', 'volume', 'title',
                           'journal_id', 'book_id', 'doi', 'created_by', 'date_created']

        #Grab cached dictionaries
        key_to_journal = dict([(x.unique_key(), x) for x in new_session.query(NewJournal).all()])
        key_to_book = dict([(x.unique_key(), x) for x in new_session.query(NewBook).all()])

        #Grab old objects
        old_session = old_session_maker()

        used_unique_keys = set()

        count = old_session.query(func.max(OldReference.id)).first()[0]
        num_chunks = int(ceil(1.0*count/chunk_size))
        min_id = 0
        for i in range(0, num_chunks):
            #Grab all current objects
            current_objs = new_session.query(NewReference).filter(NewReference.id >= min_id).filter(NewReference.id <= min_id+chunk_size).all()
            id_to_current_obj = dict([(x.id, x) for x in current_objs])
            key_to_current_obj = dict([(x.unique_key(), x) for x in current_objs])

            untouched_obj_ids = set(id_to_current_obj.keys())

            #Grab old objects
            old_objs = old_session.query(OldReference).filter(
                                OldReference.id >= min_id).filter(
                                OldReference.id <= min_id+chunk_size).options(
                                joinedload('book'), joinedload('journal')).all()

            old_pubmed_ids = [x.pubmed_id for x in old_objs if x.pubmed_id is not None]
            pubmed_id_to_pubmed_central_id = get_pubmed_central_ids(old_pubmed_ids)

            for old_obj in old_objs:
                #Convert old objects into new ones
                newly_created_objs = create_reference(old_obj, key_to_journal, key_to_book, pubmed_id_to_pubmed_central_id)

                if newly_created_objs is not None:
                    #Edit or add new objects
                    for newly_created_obj in newly_created_objs:
                        unique_key = newly_created_obj.unique_key()
                        if unique_key not in used_unique_keys:
                            current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id]
                            current_obj_by_key = None if unique_key not in key_to_current_obj else key_to_current_obj[unique_key]
                            create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator)

                            if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids:
                                untouched_obj_ids.remove(current_obj_by_id.id)
                            if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids:
                                untouched_obj_ids.remove(current_obj_by_key.id)
                            used_unique_keys.add(unique_key)

            output_creator.finished(str(i+1) + "/" + str(num_chunks))
            new_session.commit()
            min_id = min_id + chunk_size + 1

        #Delete untouched objs
        for untouched_obj_id in untouched_obj_ids:
            new_session.delete(id_to_current_obj[untouched_obj_id])
            output_creator.removed()

        #Commit
        output_creator.finished()
        new_session.commit()

    except Exception:
        log.exception('Unexpected error:' + str(sys.exc_info()[0]))
    finally:
        new_session.close()
        old_session.close()
    log.info('complete')

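# convert_reference resolves PubMed Central ids in bulk through get_pubmed_central_ids before
# creating each chunk of references. The variable name suggests that helper returns a dict keyed
# by pubmed id; assuming that shape, a stand-in like the one below could be swapped in for an
# offline dry run. This stub is purely hypothetical and is not the real implementation.
def _stub_get_pubmed_central_ids(pubmed_ids):
    return dict((pubmed_id, None) for pubmed_id in pubmed_ids)
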
def convert_url(old_session_maker, new_session_maker, chunk_size):
    from model_new_schema.bioentity import Bioentity as NewBioentity, Bioentityurl as NewBioentityurl
    from model_old_schema.general import WebDisplay as OldWebDisplay, FeatUrl as OldFeatUrl, DbxrefFeat as OldDbxrefFeat

    log = logging.getLogger('convert.bioentity_in_depth.bioentity_url')
    log.info('begin')
    output_creator = OutputCreator(log)

    try:
        new_session = new_session_maker()
        old_session = old_session_maker()

        #Values to check
        values_to_check = ['display_name', 'source', 'created_by', 'date_created']

        #Grab cached dictionaries
        id_to_bioentity = dict([(x.id, x) for x in new_session.query(NewBioentity).all()])

        #Urls of interest
        old_web_displays = old_session.query(OldWebDisplay).filter(OldWebDisplay.label_location == 'Interaction Resources').all()
        url_to_display = dict([(x.url_id, x) for x in old_web_displays])

        count = max(id_to_bioentity.keys())
        num_chunks = int(ceil(1.0*count/chunk_size))
        min_id = 0
        for i in range(0, num_chunks):
            #Grab all current objects
            current_objs = new_session.query(NewBioentityurl).filter(NewBioentityurl.bioentity_id >= min_id).filter(NewBioentityurl.bioentity_id < min_id+chunk_size).all()
            id_to_current_obj = dict([(x.id, x) for x in current_objs])
            key_to_current_obj = dict([(x.unique_key(), x) for x in current_objs])

            untouched_obj_ids = set(id_to_current_obj.keys())

            #Grab old objects
            old_objs = old_session.query(OldFeatUrl).filter(OldFeatUrl.feature_id >= min_id).filter(OldFeatUrl.feature_id < min_id+chunk_size).options(joinedload('url')).all()

            for old_obj in old_objs:
                #Convert old objects into new ones
                if old_obj.url_id in url_to_display:
                    newly_created_objs = create_url(old_obj, url_to_display[old_obj.url_id], id_to_bioentity)

                    if newly_created_objs is not None:
                        #Edit or add new objects
                        for newly_created_obj in newly_created_objs:
                            current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id]
                            current_obj_by_key = None if newly_created_obj.unique_key() not in key_to_current_obj else key_to_current_obj[newly_created_obj.unique_key()]
                            create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator)

                            if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids:
                                untouched_obj_ids.remove(current_obj_by_id.id)
                            if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids:
                                untouched_obj_ids.remove(current_obj_by_key.id)

            #Grab old objects (dbxref)
            old_objs = old_session.query(OldDbxrefFeat).filter(
                                OldDbxrefFeat.feature_id >= min_id).filter(
                                OldDbxrefFeat.feature_id < min_id+chunk_size).options(
                                joinedload('dbxref'), joinedload('dbxref.dbxref_urls')).all()

            for old_obj in old_objs:
                #Convert old objects into new ones
                newly_created_objs = create_url_from_dbxref(old_obj, url_to_display, id_to_bioentity)

                if newly_created_objs is not None:
                    #Edit or add new objects
                    for newly_created_obj in newly_created_objs:
                        current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id]
                        current_obj_by_key = None if newly_created_obj.unique_key() not in key_to_current_obj else key_to_current_obj[newly_created_obj.unique_key()]
                        create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator)

                        if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids:
                            untouched_obj_ids.remove(current_obj_by_id.id)
                        if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids:
                            untouched_obj_ids.remove(current_obj_by_key.id)

            #Delete untouched objs
            for untouched_obj_id in untouched_obj_ids:
                new_session.delete(id_to_current_obj[untouched_obj_id])
                output_creator.removed()

            output_creator.finished(str(i+1) + "/" + str(num_chunks))
            new_session.commit()
            min_id = min_id + chunk_size

    except Exception:
        log.exception('Unexpected error:' + str(sys.exc_info()[0]))
    finally:
        new_session.close()
        old_session.close()
    log.info('complete')

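# convert_url only keeps urls whose WebDisplay row is labelled 'Interaction Resources'. If other
# display locations ever needed the same treatment, the lookup could be built for several labels
# at once; a small sketch under that assumption (the labels passed in are up to the caller, and
# this helper is not used by the converter above):
def _build_url_to_display(old_session, label_locations):
    from model_old_schema.general import WebDisplay as OldWebDisplay
    displays = old_session.query(OldWebDisplay).filter(OldWebDisplay.label_location.in_(label_locations)).all()
    return dict((x.url_id, x) for x in displays)
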
def convert_experiment_alias(old_session_maker, new_session_maker):
    from model_new_schema.evelement import Experiment as NewExperiment, Experimentalias as NewExperimentalias
    from model_old_schema.cv import CVTerm as OldCVTerm

    log = logging.getLogger('convert.evelements.experiment_alias')
    log.info('begin')
    output_creator = OutputCreator(log)

    try:
        #Grab all current objects
        new_session = new_session_maker()
        current_objs = new_session.query(NewExperimentalias).all()
        id_to_current_obj = dict([(x.id, x) for x in current_objs])
        key_to_current_obj = dict([(x.unique_key(), x) for x in current_objs])

        #Values to check
        values_to_check = ['source', 'category', 'date_created', 'created_by']

        untouched_obj_ids = set(id_to_current_obj.keys())

        #Grab cached dictionaries
        key_to_experiment = dict([(x.unique_key(), x) for x in new_session.query(NewExperiment).all()])

        #Grab old objects
        old_session = old_session_maker()
        old_objs = old_session.query(OldCVTerm).filter(OldCVTerm.cv_no==7).options(
                            joinedload('cv_dbxrefs'), joinedload('cv_dbxrefs.dbxref')).all()

        for old_obj in old_objs:
            #Convert old objects into new ones
            newly_created_objs = create_experiment_alias(old_obj, key_to_experiment)

            if newly_created_objs is not None:
                #Edit or add new objects
                for newly_created_obj in newly_created_objs:
                    current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id]
                    current_obj_by_key = None if newly_created_obj.unique_key() not in key_to_current_obj else key_to_current_obj[newly_created_obj.unique_key()]
                    create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator)

                    if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids:
                        untouched_obj_ids.remove(current_obj_by_id.id)
                    if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids:
                        untouched_obj_ids.remove(current_obj_by_key.id)

        #Delete untouched objs
        for untouched_obj_id in untouched_obj_ids:
            new_session.delete(id_to_current_obj[untouched_obj_id])
            output_creator.removed()

        #Commit
        output_creator.finished()
        new_session.commit()

    except Exception:
        log.exception('Unexpected error:' + str(sys.exc_info()[0]))
    finally:
        new_session.close()
        old_session.close()
    log.info('complete')

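# The non-chunked converters above all share the same synchronisation skeleton: match each newly
# built object against the current rows by id and by unique key, create_or_update it, and finally
# delete whatever was never touched. The condensed sketch below restates that skeleton in one
# place; it is a description of the pattern, not a helper the converters actually call.
def _sync_objects(new_session, current_objs, new_objs, values_to_check, output_creator):
    id_to_current_obj = dict((x.id, x) for x in current_objs)
    key_to_current_obj = dict((x.unique_key(), x) for x in current_objs)
    untouched_obj_ids = set(id_to_current_obj.keys())

    for new_obj in new_objs:
        current_obj_by_id = id_to_current_obj.get(new_obj.id)
        current_obj_by_key = key_to_current_obj.get(new_obj.unique_key())
        create_or_update(new_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator)

        for current_obj in (current_obj_by_id, current_obj_by_key):
            if current_obj is not None and current_obj.id in untouched_obj_ids:
                untouched_obj_ids.remove(current_obj.id)

    #Delete untouched objs
    for untouched_obj_id in untouched_obj_ids:
        new_session.delete(id_to_current_obj[untouched_obj_id])
        output_creator.removed()
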
def convert_evidence_chemical(old_session_maker, new_session_maker, chunk_size):
    from model_new_schema.phenotype import Phenotypeevidence as NewPhenotypeevidence
    from model_new_schema.chemical import Chemical as NewChemical
    from model_new_schema.evidence import EvidenceChemical as NewEvidenceChemical
    from model_old_schema.phenotype import PhenotypeFeature as OldPhenotypeFeature

    log = logging.getLogger('convert.phenotype.evidence_chemical')
    log.info('begin')
    output_creator = OutputCreator(log)

    try:
        new_session = new_session_maker()
        old_session = old_session_maker()

        #Values to check
        values_to_check = ['chemical_amt']

        #Grab cached dictionaries
        key_to_chemical = dict([(x.unique_key(), x) for x in new_session.query(NewChemical).all()])

        min_id = old_session.query(func.min(OldPhenotypeFeature.id)).first()[0]
        count = old_session.query(func.max(OldPhenotypeFeature.id)).first()[0] - min_id
        num_chunks = int(ceil(1.0*count/chunk_size))
        for i in range(0, num_chunks):
            #Grab all current objects
            current_objs = new_session.query(NewEvidenceChemical).filter(NewEvidenceChemical.evidence_id >= create_evidence_id(min_id)).filter(NewEvidenceChemical.evidence_id < create_evidence_id(min_id+chunk_size)).all()
            id_to_current_obj = dict([(x.id, x) for x in current_objs])
            key_to_current_obj = dict([(x.unique_key(), x) for x in current_objs])

            id_to_evidence = dict([(x.id, x) for x in new_session.query(NewPhenotypeevidence).filter(NewPhenotypeevidence.id >= create_evidence_id(min_id)).filter(NewPhenotypeevidence.id < create_evidence_id(min_id+chunk_size)).all()])

            untouched_obj_ids = set(id_to_current_obj.keys())

            #Grab old objects
            old_objs = old_session.query(OldPhenotypeFeature).filter(
                                OldPhenotypeFeature.id >= min_id).filter(
                                OldPhenotypeFeature.id < min_id+chunk_size).options(
                                joinedload('experiment')).all()

            for old_obj in old_objs:
                #Convert old objects into new ones
                newly_created_objs = create_evidence_chemical(old_obj, key_to_chemical, id_to_evidence)

                if newly_created_objs is not None:
                    #Edit or add new objects
                    for newly_created_obj in newly_created_objs:
                        current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id]
                        current_obj_by_key = None if newly_created_obj.unique_key() not in key_to_current_obj else key_to_current_obj[newly_created_obj.unique_key()]
                        create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator)

                        if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids:
                            untouched_obj_ids.remove(current_obj_by_id.id)
                        if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids:
                            untouched_obj_ids.remove(current_obj_by_key.id)

            #Delete untouched objs
            for untouched_obj_id in untouched_obj_ids:
                new_session.delete(id_to_current_obj[untouched_obj_id])
                output_creator.removed()

            #Commit
            output_creator.finished(str(i+1) + "/" + str(num_chunks))
            new_session.commit()
            min_id = min_id+chunk_size

    except Exception:
        log.exception('Unexpected error:' + str(sys.exc_info()[0]))
    finally:
        new_session.close()
        old_session.close()
    log.info('complete')

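# The chunk filters in convert_evidence_chemical appear to assume that create_evidence_id maps old
# phenotype-feature ids to evidence ids monotonically, so that a window of old ids corresponds to
# a window of evidence ids. A quick sanity check one could run over a sample of ids if that
# assumption ever needs verifying (the sample is supplied by the caller; illustrative only):
def _check_evidence_id_monotonic(sample_ids):
    evidence_ids = [create_evidence_id(x) for x in sorted(sample_ids)]
    return all(a < b for a, b in zip(evidence_ids, evidence_ids[1:]))
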
def convert_interaction_family(new_session_maker, chunk_size):
    from model_new_schema.auxiliary import Interaction, InteractionFamily
    from model_new_schema.bioentity import Bioentity

    log = logging.getLogger('convert.interaction.interaction_family')
    log.info('begin')
    output_creator = OutputCreator(log)

    try:
        new_session = new_session_maker()

        #Values to check
        values_to_check = ['bioentity1_id', 'bioentity2_id', 'genetic_ev_count', 'physical_ev_count', 'evidence_count']

        #Grab cached dictionaries
        id_to_bioent = dict([(x.id, x) for x in new_session.query(Bioentity).all()])

        #Grab old objs
        interactions = new_session.query(Interaction).filter(or_(Interaction.class_type == 'PHYSINTERACTION', Interaction.class_type == 'GENINTERACTION')).all()

        bioent_id_to_evidence_cutoff, bioent_id_to_neighbor_ids, edge_to_counts = interaction_family_precomp(interactions, 100, id_to_bioent)

        min_id = 0
        count = new_session.query(func.max(Bioentity.id)).first()[0]
        num_chunks = int(ceil(1.0*count/chunk_size))
        for i in range(0, num_chunks):
            #Grab all current objects
            current_objs = new_session.query(InteractionFamily).filter(InteractionFamily.bioentity_id >= min_id).filter(InteractionFamily.bioentity_id < min_id + chunk_size).all()
            id_to_current_obj = dict([(x.id, x) for x in current_objs])
            key_to_current_obj = dict([(x.unique_key(), x) for x in current_objs])

            untouched_obj_ids = set(id_to_current_obj.keys())

            old_objs = new_session.query(Bioentity).filter(Bioentity.id >= min_id).filter(Bioentity.id < min_id+chunk_size).all()

            for old_obj in old_objs:
                #Convert old objects into new ones
                evidence_cutoff = bioent_id_to_evidence_cutoff[old_obj.id]
                newly_created_objs = create_interaction_family(old_obj, evidence_cutoff, bioent_id_to_neighbor_ids, edge_to_counts, id_to_bioent)

                if newly_created_objs is not None:
                    #Edit or add new objects
                    for newly_created_obj in newly_created_objs:
                        unique_key = newly_created_obj.unique_key()
                        current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id]
                        current_obj_by_key = None if unique_key not in key_to_current_obj else key_to_current_obj[unique_key]
                        create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator)

                        if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids:
                            untouched_obj_ids.remove(current_obj_by_id.id)
                        if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids:
                            untouched_obj_ids.remove(current_obj_by_key.id)

            #Delete untouched objs
            for untouched_obj_id in untouched_obj_ids:
                new_session.delete(id_to_current_obj[untouched_obj_id])
                output_creator.removed()

            output_creator.finished(str(i+1) + "/" + str(num_chunks))
            new_session.commit()
            min_id = min_id+chunk_size

    except Exception:
        log.exception('Unexpected error:' + str(sys.exc_info()[0]))
    finally:
        new_session.close()
    log.info('complete')

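# interaction_family_precomp is defined elsewhere; from how its outputs are consumed above, it
# appears to provide, per bioentity, an evidence cutoff, a neighbor set, and per-edge counts. The
# generic sketch below shows one way a neighbor map could be built from
# (bioentity1_id, bioentity2_id) pairs; it only illustrates the assumed data shape and is not the
# actual precomputation.
def _neighbor_map(edges):
    neighbors = {}
    for bioent1_id, bioent2_id in edges:
        neighbors.setdefault(bioent1_id, set()).add(bioent2_id)
        neighbors.setdefault(bioent2_id, set()).add(bioent1_id)
    return neighbors
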
def convert_allele(old_session_maker, new_session_maker):
    from model_new_schema.phenotype import Allele as NewAllele
    from model_old_schema.phenotype import PhenotypeFeature as OldPhenotypeFeature

    log = logging.getLogger('convert.chemical.allele')
    log.info('begin')
    output_creator = OutputCreator(log)

    try:
        new_session = new_session_maker()
        old_session = old_session_maker()

        #Grab all current objects
        current_objs = new_session.query(NewAllele).all()
        id_to_current_obj = dict([(x.id, x) for x in current_objs])
        key_to_current_obj = dict([(x.unique_key(), x) for x in current_objs])

        #Values to check
        values_to_check = ['description', 'display_name']

        untouched_obj_ids = set(id_to_current_obj.keys())
        keys_already_seen = set()

        #Grab old objects
        old_objs = old_session.query(OldPhenotypeFeature).all()

        for old_obj in old_objs:
            #Convert old objects into new ones
            newly_created_objs = create_allele(old_obj)

            if newly_created_objs is not None:
                #Edit or add new objects
                for newly_created_obj in newly_created_objs:
                    key = newly_created_obj.unique_key()
                    if key not in keys_already_seen:
                        current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id]
                        current_obj_by_key = None if key not in key_to_current_obj else key_to_current_obj[key]
                        create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator)
                        keys_already_seen.add(key)

                        if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids:
                            untouched_obj_ids.remove(current_obj_by_id.id)
                        if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids:
                            untouched_obj_ids.remove(current_obj_by_key.id)

        #Delete untouched objs
        for untouched_obj_id in untouched_obj_ids:
            new_session.delete(id_to_current_obj[untouched_obj_id])
            output_creator.removed()

        #Commit
        output_creator.finished()
        new_session.commit()

    except Exception:
        log.exception('Unexpected error:' + str(sys.exc_info()[0]))
    finally:
        new_session.close()
        old_session.close()
    log.info('complete')

def convert_genetic_interevidence(old_session_maker, new_session_maker, chunk_size):
    from model_new_schema.interaction import Geninteractionevidence as NewGeninteractionevidence
    from model_new_schema.reference import Reference as NewReference
    from model_new_schema.evelement import Experiment as NewExperiment
    from model_new_schema.bioentity import Bioentity as NewBioentity
    from model_new_schema.phenotype import Phenotype as NewPhenotype
    from model_old_schema.interaction import Interaction as OldInteraction

    log = logging.getLogger('convert.genetic_interaction.evidence')
    log.info('begin')
    output_creator = OutputCreator(log)

    try:
        new_session = new_session_maker()
        old_session = old_session_maker()

        #Values to check
        values_to_check = ['experiment_id', 'reference_id', 'strain_id', 'source', 'bioentity1_id', 'bioentity2_id',
                           'phenotype_id', 'note', 'annotation_type', 'date_created', 'created_by']

        #Grab cached dictionaries
        key_to_experiment = dict([(x.unique_key(), x) for x in new_session.query(NewExperiment).all()])
        key_to_phenotype = dict([(x.unique_key(), x) for x in new_session.query(NewPhenotype).all()])
        bioent_ids = dict([(x.unique_key(), x) for x in new_session.query(NewBioentity).all()])
        reference_ids = set([x.id for x in new_session.query(NewReference).all()])

        min_id = old_session.query(func.min(OldInteraction.id)).first()[0]
        count = old_session.query(func.max(OldInteraction.id)).first()[0] - min_id
        num_chunks = int(ceil(1.0*count/chunk_size))
        for i in range(0, num_chunks):
            #Grab all current objects
            current_objs = new_session.query(NewGeninteractionevidence).filter(NewGeninteractionevidence.id >= create_genetic_evidence_id(min_id)).filter(NewGeninteractionevidence.id < create_genetic_evidence_id(min_id+chunk_size)).all()
            id_to_current_obj = dict([(x.id, x) for x in current_objs])
            key_to_current_obj = dict([(x.unique_key(), x) for x in current_objs])

            untouched_obj_ids = set(id_to_current_obj.keys())

            #Grab old objects
            old_objs = old_session.query(OldInteraction).filter(
                                OldInteraction.id >= min_id).filter(
                                OldInteraction.id < min_id+chunk_size).options(
                                joinedload('interaction_references'), joinedload('interaction_phenotypes'),
                                joinedload('feature_interactions')).all()

            for old_obj in old_objs:
                #Convert old objects into new ones
                newly_created_objs = create_genetic_interevidence(old_obj, key_to_experiment, key_to_phenotype, reference_ids, bioent_ids)

                if newly_created_objs is not None:
                    #Edit or add new objects
                    for newly_created_obj in newly_created_objs:
                        current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id]
                        current_obj_by_key = None if newly_created_obj.unique_key() not in key_to_current_obj else key_to_current_obj[newly_created_obj.unique_key()]
                        create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator)

                        if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids:
                            untouched_obj_ids.remove(current_obj_by_id.id)
                        if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids:
                            untouched_obj_ids.remove(current_obj_by_key.id)

            #Delete untouched objs
            for untouched_obj_id in untouched_obj_ids:
                new_session.delete(id_to_current_obj[untouched_obj_id])
                output_creator.removed()

            #Commit
            output_creator.finished(str(i+1) + "/" + str(num_chunks))
            new_session.commit()
            min_id = min_id + chunk_size

    except Exception:
        log.exception('Unexpected error:' + str(sys.exc_info()[0]))
    finally:
        new_session.close()
        old_session.close()
    log.info('complete')
