import json
import re
from datetime import datetime

import transaction

# NOTE: the model classes, DBSession, and helpers below are assumed to come
# from this application's own packages; adjust the import paths to match the
# actual project layout.
from src.models import (CuratorActivity, DBSession, Locusdbentity,
                        Locussummary, LocussummaryReference,
                        Referencedbentity, Referencedocument)
from src.helpers import link_gene_names
# create_bibentry, ban_from_cache, and SGD_SOURCE_ID are expected to be
# defined elsewhere in this module or imported alongside the helpers above.

def insert_abstract(pmid, reference_id, record, source_id, journal_abbrev,
                    journal_title, issn_print, created_by):
    """ Add abstract to Referencedocument table

    This method does not return anything; it just performs the necessary
    CRUD operations.

    Parameters
    ----------
    pmid: int
    reference_id: int
    record: dict
        MEDLINE-style record; the abstract is read from the 'AB' key
    source_id: int
    journal_abbrev: str
    journal_title: str
    issn_print: str
    created_by: str

    Returns
    -------
    None
    """
    text = record.get('AB', '')
    if text == '':
        return
    locus_names_ids = DBSession.query(Locusdbentity.display_name,
                                      Locusdbentity.sgdid).all()
    html = link_gene_names(text, locus_names_ids)
    x = Referencedocument(document_type='Abstract',
                          source_id=source_id,
                          reference_id=reference_id,
                          text=text,
                          html=html,
                          created_by=created_by)
    DBSession.add(x)
    entries = create_bibentry(pmid, record, journal_abbrev, journal_title,
                              issn_print)
    # build the Medline document once and reuse it for both text and html
    medline_text = '\n'.join([
        key + ' - ' + str(value) for key, value in entries
        if value is not None
    ])
    y = Referencedocument(document_type='Medline',
                          source_id=source_id,
                          reference_id=reference_id,
                          text=medline_text,
                          html=medline_text,
                          created_by=created_by)
    DBSession.add(y)
    DBSession.flush()
    DBSession.refresh(x)
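
# A minimal usage sketch for insert_abstract(), assuming a MEDLINE-style
# `record` dict (e.g. as parsed by Bio.Medline, where 'AB' holds the
# abstract); every literal value below is a hypothetical placeholder.
#
#   record = {'AB': 'We report a systematic analysis of ...', 'TI': '...'}
#   insert_abstract(pmid=12345678,
#                   reference_id=1000001,
#                   record=record,
#                   source_id=SGD_SOURCE_ID,
#                   journal_abbrev='Yeast',
#                   journal_title='Yeast (Chichester, England)',
#                   issn_print='0749-503X',
#                   created_by='OTTO')
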
def validate_file_content_and_process(file_content, nex_session, username):
    ''' Check file content, process and save to db

    Parameters
    ----------
    file_content: csv.DictReader object
        reader over a TSV file; yields one dict per row
    nex_session: database session object
    username: str
        authorized user to make CRUD operations

    Returns
    -------
    dictionary
        number of inserts, number of updates, and the receipt entries
        (list of dictionaries)

    Notes
    -----
    Accepted summary types: Phenotype, Regulation, Disease, Interaction,
    Sequence, Protein.
    Checks for the correct number of columns in each row and for valid
    gene identifiers and PMIDs.
    '''
    header_literal = [
        '# Feature',
        'Summary Type (phenotype, regulation, disease, interaction, sequence, protein )',
        'Summary', 'PMIDs'
    ]
    accepted_summary_types = [
        'Phenotype', 'Regulation', 'Disease', 'Interaction', 'Sequence',
        'Protein'
    ]
    file_gene_ids = []
    file_pmids = []
    copied = []
    already_used_genes = []
    clear_target_urls = []
    # use regex to match the header keys of each row dictionary
    key_feature = re.compile(r".*feature$", re.IGNORECASE)
    key_summary_type = re.compile(r"^summary type.*", re.IGNORECASE)
    try:
        for item in file_content:
            if len(item) != len(header_literal):
                raise ValueError(
                    'Row or header has incorrect number of columns.')
            # TODO: abstract the loop below in the next release
            gene_id = ''
            summary_type = ''
            for k, v in item.items():
                if key_feature.match(k):
                    gene_id = v
                if key_summary_type.match(k):
                    summary_type = v
            pmid_temp = item.get('PMIDs', None)
            if pmid_temp:
                pmids = str(pmid_temp).replace(' ', '').replace('0.0', '')
            else:
                pmids = ''
            summary_text = item.get('Summary', '')
            if gene_id:
                file_gene_ids.append(gene_id.strip())
            if summary_type:
                gene_id_with_summary = gene_id + summary_type
                if gene_id_with_summary in already_used_genes:
                    raise ValueError(
                        'The same gene summary cannot be updated twice in '
                        'the same file: ' + str(gene_id))
                already_used_genes.append(gene_id_with_summary)
                if summary_type.lower() not in [
                        t.lower() for t in accepted_summary_types
                ]:
                    raise ValueError(
                        'Unaccepted summary type. Must be one of ' +
                        ', '.join(accepted_summary_types))
            if len(pmids) > 0:
                for pmid in re.split(r'\||,', pmids):
                    file_pmids.append(str(pmid))
            copied.append(item)
    except IndexError:
        raise ValueError(
            'The file is not a valid TSV with the correct number of columns. '
            'Check the file and try again.')
    nex_session.execute('SET LOCAL ROLE ' + username)
    # check that gene names are valid
    valid_genes = nex_session.query(Locusdbentity.format_name).filter(
        Locusdbentity.format_name.in_(file_gene_ids)).all()
    valid_genes = [str(d[0]) for d in valid_genes]
    invalid_genes = [d for d in file_gene_ids if d not in valid_genes]
    if len(invalid_genes):
        raise ValueError('Invalid gene identifier: ' +
                         ', '.join(invalid_genes))
    # the last column must contain valid PMIDs, or nothing
    matching_refs = nex_session.query(Referencedbentity).filter(
        Referencedbentity.pmid.in_(file_pmids)).all()
    temp_matching_refs = [str(d.pmid) for d in matching_refs]
    invalid_refs = [d for d in file_pmids if d not in temp_matching_refs]
    if len(invalid_refs):
        raise ValueError('Invalid PMID: ' + ', '.join(invalid_refs) +
                         '. Must be a pipe-separated list of PMIDs from SGD.')
    # update
    receipt_entries = []
    locus_names_ids = nex_session.query(Locusdbentity.display_name,
                                        Locusdbentity.sgdid).all()
    inserts = 0
    updates = 0
    for item in copied:
        if not item:
            continue
        file_id = ''
        file_summary_type = ''
        for k, v in item.items():
            if key_feature.match(k):
                file_id = v
            if key_summary_type.match(k):
                file_summary_type = v
        file_summary_val = item.get('Summary', '')
        file_summary_html = link_gene_names(file_summary_val, locus_names_ids)
        if file_id:
            gene = nex_session.query(Locusdbentity).filter_by(
                format_name=file_id).one_or_none()
            if file_summary_type:
                summaries = nex_session.query(
                    Locussummary.summary_type, Locussummary.summary_id,
                    Locussummary.html, Locussummary.date_created).filter_by(
                        locus_id=gene.dbentity_id,
                        summary_type=file_summary_type).all()
                # update an existing summary or insert a new one
                summary = None
                if len(summaries):
                    summary = summaries[0]
                    nex_session.query(Locussummary).filter_by(
                        summary_id=summary.summary_id).update({
                            'text': file_summary_val,
                            'html': file_summary_html
                        })
                    updates += 1
                else:
                    # normalize e.g. 'phenotype' -> 'Phenotype'
                    mod_summary_type = file_summary_type.capitalize()
                    new_summary = Locussummary(locus_id=gene.dbentity_id,
                                               summary_type=mod_summary_type,
                                               text=file_summary_val,
                                               html=file_summary_html,
                                               created_by=username,
                                               source_id=SGD_SOURCE_ID)
                    nex_session.add(new_summary)
                    inserts += 1
                    summary = nex_session.query(
                        Locussummary.summary_type, Locussummary.summary_id,
                        Locussummary.html,
                        Locussummary.date_created).filter_by(
                            locus_id=gene.dbentity_id,
                            summary_type=mod_summary_type).all()[0]
                # add LocussummaryReference(s)
                if item.get('PMIDs'):
                    pmids = item.get('PMIDs').replace(' ', '')
                else:
                    pmids = ''
                if len(pmids) > 0:
                    for idx, pmid in enumerate(re.split(r'\||,', pmids)):
                        matching_ref = [
                            x for x in matching_refs if x.pmid == int(pmid)
                        ][0]
                        summary_id = summary.summary_id
                        reference_id = matching_ref.dbentity_id
                        order = idx + 1
                        # look for a matching LocussummaryReference
                        matching_locussummary_refs = nex_session.query(
                            LocussummaryReference).filter_by(
                                summary_id=summary_id,
                                reference_id=reference_id).all()
                        if len(matching_locussummary_refs):
                            nex_session.query(
                                LocussummaryReference).filter_by(
                                    summary_id=summary_id,
                                    reference_id=reference_id).update(
                                        {'reference_order': order})
                        else:
                            new_locussummaryref = LocussummaryReference(
                                summary_id=summary_id,
                                reference_id=reference_id,
                                reference_order=order,
                                source_id=SGD_SOURCE_ID,
                                created_by=username)
                            nex_session.add(new_locussummaryref)
                # add receipt
                summary_type_url_segment = file_summary_type.lower()
                if summary_type_url_segment not in [
                        'phenotype', 'regulation', 'interaction', 'sequence',
                        'disease', 'protein'
                ]:
                    summary_type_url_segment = ''
                preview_url = ('/locus/' + gene.sgdid + '/' +
                               summary_type_url_segment)
                clear_target_urls.append(preview_url)
                if summary:
                    summary_obj = {
                        'display_name': gene.format_name,
                        'obj_url': preview_url,
                        'activity_category': summary.summary_type,
                        'json': json.dumps({
                            'summary_data': item,
                            'modified_date': str(datetime.now())
                        }),
                        'created_by': username,
                        'dbentity_id': gene.dbentity_id
                    }
                    message = 'added'
                    new_curate_activity = CuratorActivity(
                        display_name=summary_obj['display_name'],
                        obj_url=summary_obj['obj_url'],
                        activity_category=summary_obj['activity_category'],
                        dbentity_id=summary_obj['dbentity_id'],
                        message=message,
                        json=summary_obj['json'],
                        created_by=summary_obj['created_by'])
                    nex_session.add(new_curate_activity)
                receipt_entries.append({
                    'category': 'locus',
                    'href': preview_url,
                    'name': gene.display_name,
                    'type': file_summary_type,
                    'value': file_summary_val
                })
    transaction.commit()
    nex_session.close()
    if len(clear_target_urls) > 0:
        ban_from_cache(clear_target_urls)
    return {
        'inserts': inserts,
        'updates': updates,
        'entries': receipt_entries
    }
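
# A minimal usage sketch for the DictReader-based function above, assuming
# the upload is a tab-separated file whose header row matches
# `header_literal`; the file name, session, and username are hypothetical
# placeholders.
#
#   import csv
#
#   with open('summaries.tsv') as fh:
#       reader = csv.DictReader(fh, delimiter='\t')
#       result = validate_file_content_and_process(reader, nex_session,
#                                                  'CURATOR_X')
#   print(result['inserts'], result['updates'])
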
def validate_file_content_and_process(file_content, nex_session, username):
    header_literal = [
        '# Feature', 'Summary Type (phenotype, regulation)', 'Summary',
        'PMIDs'
    ]
    accepted_summary_types = ['Phenotype', 'Regulation']
    file_gene_ids = []
    file_pmids = []
    copied = []
    already_used_genes = []
    try:
        for i, val in enumerate(file_content):
            # match header
            if i == 0:
                is_header_match = header_literal == val
                if not is_header_match:
                    raise ValueError(
                        'File header does not match expected format. Please '
                        'make your file match the template file linked below.')
            else:
                gene_id = val[0]
                file_gene_ids.append(gene_id.strip())
                gene_id_with_summary = gene_id + val[1]
                if gene_id_with_summary in already_used_genes:
                    raise ValueError(
                        'The same gene summary cannot be updated twice in '
                        'the same file: ' + str(gene_id))
                already_used_genes.append(gene_id_with_summary)
                # match summary types
                if val[1] not in accepted_summary_types:
                    raise ValueError(
                        'Unaccepted summary type. Must be one of ' +
                        ', '.join(accepted_summary_types))
                # collect PMIDs
                if len(val) == 4:
                    pmids = val[3].replace(' ', '')
                    if len(pmids):
                        for d in re.split(r'\||,', pmids):
                            file_pmids.append(str(d))
            # match length of each row
            if (len(val) != len(header_literal)
                    and len(val) != len(header_literal) - 1):
                raise ValueError('Row has incorrect number of columns.')
            copied.append(val)
    except IndexError:
        raise ValueError(
            'The file is not a valid TSV with the correct number of columns. '
            'Check the file and try again.')
    nex_session.execute('SET LOCAL ROLE ' + username)
    # check that gene names are valid
    valid_genes = nex_session.query(Locusdbentity.format_name).filter(
        Locusdbentity.format_name.in_(file_gene_ids)).all()
    valid_genes = [str(d[0]) for d in valid_genes]
    invalid_genes = [d for d in file_gene_ids if d not in valid_genes]
    if len(invalid_genes):
        raise ValueError('Invalid gene identifier: ' +
                         ', '.join(invalid_genes))
    # the last column must contain valid PMIDs, or nothing
    matching_refs = nex_session.query(Referencedbentity).filter(
        Referencedbentity.pmid.in_(file_pmids)).all()
    temp_matching_refs = [str(d.pmid) for d in matching_refs]
    invalid_refs = [d for d in file_pmids if d not in temp_matching_refs]
    if len(invalid_refs):
        raise ValueError('Invalid PMID: ' + ', '.join(invalid_refs) +
                         '. Must be a pipe-separated list of PMIDs from SGD.')
    # update
    receipt_entries = []
    locus_names_ids = nex_session.query(Locusdbentity.display_name,
                                        Locusdbentity.sgdid).all()
    inserts = 0
    updates = 0
    for i, val in enumerate(copied):
        if i != 0:
            file_id = val[0]
            file_summary_type = val[1]
            file_summary_val = val[2]
            file_summary_html = link_gene_names(file_summary_val,
                                                locus_names_ids)
            gene = nex_session.query(Locusdbentity).filter_by(
                format_name=file_id).one_or_none()
            summaries = nex_session.query(
                Locussummary.summary_type, Locussummary.summary_id,
                Locussummary.html, Locussummary.date_created).filter_by(
                    locus_id=gene.dbentity_id,
                    summary_type=file_summary_type).all()
            # update an existing summary or insert a new one
            summary = None
            if len(summaries):
                summary = summaries[0]
                nex_session.query(Locussummary).filter_by(
                    summary_id=summary.summary_id).update({
                        'text': file_summary_val,
                        'html': file_summary_html
                    })
                updates += 1
            else:
                new_summary = Locussummary(locus_id=gene.dbentity_id,
                                           summary_type=file_summary_type,
                                           text=file_summary_val,
                                           html=file_summary_html,
                                           created_by=username,
                                           source_id=SGD_SOURCE_ID)
                nex_session.add(new_summary)
                inserts += 1
                summary = nex_session.query(
                    Locussummary.summary_type, Locussummary.summary_id,
                    Locussummary.html, Locussummary.date_created).filter_by(
                        locus_id=gene.dbentity_id,
                        summary_type=file_summary_type).all()[0]
            # add LocussummaryReference(s)
            if len(val) == 4:
                pmids = val[3].replace(' ', '')
                if len(pmids):
                    for _i, p in enumerate(re.split(r'\||,', pmids)):
                        matching_ref = [
                            x for x in matching_refs if x.pmid == int(p)
                        ][0]
                        summary_id = summary.summary_id
                        reference_id = matching_ref.dbentity_id
                        order = _i + 1
                        # look for a matching LocussummaryReference
                        matching_locussummary_refs = nex_session.query(
                            LocussummaryReference).filter_by(
                                summary_id=summary_id,
                                reference_id=reference_id).all()
                        if len(matching_locussummary_refs):
                            nex_session.query(
                                LocussummaryReference).filter_by(
                                    summary_id=summary_id,
                                    reference_id=reference_id).update(
                                        {'reference_order': order})
                        else:
                            new_locussummaryref = LocussummaryReference(
                                summary_id=summary_id,
                                reference_id=reference_id,
                                reference_order=order,
                                source_id=SGD_SOURCE_ID,
                                created_by=username)
                            nex_session.add(new_locussummaryref)
            # add receipt
            summary_type_url_segment = file_summary_type.lower()
            if summary_type_url_segment not in ['phenotype', 'regulation']:
                summary_type_url_segment = ''
            preview_url = ('/locus/' + gene.sgdid + '/' +
                           summary_type_url_segment)
            receipt_entries.append({
                'category': 'locus',
                'href': preview_url,
                'name': gene.display_name,
                'type': file_summary_type,
                'value': file_summary_val
            })
    transaction.commit()
    nex_session.close()
    return {
        'inserts': inserts,
        'updates': updates,
        'entries': receipt_entries
    }
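
# This list-based variant expects plain csv.reader rows (lists), including
# the literal header row, rather than the DictReader rows used above; the
# file name, session, and username are hypothetical placeholders.
#
#   import csv
#
#   with open('summaries.tsv') as fh:
#       rows = csv.reader(fh, delimiter='\t')
#       result = validate_file_content_and_process(rows, nex_session,
#                                                  'CURATOR_X')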