def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate a hostseqprep node from `record`, validate and save it.

    Returns the node on successful save, False when save() fails, and
    None (falsy) when the node fails validation.
    """
    log.info("in validate/save: " + node_type)
    fieldnames = get_field_header(data_file_name)
    write_csv_headers(data_file_name, fieldnames=fieldnames)

    file_base = str(record['sample_name_id']) + '.hostseqprep'
    node.study = 'prediabetes'
    node.comment = file_base
    node.prepared_by = 'Varsha Rao and Reza Sailani'
    node.sequencing_contact = 'Varsha Rao and Reza Sailani'
    node.sequencing_center = 'Stanford University'
    node.format = 'fastq'
    node.format_doc = 'https://en.wikipedia.org/wiki/' + str(node.format)
    node.exp_length = 0  # record['exp_length'] not tracked
    node.local_file = file_base
    node.storage_duration = 1
    node.lib_layout = record['lib_layout']
    node.lib_selection = record['lib_selection']
    node.ncbi_taxon_id = '9606'
    node.prep_id = record['prep_id']
    # NOTE(review): the 'sample name' tag is built from visit_id, not
    # sample_name_id -- looks like a copy/paste slip; confirm upstream.
    node.tags = list_tags(node.tags,
                          'sample name: ' + record['visit_id'],
                          'visit id: ' + record['visit_id'],
                          'subject id: ' + record['rand_subject_id'],
                          'file prefix: ' + record['prep_id'],
                          'file name: ' + file_base,
                          )
    links = {'prepared_from': [parent_id]}
    log.debug('parent_id: ' + str(links))
    node.links = links

    fieldnames = get_field_header(data_file_name)
    if node.is_valid():
        if node.save():
            write_out_csv(data_file_name + '_submitted.csv',
                          fieldnames=fieldnames, values=[record])
            return node
        write_out_csv(data_file_name + '_unsaved_records.csv',
                      fieldnames=fieldnames, values=[record])
        return False
    write_out_csv(data_file_name + '_invalid_records.csv',
                  fieldnames=fieldnames, values=[record])
    # falls through: returns None (falsy) for invalid records
    log.error("Invalid {}!\n\t{}".format(node_type, str(node.validate())))
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate a raw RNAseq sequence node, validate it, then save.

    Non-consented records get a placeholder checksum, zero size and the
    private_files flag instead of an attached file.

    Returns the node on success, False when save() fails, None (falsy)
    when validation fails.
    """
    log.info("in validate/save: " + node_type)
    fieldnames = get_field_header(data_file_name)
    write_csv_headers(data_file_name, fieldnames=fieldnames)

    local_file_name = os.path.basename(record['local_file'])
    node.comment = local_file_name
    node.study = 'prediabetes'
    node.sequence_type = 'nucleotide'
    node.seq_model = record['seq_model']
    node.format = 'fastq'
    node.format_doc = 'https://en.wikipedia.org/wiki/FASTQ_format'
    node.exp_length = 0  # record['exp_length'] not tracked
    if record['consented'] == 'YES':
        node.local_file = record['local_file']
        node.checksums = {'md5': record['md5'], 'sha256': record['sha256']}
        node.size = int(record['size'])
    else:
        # withhold file details for non-consented participants
        node.private_files = True
        node.checksums = {'md5': '00000000000000000000000000000000'}
        node.size = 0
    node.tags = list_tags(
        'sequence type: ' + 'RNAseq',
        'jaxid (sample): ' + record['jaxid_sample'],
        'sample name: ' + record['sample_name_id'],
        'body site: ' + record['body_site'],
        'subject id: ' + record['rand_subject_id'],
        'study: ' + 'prediabetes',
        'prep_id:' + record['prep_id'],
        'file name: ' + local_file_name,
    )
    links = {'sequenced_from': [parent_id]}
    log.debug('parent_id: ' + str(links))
    node.links = links

    fieldnames = get_field_header(data_file_name)
    if node.is_valid():
        if node.save():
            write_out_csv(data_file_name + '_submitted.csv',
                          fieldnames=fieldnames, values=[record])
            return node
        write_out_csv(data_file_name + '_unsaved_records.csv',
                      fieldnames=fieldnames, values=[record])
        return False
    write_out_csv(data_file_name + '_invalid_records.csv',
                  fieldnames=fieldnames, values=[record])
    # falls through: returns None (falsy) for invalid records
    log.error("Invalid {}!\n\t{}".format(node_type, str(node.validate())))
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate a cleaned-file node from `record`, validate and save.

    Consented records carry the real file, checksums and size; the rest
    are marked private with placeholder checksum data.

    Returns the node when saved, otherwise False.
    """
    fieldnames = get_field_header(data_file_name)
    write_csv_headers(data_file_name, fieldnames=fieldnames)

    clean_name = os.path.basename(record['local_file_clean'])
    node.study = 'prediabetes'
    node.comment = clean_name
    node.format = record['format']  # only 'fasta', 'fastq' allowed!
    node.format_doc = ('https://en.wikipedia.org/wiki/'
                       + record['format'].upper() + '_format')
    if record['consented'] == 'YES':
        node.local_file = record['local_file_clean']
        node.checksums = {'md5': record['clean_md5'],
                          'sha256': record['clean_sha256']}
        node.size = int(record['clean_size'])
    else:
        # non-consented: no file attached, placeholder checksum/size
        node.private_files = True
        node.checksums = {'md5': '00000000000000000000000000000000'}
        node.size = 1
    node.tags = list_tags(
        'study: prediabetes',
        'subject id: ' + record['rand_subject_id'],
        'sample name: ' + record['sample_name_id'],
        'body site: ' + record['body_site'],
        'prep_id:' + record['prep_id'],
        'raw_file_id: ' + record['raw_file_id'],
    )
    log.debug('parent_id: ' + str(parent_id))
    node.links = {'computed_from': [parent_id]}

    if not node.is_valid():
        problems = str(node.validate())
        log.error("Invalid node {}!\t\t{}".format(node_type, problems))
        write_out_csv(data_file_name + '_invalid_records.csv',
                      fieldnames=fieldnames, values=[record])
        return False
    if node.save():
        log.info('node saved: ' + str(node.comment))
        write_out_csv(data_file_name + '_submitted.csv',
                      fieldnames=fieldnames, values=[record])
        return node
    log.info('node NOT saved: ' + str(node.comment))
    write_out_csv(data_file_name + '_unsaved_records.csv',
                  fieldnames=fieldnames, values=[record])
    return False
def submit(data_file, id_tracking_file=node_tracking_file): log.info('Starting submission of %ss.', node_type) nodes = [] csv_fieldnames = get_field_header(data_file) write_csv_headers(data_file,fieldnames=csv_fieldnames) for record in load_data(data_file): # if record['consented'] == 'YES' \ # and record['visit_number'] != 'UNK': #if record['visit_number'] != 'UNK': # use of 'UNK' = hack workaround for unreconciled visit list log.info('\n...next record...') try: log.debug('data record: '+str(record)) # node-specific variables: load_search_field = 'visit_id' internal_id = record['DCC_VISIT_IDS'] parent_internal_id = record['rand_patient_id'] ##Text ID to find the parent and get back OSDF ID grand_parent_internal_id = 'prediabetes' parent_id = get_parent_node_id( id_tracking_file, parent_type, parent_internal_id) # grand_parent_id = get_parent_node_id( # id_tracking_file, grand_parent_type, grand_parent_internal_id) if parent_id: node_is_new = False # set to True if newbie node = load(internal_id, load_search_field) if not getattr(node, load_search_field): log.debug('loaded node newbie...') node_is_new = True saved = validate_record(parent_id, node, record, data_file_name=data_file) if saved: header = settings.node_id_tracking.id_fields saved_name = getattr(saved, load_search_field) vals = values_to_node_dict( [[node_type.lower(), saved_name, saved.id, parent_type.lower(), parent_internal_id, parent_id, get_cur_datetime()]], header ) nodes.append(vals) if node_is_new: write_out_csv(id_tracking_file, fieldnames=get_field_header(id_tracking_file), values=vals) else: log.error('No parent_id found for %s', parent_internal_id) except Exception, e: log.exception(e) raise e
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate a 16S DNA prep node from `record`, validate and save.

    Returns the node on success, False when save() fails, None (falsy)
    when validation fails.
    """
    log.info("in validate/save: " + node_type)
    fieldnames = get_field_header(data_file_name)
    write_csv_headers(data_file_name, fieldnames=fieldnames)

    node.comment = record['prep_id']
    node.frag_size = 301  # goal size
    node.lib_layout = 'paired 301bp'
    node.lib_selection = ''
    node.mimarks = generate_mimarks(record)
    # taxon ids:
    #   ST: http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=408170
    #   NS: http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=1131769
    if 'stool' == record['body_site']:
        node.ncbi_taxon_id = '408170'
    else:
        node.ncbi_taxon_id = '1131769'  # nasal
    node.prep_id = record['prep_id']
    node.sequencing_center = 'Jackson Laboratory for Genomic Medicine'
    node.sequencing_contact = 'George Weinstock'
    node.storage_duration = 2112
    if record['jaxid_library']:
        library_tag = 'jaxid (library): ' + record['jaxid_library']
    else:
        library_tag = 'jaxid (library): unknown'
    node.tags = list_tags(node.tags,
                          'jaxid (sample): ' + record['jaxid_sample'],
                          library_tag,
                          'visit id: ' + record['visit_id'],
                          'subject id: ' + record['rand_subject_id'],
                          'study: prediabetes',
                          'file prefix: ' + record['prep_id'],
                          )
    links = {'prepared_from': [parent_id]}
    log.debug('parent_id: ' + str(links))
    node.links = links

    fieldnames = get_field_header(data_file_name)
    if node.is_valid():
        if node.save():
            write_out_csv(data_file_name + '_submitted.csv',
                          fieldnames=fieldnames, values=[record])
            return node
        write_out_csv(data_file_name + '_unsaved_records.csv',
                      fieldnames=fieldnames, values=[record])
        return False
    write_out_csv(data_file_name + '_invalid_records.csv',
                  fieldnames=fieldnames, values=[record])
    # falls through: returns None (falsy) for invalid records
    log.error("Invalid {}!\n\t{}".format(node_type, str(node.validate())))
def submit(data_file, id_tracking_file=node_tracking_file): log.info('Starting submission of %ss.', node_type) nodes = [] csv_fieldnames = get_field_header(data_file) write_csv_headers(data_file,fieldnames=csv_fieldnames) for record in load_data(data_file): log.info('...next record...') try: log.debug('data record: '+str(record)) # node-specific variables: load_search_field = 'local_file' internal_id = os.path.basename(record[load_search_field]) parent_internal_id = record['prep_id'] grand_parent_internal_id = record['visit_id'] parent_id = get_parent_node_id( id_tracking_file, parent_type, parent_internal_id) log.debug('matched parent_id: %s', parent_id) if parent_id: node_is_new = False # set to True if newbie node = load(internal_id, load_search_field) if not getattr(node, load_search_field): log.debug('loaded node newbie...') node_is_new = True saved = validate_record(parent_id, node, record, data_file_name=data_file) if saved: # load_search_field = 'urls' header = settings.node_id_tracking.id_fields if record['consented'] == 'YES': saved_name = os.path.basename(getattr(saved, load_search_field)) else: saved_name = '-'.join([getattr(saved, 'comment'), 'private_file']) vals = values_to_node_dict( [[node_type.lower(), saved_name, saved.id, parent_type.lower(), parent_internal_id, parent_id, get_cur_datetime()]], header ) nodes.append(vals) if node_is_new: write_out_csv(id_tracking_file, fieldnames=get_field_header(id_tracking_file), values=vals) else: log.error('No parent_id found for %s', parent_internal_id) except Exception, e: log.exception(e) raise e
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate a raw sequence-set node from `record`, validate, save.

    Returns the node on success, False when save() fails, None (falsy)
    when validation fails.
    """
    log.info("in validate/save: " + node_type)
    fieldnames = get_field_header(data_file_name)
    write_csv_headers(data_file_name, fieldnames=fieldnames)

    node.study = 'prediabetes'
    node.comment = record['local_uri']
    node.prepared_by = record['sequencing_contact']
    node.sequence_type = 'nucleotide'
    node.format = 'fastq'
    node.format_doc = 'https://en.wikipedia.org/wiki/'
    node.exp_length = 0  # record['exp_length'] not tracked
    # NOTE(review): local_file is assigned a one-element list, and the
    # 'sample name' tag is built from visit_id -- both look suspect; confirm.
    node.local_file = [record['local_uri']]
    node.tags = list_tags(node.tags,
                          'sample name: ' + record['visit_id'],
                          'visit id: ' + record['visit_id'],
                          'subject id: ' + record['rand_subject_id'],
                          'study: prediabetes',
                          'file prefix: ' + record['prep_id'],
                          'file name: ' + record['local_uri'],
                          )
    node.lib_layout = record['lib_layout']
    node.lib_selection = record['lib_selection']
    node.ncbi_taxon_id = record['ncbi_taxon_id']
    node.prep_id = record['prep_id']
    links = {'sequenced_from': [parent_id]}
    log.debug('parent_id: ' + str(links))
    node.links = links

    fieldnames = get_field_header(data_file_name)
    if node.is_valid():
        if node.save():
            write_out_csv(data_file_name + '_submitted.csv',
                          fieldnames=fieldnames, values=[record])
            return node
        write_out_csv(data_file_name + '_unsaved_records.csv',
                      fieldnames=fieldnames, values=[record])
        return False
    write_out_csv(data_file_name + '_invalid_records.csv',
                  fieldnames=fieldnames, values=[record])
    # falls through: returns None (falsy) for invalid records
    log.error("Invalid {}!\n\t{}".format(node_type, str(node.validate())))
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate a trimmed/cleaned sequence-set node, validate and save.

    Returns the node on success, False when save() fails, None (falsy)
    when validation fails.
    """
    fieldnames = get_field_header(data_file_name)
    write_csv_headers(data_file_name, fieldnames=fieldnames)

    node.study = 'prediabetes'
    node.comment = (record['prep_id']
                    + ' ... Quality trimmed, cleaned, '
                    + 'dehosted, converted fastq to fasta.')
    node.format = record['format']  # only 'fasta', 'fastq' allowed!
    node.format_doc = ('https://en.wikipedia.org/wiki/'
                       + record['format'].upper() + '_format')
    node.local_file = record['local_file']
    node.size = int(record['size'])
    node.checksums = {'md5': record['md5'], 'sha256': record['sha256']}
    if record['jaxid_library']:
        library_tag = 'jaxid (library): ' + record['jaxid_library']
    else:
        library_tag = 'jaxid (library): none'
    node.tags = list_tags(node.tags,
                          'jaxid (sample): ' + record['jaxid_sample'],
                          library_tag,
                          'sample name: ' + record['sample_name_id'],
                          'body site: ' + record['body_site'],
                          'visit id: ' + record['visit_id'],
                          'subject id: ' + record['rand_subject_id'],
                          'study: prediabetes',
                          'dna_prep_id: ' + record['prep_id'],
                          'raw_file_id: ' + record['raw_file_id'],
                          )
    log.debug('parent_id: ' + str(parent_id))
    node.links = {'computed_from': [parent_id]}

    fieldnames = get_field_header(data_file_name)
    if node.is_valid():
        if node.save():
            write_out_csv(data_file_name + '_submitted.csv',
                          fieldnames=fieldnames, values=[record])
            return node
        write_out_csv(data_file_name + '_unsaved_records.csv',
                      fieldnames=fieldnames, values=[record])
        return False
    write_out_csv(data_file_name + '_invalid_records.csv',
                  fieldnames=fieldnames, values=[record])
    # falls through: returns None (falsy) for invalid records
    log.error("Invalid {}!\n\t{}".format(node_type, str(node.validate())))
def submit(data_file, id_tracking_file=node_tracking_file):
    """Submit one host_transcriptomics node per record in `data_file`.

    For each record: resolve the parent (host_seq_prep) node id from the
    tracking file, load-or-create the node, and hand it to
    validate_record() for field population, validation and saving.
    Newly created nodes are appended to `id_tracking_file`.

    Re-raises (after logging) any exception hit while processing a
    record, aborting the whole run.
    """
    log.info('Starting submission of %ss.', node_type)
    nodes = []
    csv_fieldnames = get_field_header(data_file)
    write_csv_headers(data_file, fieldnames=csv_fieldnames)
    for record in load_data(data_file):
        log.info('...next record...')
        try:
            log.debug('data record: ' + str(record))
            # node-specific variables:
            load_search_field = 'comment'
            internal_id = str(record['host_transcriptomics_id']) + '.host_transcriptomics'
            parent_internal_id = record['host_seq_prep_name_id']  # link to Host_seq_prep ID
            grand_parent_internal_id = record['sample_name_id']   # link to Sample ID
            parent_id = get_parent_node_id(
                id_tracking_file, parent_type, parent_internal_id)
            log.debug('matched parent_id: %s', parent_id)
            if parent_id:
                node_is_new = False  # set to True if newbie
                node = load(internal_id, load_search_field)
                if not getattr(node, load_search_field):
                    log.debug('loaded node newbie...')
                    node_is_new = True
                # BUG FIX: removed a leftover `import pdb; pdb.set_trace()`
                # debugging breakpoint that halted every submission run here.
                saved = validate_record(parent_id, node, record,
                                        data_file_name=data_file)
                if saved:
                    header = settings.node_id_tracking.id_fields
                    saved_name = getattr(saved, load_search_field)
                    vals = values_to_node_dict(
                        [[node_type.lower(), saved_name, saved.id,
                          parent_type.lower(), parent_internal_id, parent_id]],
                        header)
                    nodes.append(vals)
                    if node_is_new:
                        write_out_csv(id_tracking_file,
                                      fieldnames=get_field_header(id_tracking_file),
                                      values=vals)
            else:
                log.error('No parent_id found for %s', parent_internal_id)
        except Exception as e:
            log.exception(e)
            raise e
def submit(data_file, id_tracking_file=node_tracking_file): log.info('Starting submission of %ss.', node_type) nodes = [] csv_fieldnames = get_field_header(data_file) write_csv_headers(data_file, fieldnames=csv_fieldnames) for record in load_data(data_file): # check not 'unknown' jaxid, not missing visit info if len(record['visit_id']) > 0: log.debug('\n...next record...') try: log.debug('data record: '+str(record)) # Node-Specific Variables: load_search_field = 'name' internal_id = record['sample_name_id'] parent_internal_id = record['visit_id'] grand_parent_internal_id = record['rand_subject_id'] parent_id = get_parent_node_id( id_tracking_file, parent_type, parent_internal_id) node_is_new = False # set to True if newbie node = load(internal_id, load_search_field) if not getattr(node, load_search_field): log.debug('loaded node newbie...') node_is_new = True saved = validate_record(parent_id, node, record, data_file_name=data_file) if saved: header = settings.node_id_tracking.id_fields saved_name = getattr(saved, load_search_field) vals = values_to_node_dict( [[node_type.lower(), saved_name, saved.id, parent_type.lower(), parent_internal_id, parent_id]], header ) nodes.append(vals) if node_is_new: write_out_csv(id_tracking_file, fieldnames=get_field_header(id_tracking_file), values=vals) except Exception, e: log.exception(e) raise e else: write_out_csv(data_file+'_records_no_submit.csv', fieldnames=record.keys(), values=[record,])
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate a host transcriptomics sequence node, validate and save.

    Returns the node on success, False when save() fails, None (falsy)
    when validation fails.
    """
    log.info("in validate/save: " + node_type)
    csv_fieldnames = get_field_header(data_file_name)
    write_csv_headers(data_file_name, fieldnames=csv_fieldnames)
    node.study = 'prediabetes'
    node.comment = record['host_transcriptomics_id']
    node.sequence_type = 'nucleotide'
    node.seq_model = ' '
    node.format = 'fastq'
    node.format_doc = 'https://en.wikipedia.org/wiki/FASTQ_format'
    node.exp_length = 0  # record['exp_length'] not tracked
    # BUG FIX: `urls` was assigned a set literal ({...}); url fields are
    # lists of strings (a set is unordered and not JSON-serializable),
    # matching the sibling validate_record()s in this file.
    node.urls = [record['local_file_path']]
    node.checksums = {'md5': record['md5'], 'sha256': record['sha']}
    node.size = int(record['SIZE'])
    parent_link = {'sequenced_from': [parent_id]}
    log.debug('parent_id: ' + str(parent_link))
    node.links = parent_link
    csv_fieldnames = get_field_header(data_file_name)
    if not node.is_valid():
        write_out_csv(data_file_name + '_invalid_records.csv',
                      fieldnames=csv_fieldnames, values=[record, ])
        invalidities = node.validate()
        err_str = "Invalid {}!\n\t{}".format(node_type, str(invalidities))
        log.error(err_str)
        # falls through: returns None (falsy) for invalid records
    elif node.save():
        write_out_csv(data_file_name + '_submitted.csv',
                      fieldnames=csv_fieldnames, values=[record, ])
        return node
    else:
        write_out_csv(data_file_name + '_unsaved_records.csv',
                      fieldnames=csv_fieldnames, values=[record, ])
        return False
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate a metabolome node from `record`, validate and save it.

    Returns the node on success, False when save() fails, None (falsy)
    when validation fails.
    """
    csv_fieldnames = get_field_header(data_file_name)
    write_csv_headers(data_file_name, fieldnames=csv_fieldnames)
    node.study = 'prediabetes'
    node.comment = record['sample_name_id'] + ".metabolome"
    node.format = record['format']  # FIX TO HANDLE mzXML files
    node.format_doc = 'https://en.wikipedia.org/wiki/Mass_spectrometry_data_format'
    node.subtype = 'host'
    node.checksums = {'md5': record['MD5Sum'], 'sha256': record['SHA256']}
    node.local_file = record['url']
    # BUG FIX: removed a leftover `import pdb; pdb.set_trace()` breakpoint
    # that stopped every record at an interactive prompt.
    log.debug('parent_id: ' + str(parent_id))
    node.links = {'derived_from': [parent_id]}
    csv_fieldnames = get_field_header(data_file_name)
    if not node.is_valid():
        write_out_csv(data_file_name + '_invalid_records.csv',
                      fieldnames=csv_fieldnames, values=[record, ])
        invalidities = node.validate()
        err_str = "Invalid {}!\n\t{}".format(node_type, str(invalidities))
        log.error(err_str)
        # falls through: returns None (falsy) for invalid records
    elif node.save():
        write_out_csv(data_file_name + '_submitted.csv',
                      fieldnames=csv_fieldnames, values=[record, ])
        return node
    else:
        write_out_csv(data_file_name + '_unsaved_records.csv',
                      fieldnames=csv_fieldnames, values=[record, ])
        return False
def submit(data_file, id_tracking_file=node_tracking_file):
    """Submit one proteome node per record in `data_file`.

    The parent (hostassayprep) node id is resolved from the tracking
    file, then validate_record() populates, validates and saves the
    node.  New nodes are appended to `id_tracking_file`.

    Re-raises (after logging) any exception hit while processing a record.
    """
    log.info('Starting submission of %ss.', node_type)
    nodes = []
    csv_fieldnames = get_field_header(data_file)
    write_csv_headers(data_file, fieldnames=csv_fieldnames)
    for record in load_data(data_file):
        log.info('\n...next record...')
        try:
            log.debug('data record: ' + str(record))
            # node-specific variables:
            load_search_field = 'comment'
            internal_id = record['sample_name_id'] + '.proteome'
            parent_internal_id = record['sample_name_id'] + '.hostassayprep'
            grand_parent_internal_id = record['visit_id']
            parent_id = get_parent_node_id(
                id_tracking_file, parent_type, parent_internal_id)
            # NOTE(review): unlike sibling submit()s there is no
            # `if parent_id:` guard here; a missing parent flows into
            # validate_record and fails validation -- confirm intended.
            node_is_new = False  # set to True if newbie
            node = load(internal_id, load_search_field)
            if not getattr(node, load_search_field):
                log.debug('loaded node newbie...')
                node_is_new = True
            # BUG FIX: removed a leftover `import pdb; pdb.set_trace()`
            # debugging breakpoint that halted every submission run here.
            saved = validate_record(parent_id, node, record,
                                    data_file_name=data_file)
            if saved:
                header = settings.node_id_tracking.id_fields
                saved_name = getattr(saved, load_search_field)
                vals = values_to_node_dict(
                    [[node_type.lower(), saved_name, saved.id,
                      parent_type.lower(), parent_internal_id, parent_id]],
                    header)
                nodes.append(vals)
                if node_is_new:
                    write_out_csv(id_tracking_file,
                                  fieldnames=get_field_header(id_tracking_file),
                                  values=vals)
        except Exception as e:
            log.exception(e)
            raise e
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate a sample node from `record`, validate and save it.

    Returns the node on success, False when save() fails, None (falsy)
    when validation fails.
    """
    log.info("in validate/save: " + node_type)
    fieldnames = get_field_header(data_file_name)
    write_csv_headers(data_file_name, fieldnames=fieldnames)

    node.name = record['sample_name_id']
    node.body_site = record['body_site'].lower()
    node.fma_body_site = record['fma_body_site']
    node.mixs = generate_mixs(record)
    node.tags = list_tags(node.tags,
                          'stanford_id: ' + record['sample_name_id'],
                          'visit id: ' + record['visit_id'],
                          'subject id: ' + record['rand_subject_id'],
                          'study: ' + 'prediabetes',
                          'sub_study: ' + record['sub_study'],
                          'visit type: ' + record['visit_type']
                          )
    links = {'collected_during': [parent_id]}
    log.debug('parent_id: ' + str(parent_id))
    node.links = links

    fieldnames = get_field_header(data_file_name)
    if node.is_valid():
        if node.save():
            write_out_csv(data_file_name + '_submitted.csv',
                          fieldnames=fieldnames, values=[record])
            return node
        write_out_csv(data_file_name + '_unsaved_records.csv',
                      fieldnames=fieldnames, values=[record])
        return False
    write_out_csv(data_file_name + '_invalid_records.csv',
                  fieldnames=fieldnames, values=[record])
    # falls through: returns None (falsy) for invalid records
    log.error("Invalid {}!\n\t{}".format(node_type, str(node.validate())))
def submit(data_file, id_tracking_file=node_tracking_file):
    """Submit one cleaned-file node per record with a non-empty local_file.

    The parent (raw file) node id is resolved from the tracking file,
    then validate_record() populates, validates and saves the node.
    New nodes are appended to `id_tracking_file`.

    Re-raises (after logging) any exception hit while processing a record.
    """
    log.info('Starting submission of %ss.', node_type)
    nodes = []
    csv_fieldnames = get_field_header(data_file)
    write_csv_headers(data_file, fieldnames=csv_fieldnames)
    for record in load_data(data_file):
        log.info('\n...next record...')
        try:
            log.debug('data record: ' + str(record))
            if record['local_file'] != '':
                load_search_field = 'local_file'
                internal_id = os.path.basename(record['local_file'])
                parent_internal_id = record['raw_file_id']
                grand_parent_internal_id = record['prep_id']
                parent_id = get_parent_node_id(
                    id_tracking_file, parent_type, parent_internal_id)
                # NOTE(review): unlike sibling submit()s there is no
                # `if parent_id:` guard here; a missing parent flows into
                # validate_record and fails validation -- confirm intended.
                node_is_new = False  # set to True if newbie
                node = load(internal_id, load_search_field)
                if not getattr(node, load_search_field):
                    log.debug('loaded node newbie...')
                    node_is_new = True
                saved = validate_record(parent_id, node, record,
                                        data_file_name=data_file)
                if saved:
                    header = settings.node_id_tracking.id_fields
                    # BUG FIX: `saved_name` was never assigned here, so every
                    # successful save crashed with NameError.  Use the record's
                    # internal id (basename of the local file) -- the same
                    # value the node was loaded/searched by.
                    saved_name = internal_id
                    vals = values_to_node_dict(
                        [[node_type.lower(), saved_name, saved.id,
                          parent_type.lower(), parent_internal_id, parent_id]],
                        header)
                    nodes.append(vals)
                    if node_is_new:
                        write_out_csv(id_tracking_file,
                                      fieldnames=get_field_header(id_tracking_file),
                                      values=vals)
        except Exception as e:
            log.exception(e)
            raise e
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate a visit node from `record`, validate and save it.

    Returns the node on success, False when save() fails, None (falsy)
    when validation fails.
    """
    log.info("in validate/save: " + node_type)
    fieldnames = get_field_header(data_file_name)
    write_csv_headers(data_file_name, fieldnames=fieldnames)

    node.visit_id = record['visit_id']
    node.visit_number = int(record['visit_number'])
    node.interval = int(record['interval'])
    node.tags = list_tags(node.tags,
                          'rand_subject_id: ' + record['rand_subject_id'],
                          'study: prediabetes',
                          )
    log.debug('parent_id: ' + str(parent_id))
    node.links = {'by': [parent_id]}

    fieldnames = get_field_header(data_file_name)
    if node.is_valid():
        if node.save():
            write_out_csv(data_file_name + '_submitted.csv',
                          fieldnames=fieldnames, values=[record])
            return node
        write_out_csv(data_file_name + '_unsaved_records.csv',
                      fieldnames=fieldnames, values=[record])
        return False
    write_out_csv(data_file_name + '_invalid_records.csv',
                  fieldnames=fieldnames, values=[record])
    # falls through: returns None (falsy) for invalid records
    log.error("Invalid {}!\n\t{}".format(node_type, str(node.validate())))
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate a sample node (DCC-visit-id variant), validate and save.

    Outcome rows go to `<data_file_name minus extension>.submitted.csv`,
    `.unsaved.csv` or `.invalid_records.csv` side files.

    Returns the node on success, False when save() fails, None (falsy)
    when validation fails.
    """
    log.info("in validate/save: " + node_type)
    fieldnames = get_field_header(data_file_name)

    node.name = record['sample_name_id']
    node.body_site = record['body_site'].lower()
    node.fma_body_site = record['fma_body_site']
    node.mixs = generate_mixs(record)
    node.tags = list_tags(
        'sample id: ' + record['sample_name_id'],
        'visit id: ' + record['DCC_VISIT_IDS'],
        'subject id: ' + record['rand_patient_id'],
        'study: prediabetes',
    )
    links = {'collected_during': [parent_id]}
    log.debug('parent_id: ' + str(links))
    node.links = links

    if node.is_valid():
        if node.save():
            out_file = data_file_name[:-4] + '.submitted.csv'
            write_csv_headers(out_file, fieldnames=fieldnames)
            write_out_csv(out_file, fieldnames=fieldnames, values=[record])
            return node
        out_file = data_file_name[:-4] + '.unsaved.csv'
        write_csv_headers(out_file, fieldnames=fieldnames)
        write_out_csv(out_file, fieldnames=fieldnames, values=[record])
        return False
    out_file = data_file_name[:-4] + '.invalid_records.csv'
    write_csv_headers(out_file, fieldnames=fieldnames)
    write_out_csv(out_file, fieldnames=fieldnames, values=[record])
    # falls through: returns None (falsy) for invalid records
    log.error("Invalid {}!\n\t{}".format(node_type, str(node.validate())))
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate a subject node from `record`, validate and save it.

    Outcome rows go to `<data_file_name minus extension>.submitted.csv`,
    `.unsaved.csv` or `.invalid_records.csv` side files.

    Returns the node on success, False when save() fails, None (falsy)
    when validation fails.
    """
    log.info("in validate/save: " + node_type)
    fieldnames = get_field_header(data_file_name)

    node.study = 'prediabetes'
    node.subtype = 'prediabetes'
    node.tags = list_tags(
        'Race: ' + get_race(record['race_code']),
        'age: ' + record['age'],
        'gender: ' + get_gender(record['gender']),
    )
    links = {'associated_with': [parent_id]}
    log.debug('parent_id: ' + str(links))
    node.links = links

    if node.is_valid():
        if node.save():
            out_file = data_file_name[:-4] + '.submitted.csv'
            write_csv_headers(out_file, fieldnames=fieldnames)
            write_out_csv(out_file, fieldnames=fieldnames, values=[record])
            return node
        out_file = data_file_name[:-4] + '.unsaved.csv'
        write_csv_headers(out_file, fieldnames=fieldnames)
        write_out_csv(out_file, fieldnames=fieldnames, values=[record])
        return False
    out_file = data_file_name[:-4] + '.invalid_records.csv'
    write_csv_headers(out_file, fieldnames=fieldnames)
    write_out_csv(out_file, fieldnames=fieldnames, values=[record])
    # falls through: returns None (falsy) for invalid records
    log.error("Invalid {}!\n\t{}".format(node_type, str(node.validate())))
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate a host assay prep (proteomics) node, validate and save.

    Returns the node on success, False when save() fails, None (falsy)
    when validation fails.
    """
    log.info("in validate/save: " + node_type)
    csv_fieldnames = get_field_header(data_file_name)
    write_csv_headers(data_file_name, fieldnames=csv_fieldnames)
    node.study = 'prediabetes'
    node.comment = record['host_assay_prep_id']
    node.sample_name = record['sample_name_id']
    node.contact = 'Wenyu Zhou'
    node.center = 'Stanford'
    node.format = 'csv'
    node.format_doc = 'https://en.wikipedia.org/wiki/csv'
    node.exp_length = 0  # record['exp_length'] not tracked
    node.experiment_type = record['experiment_type']
    node.title = 'T2D Prediabetes Proteomics'
    node.prep_id = record['rand_patient_id']
    node.pride_id = 'null'
    node.storage_duration = 0
    node.protocol_steps = ''
    # BUG FIX: the species string was corrupted ('H**o sapiens (Human)');
    # restore the proper binomial name for human samples.
    node.species = 'Homo sapiens (Human)'
    node.subtype = ''
    node.tissue = 'blood'
    parent_link = {'prepared_from': [parent_id]}
    log.debug('parent_id: ' + str(parent_link))
    node.links = parent_link
    csv_fieldnames = get_field_header(data_file_name)
    if not node.is_valid():
        write_out_csv(data_file_name + '_invalid_records.csv',
                      fieldnames=csv_fieldnames, values=[record, ])
        invalidities = node.validate()
        err_str = "Invalid {}!\n\t{}".format(node_type, str(invalidities))
        log.error(err_str)
        # falls through: returns None (falsy) for invalid records
    elif node.save():
        write_out_csv(data_file_name + '_submitted.csv',
                      fieldnames=csv_fieldnames, values=[record, ])
        return node
    else:
        write_out_csv(data_file_name + '_unsaved_records.csv',
                      fieldnames=csv_fieldnames, values=[record, ])
        return False
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate a SWATH proteomics node from `record`, validate, save.

    Returns the node on success, False when save() fails, None (falsy)
    when validation fails.
    """
    fieldnames = get_field_header(data_file_name)
    write_csv_headers(data_file_name, fieldnames=fieldnames)

    node.raw_url.append(record['DCC_File_Path'])
    node.comment = record['sample_name_id']
    node.checksums = {'md5': record['MD5Sum'], 'sha256': record['SHA256']}
    node.data_processing_protocol = 'Targeted Data Independent analysis'
    node.processing_method = 'Targeted Data independent analysis with OpenSwath'
    node.protocol_name = 'SWATH_Proteomics (attached)'
    node.sample_name = 'Plasma'
    node.search_engine = 'ProteinPilot Paragon database search algorithm'
    node.source = 'DuoSpray Ion Source'
    node.subtype = 'host'
    node.study = 'prediabetes'
    node.title = 'T2D Prediabetes'  # Targeted Immunoproteomics
    links = {'derived_from': [parent_id]}
    log.debug('parent_id: ' + str(links))
    node.links = links

    fieldnames = get_field_header(data_file_name)
    if node.is_valid():
        if node.save():
            write_out_csv(data_file_name + '_submitted.csv',
                          fieldnames=fieldnames, values=[record])
            return node
        write_out_csv(data_file_name + '_unsaved_records.csv',
                      fieldnames=fieldnames, values=[record])
        return False
    write_out_csv(data_file_name + '_invalid_records.csv',
                  fieldnames=fieldnames, values=[record])
    # falls through: returns None (falsy) for invalid records
    log.error("Invalid {}!\n\t{}".format(node_type, str(node.validate())))
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Update the cytokine node's fields from *record*, validate, and save.

    Returns the node after a successful save, False after a failed one, and
    None (with logging plus a side-CSV dump) when the node is invalid.
    """
    log.info("in validate/save: " + node_type)
    header_fields = get_field_header(data_file_name)
    write_csv_headers(data_file_name, fieldnames=header_fields)

    # Study constants plus the per-row values from the cytokine manifest.
    node.study = 'prediabetes'
    node.comment = record['sample_name_id'] + '.cytokine'
    node.checksums = {'md5': record['md5'], 'sha256': record['sha256']}
    node.subtype = 'prediabetes'
    node.format_doc = 'https://en.wikipedia.org/wiki/Tab-separated_values'
    node.format = 'tsv'
    node.local_file = record['FileLocation']

    linkage = {'derived_from': [parent_id]}
    log.debug('parent_id: ' + str(linkage))
    node.links = linkage

    header_fields = get_field_header(data_file_name)
    if not node.is_valid():
        # Keep the rejected row around so the load can be retried later.
        write_out_csv(data_file_name + '_invalid_records.csv',
                      fieldnames=header_fields, values=[record])
        problems = node.validate()
        log.error("Invalid {}!\n\t{}".format(node_type, str(problems)))
        return None
    if node.save():
        write_out_csv(data_file_name + '_submitted.csv',
                      fieldnames=header_fields, values=[record])
        return node
    write_out_csv(data_file_name + '_unsaved_records.csv',
                  fieldnames=header_fields, values=[record])
    return False
if valid, save, if not, return false """ log.info("in validate/save: "+node_type) csv_fieldnames = get_field_header(data_file_name) node.study = 'prediabetes' node.subtype = 'prediabetes' node.tags = list_tags('') parent_link = {'associated_with':[parent_id]} log.debug('parent_id: '+str(parent_link)) node.links = parent_link if not node.is_valid(): invalids = data_file_name[:-4]+'.invalid_records.csv' write_csv_headers(invalids, fieldnames=csv_fieldnames) write_out_csv(invalids, fieldnames=csv_fieldnames, values=[record,]) invalidities = node.validate() err_str = "Invalid {}!\n\t{}".format(node_type, str(invalidities)) log.error(err_str) # raise Exception(err_str) elif node.save(): submitted = data_file_name[:-4]+'.submitted.csv' write_csv_headers(submitted, fieldnames=csv_fieldnames) write_out_csv(submitted, fieldnames=csv_fieldnames, values=[record,]) return node else: unsaved = data_file_name[:-4]+'.unsaved.csv' write_csv_headers(unsaved, fieldnames=csv_fieldnames)