def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate ``node`` from ``record``, then validate and save it.

    The record is appended to one of three audit CSV files derived from
    ``data_file_name``: ``*_submitted.csv`` when the node is saved,
    ``*_invalid_records.csv`` when validation fails, or
    ``*_unsaved_records.csv`` when ``node.save()`` reports failure.

    Args:
        parent_id: id of the parent node, stored as the 'prepared_from' link.
        node: node object to fill in; must provide ``is_valid()``,
            ``validate()`` and ``save()``.
        record: mapping holding the values of one data row.
        data_file_name: base name used for the header lookup and for the
            audit CSV files (defaults to the module-level ``node_type``).

    Returns:
        The saved ``node`` on success, otherwise ``False``.
    """
    log.info("in validate/save: "+node_type)
    csv_fieldnames = get_field_header(data_file_name)
    write_csv_headers(data_file_name, fieldnames=csv_fieldnames)

    # The same derived name is used as comment, local file and file-name tag.
    prep_file_name = str(record['sample_name_id']) + '.hostseqprep'

    node.study = 'prediabetes'
    node.comment = prep_file_name
    node.prepared_by = 'Varsha Rao and Reza Sailani'
    node.sequencing_contact = 'Varsha Rao and Reza Sailani'
    node.sequencing_center = 'Stanford University'
    node.format = 'fastq'
    node.format_doc = 'https://en.wikipedia.org/wiki/' + str(node.format)
    node.exp_length = 0  # record['exp_length'] is not used
    node.local_file = prep_file_name
    node.storage_duration = 1
    # Checksums and size are not set for this node type.
    node.tags = list_tags(
        node.tags,
        # NOTE(review): 'sample name' is filled from visit_id, same as
        # 'visit id' below -- confirm this is intended.
        'sample name: '+record['visit_id'],
        'visit id: '+record['visit_id'],
        'subject id: '+record['rand_subject_id'],
        'file prefix: '+record['prep_id'],
        'file name: '+prep_file_name,
    )
    node.lib_layout = record['lib_layout']
    node.lib_selection = record['lib_selection']
    node.ncbi_taxon_id = '9606'
    node.prep_id = record['prep_id']

    parent_link = {'prepared_from': [parent_id]}
    log.debug('parent_id: '+str(parent_link))
    node.links = parent_link

    if not node.is_valid():
        write_out_csv(data_file_name+'_invalid_records.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        invalidities = node.validate()
        err_str = "Invalid {}!\n\t{}".format(node_type, str(invalidities))
        log.error(err_str)
        # Explicit failure value, as the docstring promises (was implicit None).
        return False
    elif node.save():
        write_out_csv(data_file_name+'_submitted.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        return node
    else:
        write_out_csv(data_file_name+'_unsaved_records.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        return False
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate ``node`` from ``record``, then validate and save it.

    The record is appended to one of three audit CSV files derived from
    ``data_file_name``: ``*_submitted.csv`` when the node is saved,
    ``*_invalid_records.csv`` when validation fails, or
    ``*_unsaved_records.csv`` when ``node.save()`` reports failure.

    Args:
        parent_id: id of the parent node, stored as the 'sequenced_from' link.
        node: node object to fill in; must provide ``is_valid()``,
            ``validate()`` and ``save()``.
        record: mapping holding the values of one data row.
        data_file_name: base name used for the header lookup and for the
            audit CSV files (defaults to the module-level ``node_type``).

    Returns:
        The saved ``node`` on success, otherwise ``False``.
    """
    log.info("in validate/save: "+node_type)
    csv_fieldnames = get_field_header(data_file_name)
    write_csv_headers(data_file_name, fieldnames=csv_fieldnames)

    local_file_name = os.path.basename(record['local_file'])
    node.comment = local_file_name
    node.study = 'prediabetes'
    node.sequence_type = 'nucleotide'
    node.seq_model = record['seq_model']
    node.format = 'fastq'
    node.format_doc = 'https://en.wikipedia.org/wiki/FASTQ_format'
    node.exp_length = 0  # record['exp_length'] is not used
    if record['consented'] == 'YES':
        node.local_file = record['local_file']
        node.checksums = {'md5': record['md5'], 'sha256': record['sha256']}
        node.size = int(record['size'])
    else:
        # Not consented: mark the files private and store a placeholder
        # checksum and size instead of the real file details.
        node.private_files = True
        node.checksums = {'md5': '00000000000000000000000000000000'}
        node.size = 0
    node.tags = list_tags(
        'sequence type: ' + 'RNAseq',
        'jaxid (sample): ' + record['jaxid_sample'],
        'sample name: ' + record['sample_name_id'],
        'body site: ' + record['body_site'],
        'subject id: ' + record['rand_subject_id'],
        'study: ' + 'prediabetes',
        'prep_id:' + record['prep_id'],
        'file name: ' + local_file_name,
    )
    parent_link = {'sequenced_from': [parent_id]}
    log.debug('parent_id: '+str(parent_link))
    node.links = parent_link

    if not node.is_valid():
        write_out_csv(data_file_name+'_invalid_records.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        invalidities = node.validate()
        err_str = "Invalid {}!\n\t{}".format(node_type, str(invalidities))
        log.error(err_str)
        # Explicit failure value, as the docstring promises (was implicit None).
        return False
    elif node.save():
        write_out_csv(data_file_name+'_submitted.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        return node
    else:
        write_out_csv(data_file_name+'_unsaved_records.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        return False
Пример #3
0
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Fill ``node`` from ``record``, then validate and save it.

    Depending on the outcome the record is appended to
    ``<data_file_name>_invalid_records.csv``,
    ``<data_file_name>_submitted.csv`` or
    ``<data_file_name>_unsaved_records.csv``.

    Returns the saved node on success and ``False`` otherwise.
    """
    header_fields = get_field_header(data_file_name)
    write_csv_headers(data_file_name, fieldnames=header_fields)

    clean_file_name = os.path.basename(record['local_file_clean'])
    node.study = 'prediabetes'
    node.comment = clean_file_name

    # only 'fasta', 'fastq' allowed!
    node.format = record['format']
    node.format_doc = ''.join(['https://en.wikipedia.org/wiki/',
                               record['format'].upper(),
                               '_format'])

    consented = record['consented'] == 'YES'
    if not consented:
        # Placeholder checksum/size for files that stay private.
        node.private_files = True
        node.checksums = {'md5': '00000000000000000000000000000000'}
        node.size = 1
    else:
        node.local_file = record['local_file_clean']
        node.checksums = {'md5': record['clean_md5'],
                          'sha256': record['clean_sha256']}
        node.size = int(record['clean_size'])

    tag_values = [
        'study: prediabetes',
        'subject id: ' + record['rand_subject_id'],
        'sample name: ' + record['sample_name_id'],
        'body site: ' + record['body_site'],
        'prep_id:' + record['prep_id'],
        'raw_file_id: ' + record['raw_file_id'],
    ]
    node.tags = list_tags(*tag_values)

    log.debug('parent_id: ' + str(parent_id))
    node.links = {'computed_from': [parent_id]}

    # Guard clauses: invalid first, then saved, then the "not saved" fallback.
    if not node.is_valid():
        problems = str(node.validate())
        log.error("Invalid node {}!\t\t{}".format(node_type, problems))
        write_out_csv(data_file_name + '_invalid_records.csv',
                      fieldnames=header_fields, values=[record])
        return False

    if node.save():
        log.info('node saved: ' + str(node.comment))
        write_out_csv(data_file_name + '_submitted.csv',
                      fieldnames=header_fields, values=[record])
        return node

    log.info('node NOT saved: ' + str(node.comment))
    write_out_csv(data_file_name + '_unsaved_records.csv',
                  fieldnames=header_fields, values=[record])
    return False
Пример #4
0
def submit(data_file, id_tracking_file=node_tracking_file):
    """Submit every record of ``data_file`` as a node linked to its parent.

    For each record the parent's id is looked up in ``id_tracking_file``.
    Records whose parent cannot be found are logged and skipped. Otherwise
    the node is loaded by its 'visit_id', passed to ``validate_record`` and,
    when saved, its ids are collected; nodes that did not exist before are
    also appended to ``id_tracking_file``.

    Args:
        data_file: path of the CSV file holding the records to submit.
        id_tracking_file: path of the CSV file tracking node ids (defaults
            to the module-level ``node_tracking_file``).

    Raises:
        Exception: any error raised while processing a record is logged
            and re-raised, which stops the submission.
    """
    log.info('Starting submission of %ss.', node_type)
    nodes = []
    csv_fieldnames = get_field_header(data_file)
    write_csv_headers(data_file, fieldnames=csv_fieldnames)
    for record in load_data(data_file):
        log.info('\n...next record...')
        try:
            log.debug('data record: '+str(record))

            # node-specific variables:
            load_search_field = 'visit_id'
            internal_id = record['DCC_VISIT_IDS']
            # Text id used to find the parent and get back its OSDF id.
            parent_internal_id = record['rand_patient_id']

            parent_id = get_parent_node_id(
                id_tracking_file, parent_type, parent_internal_id)
            if not parent_id:
                log.error('No parent_id found for %s', parent_internal_id)
                continue

            node = load(internal_id, load_search_field)
            # A loaded node without a value in the search field is new.
            node_is_new = not getattr(node, load_search_field)
            if node_is_new:
                log.debug('loaded node newbie...')

            saved = validate_record(parent_id, node, record,
                                    data_file_name=data_file)
            if saved:
                header = settings.node_id_tracking.id_fields
                saved_name = getattr(saved, load_search_field)
                vals = values_to_node_dict(
                    [[node_type.lower(), saved_name, saved.id,
                      parent_type.lower(), parent_internal_id, parent_id,
                      get_cur_datetime()]],
                    header
                    )
                nodes.append(vals)
                if node_is_new:
                    write_out_csv(id_tracking_file,
                                  fieldnames=get_field_header(id_tracking_file),
                                  values=vals)

        except Exception as e:
            log.exception(e)
            # Bare raise keeps the original traceback.
            raise
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate ``node`` from ``record``, then validate and save it.

    The record is appended to one of three audit CSV files derived from
    ``data_file_name``: ``*_submitted.csv`` when the node is saved,
    ``*_invalid_records.csv`` when validation fails, or
    ``*_unsaved_records.csv`` when ``node.save()`` reports failure.

    Args:
        parent_id: id of the parent node, stored as the 'prepared_from' link.
        node: node object to fill in; must provide ``is_valid()``,
            ``validate()`` and ``save()``.
        record: mapping holding the values of one data row.
        data_file_name: base name used for the header lookup and for the
            audit CSV files (defaults to the module-level ``node_type``).

    Returns:
        The saved ``node`` on success, otherwise ``False``.
    """
    log.info("in validate/save: "+node_type)
    csv_fieldnames = get_field_header(data_file_name)
    write_csv_headers(data_file_name, fieldnames=csv_fieldnames)

    node.comment = record['prep_id']
    node.frag_size = 301  # goal size
    node.lib_layout = 'paired 301bp'
    node.lib_selection = ''
    node.mimarks = generate_mimarks(record)
    # Taxon id depends on body site; anything other than stool is
    # treated as nasal.
    # ST: http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=408170
    # NS: http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=1131769
    if record['body_site'] == 'stool':
        node.ncbi_taxon_id = '408170'
    else:
        node.ncbi_taxon_id = '1131769'
    node.prep_id = record['prep_id']
    node.sequencing_center = 'Jackson Laboratory for Genomic Medicine'
    node.sequencing_contact = 'George Weinstock'
    node.storage_duration = 2112

    # An empty library jaxid is tagged as 'unknown'.
    if record['jaxid_library']:
        library_tag = 'jaxid (library): '+record['jaxid_library']
    else:
        library_tag = 'jaxid (library): unknown'
    node.tags = list_tags(
        node.tags,
        'jaxid (sample): '+record['jaxid_sample'],
        library_tag,
        'visit id: '+record['visit_id'],
        'subject id: '+record['rand_subject_id'],
        'study: prediabetes',
        'file prefix: '+record['prep_id'],
    )
    parent_link = {'prepared_from': [parent_id]}
    log.debug('parent_id: '+str(parent_link))
    node.links = parent_link

    if not node.is_valid():
        write_out_csv(data_file_name+'_invalid_records.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        invalidities = node.validate()
        err_str = "Invalid {}!\n\t{}".format(node_type, str(invalidities))
        log.error(err_str)
        # Explicit failure value, as the docstring promises (was implicit None).
        return False
    elif node.save():
        write_out_csv(data_file_name+'_submitted.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        return node
    else:
        write_out_csv(data_file_name+'_unsaved_records.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        return False
def submit(data_file, id_tracking_file=node_tracking_file):
    """Submit every record of ``data_file`` as a node linked to its parent.

    For each record the parent's id is looked up in ``id_tracking_file``
    using the record's 'prep_id'. Records whose parent cannot be found are
    logged and skipped. Otherwise the node is loaded by the base name of
    its 'local_file', passed to ``validate_record`` and, when saved, its
    ids are collected; nodes that did not exist before are also appended
    to ``id_tracking_file``.

    Args:
        data_file: path of the CSV file holding the records to submit.
        id_tracking_file: path of the CSV file tracking node ids (defaults
            to the module-level ``node_tracking_file``).

    Raises:
        Exception: any error raised while processing a record is logged
            and re-raised, which stops the submission.
    """
    log.info('Starting submission of %ss.', node_type)
    nodes = []
    csv_fieldnames = get_field_header(data_file)
    write_csv_headers(data_file, fieldnames=csv_fieldnames)
    for record in load_data(data_file):
        log.info('...next record...')
        try:
            log.debug('data record: '+str(record))

            # node-specific variables:
            load_search_field = 'local_file'
            internal_id = os.path.basename(record[load_search_field])
            parent_internal_id = record['prep_id']

            parent_id = get_parent_node_id(
                id_tracking_file, parent_type, parent_internal_id)
            log.debug('matched parent_id: %s', parent_id)
            if not parent_id:
                log.error('No parent_id found for %s', parent_internal_id)
                continue

            node = load(internal_id, load_search_field)
            # A loaded node without a value in the search field is new.
            node_is_new = not getattr(node, load_search_field)
            if node_is_new:
                log.debug('loaded node newbie...')

            saved = validate_record(parent_id, node, record,
                                    data_file_name=data_file)
            if saved:
                header = settings.node_id_tracking.id_fields
                if record['consented'] == 'YES':
                    saved_name = os.path.basename(
                        getattr(saved, load_search_field))
                else:
                    # Non-consented records are tracked under a name
                    # derived from the node comment instead of the file.
                    saved_name = '-'.join(
                        [getattr(saved, 'comment'), 'private_file'])
                vals = values_to_node_dict(
                    [[node_type.lower(), saved_name, saved.id,
                      parent_type.lower(), parent_internal_id, parent_id,
                      get_cur_datetime()]],
                    header
                    )
                nodes.append(vals)
                if node_is_new:
                    write_out_csv(id_tracking_file,
                                  fieldnames=get_field_header(id_tracking_file),
                                  values=vals)

        except Exception as e:
            log.exception(e)
            # Bare raise keeps the original traceback.
            raise
Пример #7
0
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate ``node`` from ``record``, then validate and save it.

    The record is appended to one of three audit CSV files derived from
    ``data_file_name``: ``*_submitted.csv`` when the node is saved,
    ``*_invalid_records.csv`` when validation fails, or
    ``*_unsaved_records.csv`` when ``node.save()`` reports failure.

    Args:
        parent_id: id of the parent node, stored as the 'sequenced_from' link.
        node: node object to fill in; must provide ``is_valid()``,
            ``validate()`` and ``save()``.
        record: mapping holding the values of one data row.
        data_file_name: base name used for the header lookup and for the
            audit CSV files (defaults to the module-level ``node_type``).

    Returns:
        The saved ``node`` on success, otherwise ``False``.
    """
    log.info("in validate/save: "+node_type)
    csv_fieldnames = get_field_header(data_file_name)
    write_csv_headers(data_file_name, fieldnames=csv_fieldnames)

    node.study = 'prediabetes'
    node.comment = record['local_uri']
    node.prepared_by = record['sequencing_contact']
    node.sequence_type = 'nucleotide'
    node.format = 'fastq'
    # Point at the FASTQ article rather than the bare wiki root.
    node.format_doc = 'https://en.wikipedia.org/wiki/FASTQ_format'
    node.exp_length = 0  # record['exp_length'] is not used
    # NOTE(review): local_file is assigned a one-element list here --
    # confirm the node expects a list rather than a plain string.
    node.local_file = [record['local_uri']]
    # Checksums and size are not set for this node type.
    node.tags = list_tags(
        node.tags,
        # NOTE(review): 'sample name' is filled from visit_id, same as
        # 'visit id' below -- confirm this is intended.
        'sample name: '+record['visit_id'],
        'visit id: '+record['visit_id'],
        'subject id: '+record['rand_subject_id'],
        'study: prediabetes',
        'file prefix: '+record['prep_id'],
        'file name: '+record['local_uri'],
    )
    node.lib_layout = record['lib_layout']
    node.lib_selection = record['lib_selection']
    node.ncbi_taxon_id = record['ncbi_taxon_id']
    node.prep_id = record['prep_id']

    parent_link = {'sequenced_from': [parent_id]}
    log.debug('parent_id: '+str(parent_link))
    node.links = parent_link

    if not node.is_valid():
        write_out_csv(data_file_name+'_invalid_records.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        invalidities = node.validate()
        err_str = "Invalid {}!\n\t{}".format(node_type, str(invalidities))
        log.error(err_str)
        # Explicit failure value, as the docstring promises (was implicit None).
        return False
    elif node.save():
        write_out_csv(data_file_name+'_submitted.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        return node
    else:
        write_out_csv(data_file_name+'_unsaved_records.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        return False
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate ``node`` from ``record``, then validate and save it.

    The record is appended to one of three audit CSV files derived from
    ``data_file_name``: ``*_submitted.csv`` when the node is saved,
    ``*_invalid_records.csv`` when validation fails, or
    ``*_unsaved_records.csv`` when ``node.save()`` reports failure.

    Args:
        parent_id: id of the parent node, stored as the 'computed_from' link.
        node: node object to fill in; must provide ``is_valid()``,
            ``validate()`` and ``save()``.
        record: mapping holding the values of one data row.
        data_file_name: base name used for the header lookup and for the
            audit CSV files (defaults to the module-level ``node_type``).

    Returns:
        The saved ``node`` on success, otherwise ``False``.
    """
    csv_fieldnames = get_field_header(data_file_name)
    write_csv_headers(data_file_name, fieldnames=csv_fieldnames)

    node.study = 'prediabetes'
    node.comment = record['prep_id'] + ' ... Quality trimmed, cleaned, '\
        + 'dehosted, converted fastq to fasta.'
    node.format = record['format']  # only 'fasta', 'fastq' allowed!
    node.format_doc = 'https://en.wikipedia.org/wiki/' +\
        record['format'].upper() + '_format'
    node.local_file = record['local_file']
    node.size = int(record['size'])
    node.checksums = {'md5': record['md5'], 'sha256': record['sha256']}

    # An empty library jaxid is tagged as 'none'.
    if record['jaxid_library']:
        library_tag = 'jaxid (library): '+record['jaxid_library']
    else:
        library_tag = 'jaxid (library): none'
    node.tags = list_tags(
        node.tags,
        'jaxid (sample): '+record['jaxid_sample'],
        library_tag,
        'sample name: '+record['sample_name_id'],
        'body site: '+record['body_site'],
        'visit id: '+record['visit_id'],
        'subject id: '+record['rand_subject_id'],
        'study: prediabetes',
        'dna_prep_id: '+record['prep_id'],
        'raw_file_id: '+record['raw_file_id'],
    )

    log.debug('parent_id: '+str(parent_id))
    node.links = {'computed_from': [parent_id]}

    if not node.is_valid():
        write_out_csv(data_file_name+'_invalid_records.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        invalidities = node.validate()
        err_str = "Invalid {}!\n\t{}".format(node_type, str(invalidities))
        log.error(err_str)
        # Explicit failure value, as the docstring promises (was implicit None).
        return False
    elif node.save():
        write_out_csv(data_file_name+'_submitted.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        return node
    else:
        write_out_csv(data_file_name+'_unsaved_records.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        return False
def submit(data_file, id_tracking_file=node_tracking_file):
    """Submit every record of ``data_file`` as a node linked to its parent.

    For each record the parent's id is looked up in ``id_tracking_file``
    using the record's 'host_seq_prep_name_id'. Records whose parent cannot
    be found are logged and skipped. Otherwise the node is loaded by its
    'comment', passed to ``validate_record`` and, when saved, its ids are
    collected; nodes that did not exist before are also appended to
    ``id_tracking_file``.

    Args:
        data_file: path of the CSV file holding the records to submit.
        id_tracking_file: path of the CSV file tracking node ids (defaults
            to the module-level ``node_tracking_file``).

    Raises:
        Exception: any error raised while processing a record is logged
            and re-raised, which stops the submission.
    """
    log.info('Starting submission of %ss.', node_type)
    nodes = []
    csv_fieldnames = get_field_header(data_file)
    write_csv_headers(data_file, fieldnames=csv_fieldnames)
    for record in load_data(data_file):
        log.info('...next record...')
        try:
            log.debug('data record: '+str(record))

            # node-specific variables:
            load_search_field = 'comment'
            internal_id = (str(record['host_transcriptomics_id'])
                           + '.host_transcriptomics')
            # Link to the host seq prep id.
            parent_internal_id = record['host_seq_prep_name_id']

            parent_id = get_parent_node_id(
                id_tracking_file, parent_type, parent_internal_id)
            log.debug('matched parent_id: %s', parent_id)
            if not parent_id:
                log.error('No parent_id found for %s', parent_internal_id)
                continue

            node = load(internal_id, load_search_field)
            # A loaded node without a value in the search field is new.
            node_is_new = not getattr(node, load_search_field)
            if node_is_new:
                log.debug('loaded node newbie...')

            saved = validate_record(parent_id, node, record,
                                    data_file_name=data_file)
            if saved:
                header = settings.node_id_tracking.id_fields
                saved_name = getattr(saved, load_search_field)
                vals = values_to_node_dict(
                    [[node_type.lower(), saved_name, saved.id,
                      parent_type.lower(), parent_internal_id, parent_id]],
                    header
                    )
                nodes.append(vals)
                if node_is_new:
                    write_out_csv(id_tracking_file,
                                  fieldnames=get_field_header(id_tracking_file),
                                  values=vals)

        except Exception as e:
            log.exception(e)
            # Bare raise keeps the original traceback.
            raise
Пример #10
0
def submit(data_file, id_tracking_file=node_tracking_file):
    """Submit every record of ``data_file`` as a node linked to its parent.

    Records with an empty 'visit_id' are not submitted; they are appended
    to ``<data_file>_records_no_submit.csv`` instead. For the others the
    parent's id is looked up in ``id_tracking_file`` using 'visit_id', the
    node is loaded by its 'name', passed to ``validate_record`` and, when
    saved, its ids are collected; nodes that did not exist before are also
    appended to ``id_tracking_file``.

    Args:
        data_file: path of the CSV file holding the records to submit.
        id_tracking_file: path of the CSV file tracking node ids (defaults
            to the module-level ``node_tracking_file``).

    Raises:
        Exception: any error raised while processing a record is logged
            and re-raised, which stops the submission.
    """
    log.info('Starting submission of %ss.', node_type)
    nodes = []
    csv_fieldnames = get_field_header(data_file)
    write_csv_headers(data_file, fieldnames=csv_fieldnames)
    for record in load_data(data_file):
        # Truthiness check: an empty (or missing-valued) visit id sets the
        # record aside instead of submitting it.
        if not record['visit_id']:
            write_out_csv(data_file+'_records_no_submit.csv',
                          fieldnames=record.keys(), values=[record,])
            continue

        log.debug('\n...next record...')
        try:
            log.debug('data record: '+str(record))

            # Node-Specific Variables:
            load_search_field = 'name'
            internal_id = record['sample_name_id']
            parent_internal_id = record['visit_id']

            parent_id = get_parent_node_id(
                id_tracking_file, parent_type, parent_internal_id)

            node = load(internal_id, load_search_field)
            # A loaded node without a value in the search field is new.
            node_is_new = not getattr(node, load_search_field)
            if node_is_new:
                log.debug('loaded node newbie...')

            saved = validate_record(parent_id, node, record,
                                    data_file_name=data_file)
            if saved:
                header = settings.node_id_tracking.id_fields
                saved_name = getattr(saved, load_search_field)
                vals = values_to_node_dict(
                    [[node_type.lower(), saved_name, saved.id,
                      parent_type.lower(), parent_internal_id, parent_id]],
                    header
                    )
                nodes.append(vals)
                if node_is_new:
                    write_out_csv(id_tracking_file,
                                  fieldnames=get_field_header(id_tracking_file),
                                  values=vals)

        except Exception as e:
            log.exception(e)
            # Bare raise keeps the original traceback.
            raise
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate ``node`` from ``record``, then validate and save it.

    The record is appended to one of three audit CSV files derived from
    ``data_file_name``: ``*_submitted.csv`` when the node is saved,
    ``*_invalid_records.csv`` when validation fails, or
    ``*_unsaved_records.csv`` when ``node.save()`` reports failure.

    Args:
        parent_id: id of the parent node, stored as the 'sequenced_from' link.
        node: node object to fill in; must provide ``is_valid()``,
            ``validate()`` and ``save()``.
        record: mapping holding the values of one data row.
        data_file_name: base name used for the header lookup and for the
            audit CSV files (defaults to the module-level ``node_type``).

    Returns:
        The saved ``node`` on success, otherwise ``False``.
    """
    log.info("in validate/save: "+node_type)
    csv_fieldnames = get_field_header(data_file_name)
    write_csv_headers(data_file_name, fieldnames=csv_fieldnames)

    node.study = 'prediabetes'
    node.comment = record['host_transcriptomics_id']
    node.sequence_type = 'nucleotide'
    node.seq_model = ' '
    node.format = 'fastq'
    node.format_doc = 'https://en.wikipedia.org/wiki/FASTQ_format'
    node.exp_length = 0  # record['exp_length'] is not used
    # NOTE(review): this assigns a one-element *set*; a list may have been
    # intended -- confirm against what the node's ``urls`` attribute accepts.
    node.urls = {record['local_file_path']}
    node.checksums = {'md5': record['md5'], 'sha256': record['sha']}
    node.size = int(record['SIZE'])
    # Tag assignment is currently disabled for this node type.

    parent_link = {'sequenced_from': [parent_id]}
    log.debug('parent_id: '+str(parent_link))
    node.links = parent_link

    if not node.is_valid():
        write_out_csv(data_file_name+'_invalid_records.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        invalidities = node.validate()
        err_str = "Invalid {}!\n\t{}".format(node_type, str(invalidities))
        log.error(err_str)
        # Explicit failure value, as the docstring promises (was implicit None).
        return False
    elif node.save():
        write_out_csv(data_file_name+'_submitted.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        return node
    else:
        write_out_csv(data_file_name+'_unsaved_records.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        return False
Пример #12
0
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate ``node`` from ``record``, then validate and save it.

    The record is appended to one of three audit CSV files derived from
    ``data_file_name``: ``*_submitted.csv`` when the node is saved,
    ``*_invalid_records.csv`` when validation fails, or
    ``*_unsaved_records.csv`` when ``node.save()`` reports failure.

    Args:
        parent_id: id of the parent node, stored as the 'derived_from' link.
        node: node object to fill in; must provide ``is_valid()``,
            ``validate()`` and ``save()``.
        record: mapping holding the values of one data row.
        data_file_name: base name used for the header lookup and for the
            audit CSV files (defaults to the module-level ``node_type``).

    Returns:
        The saved ``node`` on success, otherwise ``False``.
    """
    csv_fieldnames = get_field_header(data_file_name)
    write_csv_headers(data_file_name, fieldnames=csv_fieldnames)

    node.study = 'prediabetes'
    node.comment = record['sample_name_id'] + ".metabolome"
    # TODO: fix to handle mzXML files
    node.format = record['format']
    node.format_doc = 'https://en.wikipedia.org/wiki/Mass_spectrometry_data_format'
    node.subtype = 'host'
    node.checksums = {'md5': record['MD5Sum'], 'sha256': record['SHA256']}
    node.local_file = record['url']
    # Tag assignment is currently disabled for this node type.

    # The interactive pdb breakpoint that used to sit here was removed so
    # the function can run unattended.
    log.debug('parent_id: '+str(parent_id))
    node.links = {'derived_from': [parent_id]}

    if not node.is_valid():
        write_out_csv(data_file_name+'_invalid_records.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        invalidities = node.validate()
        err_str = "Invalid {}!\n\t{}".format(node_type, str(invalidities))
        log.error(err_str)
        # Explicit failure value, as the docstring promises (was implicit None).
        return False
    elif node.save():
        write_out_csv(data_file_name+'_submitted.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        return node
    else:
        write_out_csv(data_file_name+'_unsaved_records.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        return False
Пример #13
0
def submit(data_file, id_tracking_file=node_tracking_file):
    """Submit every record of ``data_file`` as a node linked to its parent.

    For each record the parent's id is looked up in ``id_tracking_file``
    using ``<sample_name_id>.hostassayprep``. The node is loaded by its
    'comment' (``<sample_name_id>.proteome``), passed to
    ``validate_record`` and, when saved, its ids are collected; nodes that
    did not exist before are also appended to ``id_tracking_file``.

    Args:
        data_file: path of the CSV file holding the records to submit.
        id_tracking_file: path of the CSV file tracking node ids (defaults
            to the module-level ``node_tracking_file``).

    Raises:
        Exception: any error raised while processing a record is logged
            and re-raised, which stops the submission.
    """
    log.info('Starting submission of %ss.', node_type)
    nodes = []
    csv_fieldnames = get_field_header(data_file)
    write_csv_headers(data_file, fieldnames=csv_fieldnames)
    for record in load_data(data_file):
        log.info('\n...next record...')
        try:
            log.debug('data record: '+str(record))

            # node-specific variables:
            load_search_field = 'comment'
            internal_id = record['sample_name_id'] + '.proteome'
            parent_internal_id = record['sample_name_id'] + '.hostassayprep'

            parent_id = get_parent_node_id(
                id_tracking_file, parent_type, parent_internal_id)

            node = load(internal_id, load_search_field)
            # A loaded node without a value in the search field is new.
            node_is_new = not getattr(node, load_search_field)
            if node_is_new:
                log.debug('loaded node newbie...')

            saved = validate_record(parent_id, node, record,
                                    data_file_name=data_file)
            if saved:
                header = settings.node_id_tracking.id_fields
                saved_name = getattr(saved, load_search_field)
                vals = values_to_node_dict(
                    [[node_type.lower(), saved_name, saved.id,
                      parent_type.lower(), parent_internal_id, parent_id]],
                    header
                    )
                nodes.append(vals)
                if node_is_new:
                    write_out_csv(id_tracking_file,
                                  fieldnames=get_field_header(id_tracking_file),
                                  values=vals)

        except Exception as e:
            log.exception(e)
            # Bare raise keeps the original traceback.
            raise
Пример #14
0
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate ``node`` from ``record``, then validate and save it.

    The record is appended to one of three audit CSV files derived from
    ``data_file_name``: ``*_submitted.csv`` when the node is saved,
    ``*_invalid_records.csv`` when validation fails, or
    ``*_unsaved_records.csv`` when ``node.save()`` reports failure.

    Args:
        parent_id: id of the parent node, stored as the 'collected_during'
            link.
        node: node object to fill in; must provide ``is_valid()``,
            ``validate()`` and ``save()``.
        record: mapping holding the values of one data row.
        data_file_name: base name used for the header lookup and for the
            audit CSV files (defaults to the module-level ``node_type``).

    Returns:
        The saved ``node`` on success, otherwise ``False``.
    """
    log.info("in validate/save: "+node_type)
    csv_fieldnames = get_field_header(data_file_name)
    write_csv_headers(data_file_name, fieldnames=csv_fieldnames)

    node.name = record['sample_name_id']
    node.body_site = record['body_site'].lower()
    node.fma_body_site = record['fma_body_site']
    node.mixs = generate_mixs(record)
    node.tags = list_tags(
        node.tags,
        'stanford_id: ' + record['sample_name_id'],
        'visit id: ' + record['visit_id'],
        'subject id: ' + record['rand_subject_id'],
        'study: ' + 'prediabetes',
        'sub_study: ' + record['sub_study'],
        'visit type: ' + record['visit_type'],
    )

    parent_link = {'collected_during': [parent_id]}
    log.debug('parent_id: '+str(parent_id))
    node.links = parent_link

    if not node.is_valid():
        write_out_csv(data_file_name+'_invalid_records.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        invalidities = node.validate()
        err_str = "Invalid {}!\n\t{}".format(node_type, str(invalidities))
        log.error(err_str)
        # Explicit failure value, as the docstring promises (was implicit None).
        return False
    elif node.save():
        write_out_csv(data_file_name+'_submitted.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        return node
    else:
        write_out_csv(data_file_name+'_unsaved_records.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        return False
def submit(data_file, id_tracking_file=node_tracking_file):
    """Load, validate and save one node per record of ``data_file``.

    Records whose ``local_file`` column is empty are skipped. For every
    other record the parent node id is looked up in ``id_tracking_file``,
    the node is loaded by its ``local_file`` base name, and
    ``validate_record`` is asked to populate and save it. Nodes that were
    not previously known are appended to ``id_tracking_file``.

    Args:
        data_file: name of the data file whose records are submitted.
        id_tracking_file: name of the CSV file that maps internal ids to
            node ids.

    Returns:
        A list with the tracking values of every record that was saved.

    Raises:
        Exception: any error raised while processing a record is logged
            and re-raised unchanged, which stops the submission.
    """
    log.info('Starting submission of %ss.', node_type)
    nodes = []
    csv_fieldnames = get_field_header(data_file)
    write_csv_headers(data_file,fieldnames=csv_fieldnames)
    for record in load_data(data_file):
        log.info('\n...next record...')
        try:
            log.debug('data record: '+str(record))

            if record['local_file'] == '':
                # Nothing to submit for a record without a local file.
                continue

            load_search_field = 'local_file'
            internal_id = os.path.basename(record['local_file'])
            parent_internal_id = record['raw_file_id']

            parent_id = get_parent_node_id(
                id_tracking_file, parent_type, parent_internal_id)

            node = load(internal_id, load_search_field)
            # A node whose search field is empty is treated as new.
            node_is_new = not getattr(node, load_search_field)
            if node_is_new:
                log.debug('loaded node newbie...')

            saved = validate_record(parent_id, node, record,
                                    data_file_name=data_file)
            if saved:
                header = settings.node_id_tracking.id_fields
                # NOTE(review): `saved_name` is not assigned anywhere in
                # this function; it presumably has to come from module
                # scope -- confirm, otherwise this line raises NameError.
                vals = values_to_node_dict(
                        [[node_type.lower(),saved_name,saved.id,
                          parent_type.lower(),parent_internal_id,parent_id]],
                        header
                        )
                nodes.append(vals)
                if node_is_new:
                    write_out_csv(id_tracking_file,
                          fieldnames=get_field_header(id_tracking_file),
                          values=vals)

        except Exception as e:
            log.exception(e)
            # Bare raise keeps the original traceback intact.
            raise
    return nodes
# Example #16
# 0
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate ``node`` from ``record``, validate it, and save it.

    Sets the visit id, visit number, interval and tags on ``node``, links
    it to ``parent_id`` through a ``by`` link, then validates and saves it.
    The record is appended to ``data_file_name`` + ``_invalid_records.csv``,
    ``_submitted.csv`` or ``_unsaved_records.csv`` depending on the outcome.

    Args:
        parent_id: id stored in the node's ``by`` link.
        node: node object to fill in; it is mutated in place.
        record: mapping indexed by column name holding one data row;
            ``visit_number`` and ``interval`` must be convertible by
            ``int()``.
        data_file_name: name handed to ``get_field_header`` and used as the
            prefix of the outcome CSV files.

    Returns:
        ``node`` when it was saved; ``False`` when it is invalid or when
        ``node.save()`` reports failure.
    """
    log.info("in validate/save: "+node_type)
    csv_fieldnames = get_field_header(data_file_name)
    write_csv_headers(data_file_name,fieldnames=csv_fieldnames)

    node.visit_id = record['visit_id']
    node.visit_number = int(record['visit_number'])
    node.interval = int(record['interval'])
    node.tags = list_tags(
        node.tags,
        # 'test', # for debug!!
        'rand_subject_id: '+record['rand_subject_id'],
        'study: prediabetes',
        # 'study: '+record['study'],
        # 'sub_study: '+record['sub_study'],
    )
    log.debug('parent_id: '+str(parent_id))
    node.links = {'by':[parent_id]}

    if not node.is_valid():
        write_out_csv(data_file_name+'_invalid_records.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        invalidities = node.validate()
        err_str = "Invalid {}!\n\t{}".format(node_type, str(invalidities))
        log.error(err_str)
        # Return False explicitly (as the contract states) instead of
        # falling off the end of the function with None.
        return False

    if node.save():
        write_out_csv(data_file_name+'_submitted.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        return node

    write_out_csv(data_file_name+'_unsaved_records.csv',
                  fieldnames=csv_fieldnames, values=[record,])
    return False
# Example #17
# 0
def validate_record(parent_id, node, record, data_file_name=node_type):
	"""Populate ``node`` from ``record``, validate it, and save it.

	Copies the sample name, body site fields, MIXS data and tags from
	``record`` onto ``node``, links the node to ``parent_id`` through a
	``collected_during`` link, then validates and saves it. The record is
	appended to a CSV file named after ``data_file_name`` with its
	extension replaced by ``.invalid_records.csv``, ``.submitted.csv`` or
	``.unsaved.csv`` depending on the outcome.

	Args:
		parent_id: id stored in the node's ``collected_during`` link.
		node: node object to fill in; it is mutated in place.
		record: mapping indexed by column name holding one data row.
		data_file_name: name handed to ``get_field_header``; its base name
			(extension removed) prefixes the outcome CSV files.

	Returns:
		``node`` when it was saved; ``False`` when it is invalid or when
		``node.save()`` reports failure.
	"""
	import os  # local import keeps this block self-contained

	log.info("in validate/save: "+node_type)
	csv_fieldnames = get_field_header(data_file_name)

	# splitext drops only a real extension, so names that do not end in a
	# three-letter suffix (including the extension-less default) are not
	# truncated the way a fixed [:-4] slice would.
	base_name = os.path.splitext(data_file_name)[0]

	def _record_outcome(suffix):
		# Make sure the outcome file has a header, then append the record.
		outcome_file = base_name + suffix
		write_csv_headers(outcome_file, fieldnames=csv_fieldnames)
		write_out_csv(outcome_file, fieldnames=csv_fieldnames,
			values=[record,])

	node.name = record['sample_name_id']
	node.body_site = record['body_site'].lower()
	node.fma_body_site = record['fma_body_site']
	node.mixs = generate_mixs(record)
	node.tags = list_tags(
		'sample id: ' + record['sample_name_id'],
		'visit id: ' + record['DCC_VISIT_IDS'],
		'subject id: ' + record['rand_patient_id'],
		'study: prediabetes',
		#'consented: ' + record['consented'],
	)
	# node._attribs = record['attributes']

	parent_link = {'collected_during':[parent_id]}
	log.debug('parent_id: '+str(parent_link))
	node.links = parent_link

	if not node.is_valid():
		_record_outcome('.invalid_records.csv')
		invalidities = node.validate()
		err_str = "Invalid {}!\n\t{}".format(node_type, str(invalidities))
		log.error(err_str)
		# Return False explicitly (as the contract states) instead of
		# falling off the end of the function with None.
		return False

	if node.save():
		_record_outcome('.submitted.csv')
		return node

	_record_outcome('.unsaved.csv')
	return False
# Example #18
# 0
def validate_record(parent_id, node, record, data_file_name=node_type):
	"""Populate ``node`` from ``record``, validate it, and save it.

	Sets the study, subtype and race/age/gender tags on ``node``, links it
	to ``parent_id`` through an ``associated_with`` link, then validates
	and saves it. The record is appended to a CSV file named after
	``data_file_name`` with its extension replaced by
	``.invalid_records.csv``, ``.submitted.csv`` or ``.unsaved.csv``
	depending on the outcome.

	Args:
		parent_id: id stored in the node's ``associated_with`` link.
		node: node object to fill in; it is mutated in place.
		record: mapping indexed by column name holding one data row.
		data_file_name: name handed to ``get_field_header``; its base name
			(extension removed) prefixes the outcome CSV files.

	Returns:
		``node`` when it was saved; ``False`` when it is invalid or when
		``node.save()`` reports failure.
	"""
	import os  # local import keeps this block self-contained

	log.info("in validate/save: "+node_type)
	csv_fieldnames = get_field_header(data_file_name)

	# splitext drops only a real extension, so names that do not end in a
	# three-letter suffix (including the extension-less default) are not
	# truncated the way a fixed [:-4] slice would.
	base_name = os.path.splitext(data_file_name)[0]

	def _record_outcome(suffix):
		# Make sure the outcome file has a header, then append the record.
		outcome_file = base_name + suffix
		write_csv_headers(outcome_file, fieldnames=csv_fieldnames)
		write_out_csv(outcome_file, fieldnames=csv_fieldnames,
			values=[record,])

	node.study = 'prediabetes'
	node.subtype = 'prediabetes'
	node.tags = list_tags(
		'Race: ' + get_race(record['race_code']),
		'age: ' + record['age'],
		'gender: ' + get_gender(record['gender']),
	)

	parent_link = {'associated_with':[parent_id]}
	log.debug('parent_id: '+str(parent_link))
	node.links = parent_link

	if not node.is_valid():
		_record_outcome('.invalid_records.csv')
		invalidities = node.validate()
		err_str = "Invalid {}!\n\t{}".format(node_type, str(invalidities))
		log.error(err_str)
		# Return False explicitly (as the contract states) instead of
		# falling off the end of the function with None.
		return False

	if node.save():
		_record_outcome('.submitted.csv')
		return node

	_record_outcome('.unsaved.csv')
	return False
# Example #19
# 0
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate ``node`` from ``record``, validate it, and save it.

    Fills in a mix of fixed values (study, contact, center, format, title,
    species, tissue, ...) and record-derived values (comment, sample name,
    experiment type, prep id) on ``node``, links it to ``parent_id``
    through a ``prepared_from`` link, then validates and saves it. The
    record is appended to ``data_file_name`` + ``_invalid_records.csv``,
    ``_submitted.csv`` or ``_unsaved_records.csv`` depending on the outcome.

    Args:
        parent_id: id stored in the node's ``prepared_from`` link.
        node: node object to fill in; it is mutated in place.
        record: mapping indexed by column name holding one data row.
        data_file_name: name handed to ``get_field_header`` and used as the
            prefix of the outcome CSV files.

    Returns:
        ``node`` when it was saved; ``False`` when it is invalid or when
        ``node.save()`` reports failure.
    """
    log.info("in validate/save: "+node_type)
    csv_fieldnames = get_field_header(data_file_name)
    write_csv_headers(data_file_name,fieldnames=csv_fieldnames)

    #print("Calculating md5sum.")
    #md5sum = hashlib.md5()
    #    with open(record['local_url'], 'rb') as f:
    #    for chunk in iter(lambda: f.read(1024*1024), ''):
    #        md5sum.update(chunk)

    node.study         = 'prediabetes'
    node.comment       = record['host_assay_prep_id']
    node.sample_name   = record['sample_name_id']
    node.contact       = 'Wenyu Zhou'
    node.center        = 'Stanford'
    node.format        = 'csv'
    node.format_doc    = 'https://en.wikipedia.org/wiki/csv'
    node.exp_length    = 0 #record['exp_length']
    #node.local_file    = record['DCC_File_Path']
    node.experiment_type    = record['experiment_type']
    node.title         = 'T2D Prediabetes Proteomics'
    node.prep_id       = record['rand_patient_id']
    node.pride_id      = 'null'
    #node.checksums     = {'md5': md5.hexdigest(), 'sha256':record['sha256']}
    node.storage_duration = 0
    #node.size          = int(record['FileSize'])
    node.protocol_steps = ''
    # The species literal had been corrupted to 'H**o sapiens (Human)';
    # restore the actual species name.
    node.species        = 'Homo sapiens (Human)'
    node.subtype        = ''
    node.tissue         = 'blood'
    #node.tags = list_tags(node.tags,
    #                      # 'test', # for debug!!
    #                      'visit id: ' + record['visit_id'],
    #                      'subject id: ' + record['rand_subject_id'],
    #)

    parent_link = {'prepared_from':[parent_id]}
    log.debug('parent_id: '+str(parent_link))
    node.links = parent_link

    if not node.is_valid():
        write_out_csv(data_file_name+'_invalid_records.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        invalidities = node.validate()
        err_str = "Invalid {}!\n\t{}".format(node_type, str(invalidities))
        log.error(err_str)
        # Return False explicitly (as the contract states) instead of
        # falling off the end of the function with None.
        return False

    if node.save():
        write_out_csv(data_file_name+'_submitted.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        return node

    write_out_csv(data_file_name+'_unsaved_records.csv',
                  fieldnames=csv_fieldnames, values=[record,])
    return False
# Example #20
# 0
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate ``node`` from ``record``, validate it, and save it.

    Appends the record's ``DCC_File_Path`` to ``node.raw_url``, sets the
    comment, checksums and a set of fixed descriptive fields on ``node``,
    links it to ``parent_id`` through a ``derived_from`` link, then
    validates and saves it. The record is appended to ``data_file_name`` +
    ``_invalid_records.csv``, ``_submitted.csv`` or
    ``_unsaved_records.csv`` depending on the outcome.

    Args:
        parent_id: id stored in the node's ``derived_from`` link.
        node: node object to fill in; it is mutated in place.
        record: mapping indexed by column name holding one data row.
        data_file_name: name handed to ``get_field_header`` and used as the
            prefix of the outcome CSV files.

    Returns:
        ``node`` when it was saved; ``False`` when it is invalid or when
        ``node.save()`` reports failure.
    """
    csv_fieldnames = get_field_header(data_file_name)
    write_csv_headers(data_file_name,fieldnames=csv_fieldnames)

    #node.prepared_by   = 'Sara Ahadi'
    #node.sequence_type = 'nucleotide'
    #node.format        = 'mzml'
    #node.format_doc    = 'https://en.wikipedia.org/wiki/'
    #node.exp_length    = 0 #record['exp_length']
    node.raw_url.append(record['DCC_File_Path'])
    #node.size          = int(record['size'])
    #node.tags = list_tags(node.tags,
                          # 'test', # for debug!!
                          #'sample name: '+record['sample_name_id'],
                          #'visit id: '+record['visit_id'],
                          #'subject id: '+record['rand_subject_id'],
                          #'file name: '+ record['sample_name_id'] + 'mxML.gz',
    #                     )
    #node.analyzer = 'TOF(Time of Flight)'
    node.comment = record['sample_name_id']
    node.checksums = {'md5':record['MD5Sum'], 'sha256':record['SHA256']}
    node.data_processing_protocol = 'Targeted Data Independent analysis'
    #node.detector = ''
    #node.exp_description = 'Protein profiling of more than 900 samples from pre-diabetic and diabetic participants and different time points of healthy, viral infection and immunization.'
    #node.instrument_name = ''
    #node.pepid_url.append(record['DCC_File_Path'])
    #node.pepid_url.remove('')
    #node.pride_id = ''
    node.processing_method = 'Targeted Data independent analysis with OpenSwath'
    #node.peak_url = ''
    node.protocol_name = 'SWATH_Proteomics (attached)'
    node.sample_name = 'Plasma'
    node.search_engine = 'ProteinPilot Paragon database search algorithm'
    #node.short_label = ''
    #node.software = 'ProteinPilot 5.0.1, OpenSwath, PyProphet, TRIC'
    node.source = 'DuoSpray Ion Source'
    node.subtype = 'host'
    #node.raw_url = ''
    #node.result_url = ''
    #node.other_url = ''
    node.study = 'prediabetes'
    #node.tags = ()
    node.title = 'T2D Prediabetes'

    #Targeted Immunoproteomics

    parent_link = {'derived_from':[parent_id]}
    log.debug('parent_id: '+str(parent_link))
    node.links = parent_link

    if not node.is_valid():
        write_out_csv(data_file_name+'_invalid_records.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        invalidities = node.validate()
        err_str = "Invalid {}!\n\t{}".format(node_type, str(invalidities))
        log.error(err_str)
        # Return False explicitly (as the contract states) instead of
        # falling off the end of the function with None.
        return False

    if node.save():
        write_out_csv(data_file_name+'_submitted.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        return node

    write_out_csv(data_file_name+'_unsaved_records.csv',
                  fieldnames=csv_fieldnames, values=[record,])
    return False
# Example #21
# 0
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate ``node`` from ``record``, validate it, and save it.

    Sets the study, comment, checksums, subtype, format fields and local
    file on ``node``, links it to ``parent_id`` through a ``derived_from``
    link, then validates and saves it. The record is appended to
    ``data_file_name`` + ``_invalid_records.csv``, ``_submitted.csv`` or
    ``_unsaved_records.csv`` depending on the outcome.

    Args:
        parent_id: id stored in the node's ``derived_from`` link.
        node: node object to fill in; it is mutated in place.
        record: mapping indexed by column name holding one data row.
        data_file_name: name handed to ``get_field_header`` and used as the
            prefix of the outcome CSV files.

    Returns:
        ``node`` when it was saved; ``False`` when it is invalid or when
        ``node.save()`` reports failure.
    """
    log.info("in validate/save: "+node_type)
    csv_fieldnames = get_field_header(data_file_name)
    write_csv_headers(data_file_name,fieldnames=csv_fieldnames)

    #print("Calculating md5sum.")
    #md5sum = hashlib.md5()
    #    with open(record['local_url'], 'rb') as f:
    #    for chunk in iter(lambda: f.read(1024*1024), ''):
    #        md5sum.update(chunk)

    node.study         = 'prediabetes'
    node.comment       = record['sample_name_id'] + '.cytokine'
    node.checksums     = {'md5':record['md5'], 'sha256':record['sha256']}
    node.subtype       = 'prediabetes'
#    node.urls          = record['local_file']
    node.format_doc    = 'https://en.wikipedia.org/wiki/Tab-separated_values'
    node.format        = 'tsv'
#    node.sample_name   = record['SAMPLE_PARENT_ID']
#    node.contact       = record['sequencing_contact']
#    node.center        = record['center']
#    node.format        = 'mzXML'
#    node.format_doc    = 'https://en.wikipedia.org/wiki/Mass_spectrometry_data_format'
#    node.exp_length    = 0 #record['exp_length']
    node.local_file    = record['FileLocation']
#    node.experiment_type    = 'Untargeted metabolomics'
#    node.title         = record['title']
#    node.prep_id       = record['prep_id']
#    node.pride_id      = 'null'
#    node.storage_duration = 0
#    node.size          = int(record['size'])
#    node.tags = list_tags(node.tags,
#                          # 'test', # for debug!!
#                          'visit id: '+record['visit_id'],
#                          'subject id: '+record['rand_subject_id'],
#                          'file prefix: '+ record['prep_id'],
#                          'file name: '+ str(record['FILE_NAME']),
#                         )

    parent_link = {'derived_from':[parent_id]}
    log.debug('parent_id: '+str(parent_link))
    node.links = parent_link

    if not node.is_valid():
        write_out_csv(data_file_name+'_invalid_records.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        invalidities = node.validate()
        err_str = "Invalid {}!\n\t{}".format(node_type, str(invalidities))
        log.error(err_str)
        # Return False explicitly (as the contract states) instead of
        # falling off the end of the function with None.
        return False

    if node.save():
        write_out_csv(data_file_name+'_submitted.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        return node

    write_out_csv(data_file_name+'_unsaved_records.csv',
                  fieldnames=csv_fieldnames, values=[record,])
    return False
# Example #22
# 0
	   if valid, save, if not, return false
	"""
	log.info("in validate/save: "+node_type)
	csv_fieldnames = get_field_header(data_file_name)

	node.study = 'prediabetes'
    node.subtype = 'prediabetes'
    node.tags = list_tags('')

	parent_link = {'associated_with':[parent_id]}
	log.debug('parent_id: '+str(parent_link))
	node.links = parent_link

	if not node.is_valid():
		invalids = data_file_name[:-4]+'.invalid_records.csv'
		write_csv_headers(invalids, fieldnames=csv_fieldnames)
		write_out_csv(invalids, fieldnames=csv_fieldnames,
					  values=[record,])
		invalidities = node.validate()
		err_str = "Invalid {}!\n\t{}".format(node_type, str(invalidities))
		log.error(err_str)
		# raise Exception(err_str)
	elif node.save():
		submitted = data_file_name[:-4]+'.submitted.csv'
		write_csv_headers(submitted, fieldnames=csv_fieldnames)
		write_out_csv(submitted, fieldnames=csv_fieldnames,
					  values=[record,])
		return node
	else:
		unsaved = data_file_name[:-4]+'.unsaved.csv'
		write_csv_headers(unsaved, fieldnames=csv_fieldnames)