예제 #1
0
파일: db.py 프로젝트: xingtech/oncotator
def create_db_obj_from_txt_GENEINDEXED(txt_fname,
                                       title,
                                       indexColumnNames='gene'):
    """Build an annotation table keyed by a single index column.

    The file is loaded with ``read_delim(txt_fname, return_list=False)`` and
    the resulting rows are passed through
    ``read_data_and_add_title_str_to_each_header`` before being indexed.
    Rows are expected to be mappings that can be looked up both by
    ``indexColumnNames`` and by the title-prefixed column names (presumably
    the helper adds the prefix -- confirm against its definition).

    Args:
        txt_fname: Path of the delimited text file to load.
        title: Prefix used to build the output column names.
        indexColumnNames: Name of the column whose value keys the table.

    Returns:
        A ``(annotation_table, new_colnames)`` tuple. ``new_colnames`` holds
        ``'<title>_<column>'`` (column name stripped) for every header column
        other than the index column; ``annotation_table`` maps each index
        value to a dict of ``{new_colname: stripped value}``.

    Raises:
        SystemExit: Via ``sys.exit(1)``, after printing an error, when an
            index value occurs in more than one row.
    """
    raw_rows, colnames = read_delim(txt_fname, return_list=False)
    rows = read_data_and_add_title_str_to_each_header(
        raw_rows, title, indexColumnNames)

    # Every header column except the index column, prefixed with the title.
    new_colnames = []
    for name in colnames:
        if name != indexColumnNames:
            new_colnames.append('%s_%s' % (title, name.strip()))

    annotation_table = {}
    for row in rows:
        key = row[indexColumnNames]
        if key in annotation_table:
            # Index values must be unique; a repeat aborts the program.
            print("ERROR: " + indexColumnNames +
                  " indices must be unique: %s found multiple times in %s!" %
                  (key, txt_fname))
            sys.exit(1)

        annotation_table[key] = {
            field: row[field].strip() for field in new_colnames
        }

    return annotation_table, new_colnames
예제 #2
0
def create_db_obj_from_txt_POSINDEXED(txt_fname,
                                      title,
                                      indexColumnNames='chr,start,end'):
    """Build an annotation table indexed by chromosome and position bin.

    The file is loaded with ``read_delim(txt_fname, return_list=False)`` and
    the resulting rows are passed through
    ``read_data_and_add_title_str_to_each_header``. Each row is then
    normalized in place: a leading ``'chr'`` is removed from the chromosome
    value, and the keys ``'bin'`` (from ``region2bin(int(start), int(end))``),
    ``'start'`` and ``'end'`` are set on the row.

    Args:
        txt_fname: Path of the delimited text file to load.
        title: Prefix used to build the output column names.
        indexColumnNames: Comma-separated names of the chromosome, start and
            end columns, in that order.

    Returns:
        A ``(annotation_table, new_colnames)`` tuple. ``annotation_table``
        maps chromosome -> bin -> list of rows, with rows appended in
        (chromosome, start, end) sort order. ``new_colnames`` holds
        ``'<title>_<column>'`` for every header column that is neither an
        index column nor ``'build'``.

    Raises:
        Exception: If any index column is missing from the header, or if
            fewer than three index columns are named.
    """
    data_0, colnames = read_delim(txt_fname, return_list=False)
    # Materialize the rows: they are iterated twice below (normalize, then
    # sort), which would silently yield an empty table if the helper handed
    # back a one-shot iterator.
    rows = list(
        read_data_and_add_title_str_to_each_header(data_0, title,
                                                   indexColumnNames))
    indexColumnList = indexColumnNames.split(',')

    if not set(indexColumnList).issubset(colnames):
        print("ERROR: Invalid header values!")
        print("If indexing by genomic position, headers must include '" +
              indexColumnNames + "'.")
        raise Exception(
            "If indexing by genomic position, headers must include '" +
            indexColumnNames + "'.")

    if len(indexColumnList) < 3:
        # Previously 'build' was appended to this very list, so a two-column
        # index silently used 'build' as the end column.
        raise Exception(
            "Indexing by genomic position requires three index columns "
            "(chromosome, start, end); got '" + indexColumnNames + "'.")

    # Build the exclusion set separately so indexColumnList is not mutated.
    excluded = set(indexColumnList)
    excluded.add('build')

    new_colnames = [
        '%s_%s' % (title, c) for c in colnames if c not in excluded
    ]

    chrom_col, start_col, end_col = indexColumnList[:3]

    for row in rows:
        if row[chrom_col].startswith('chr'):
            row[chrom_col] = row[chrom_col][3:]

        row['bin'] = region2bin(int(row[start_col]), int(row[end_col]))
        row['start'] = row[start_col]
        row['end'] = row[end_col]

    # One multi-key sort; equivalent to stable-sorting by end, then start,
    # then chromosome.
    # NOTE(review): the raw column values are compared; if they are strings,
    # positions order lexicographically ('100' < '20') -- confirm whether
    # numeric ordering is intended.
    rows = sorted(rows,
                  key=operator.itemgetter(chrom_col, start_col, end_col))

    annotation_table = dict()
    for row in rows:
        bins = annotation_table.setdefault(row[chrom_col], {})
        bins.setdefault(row['bin'], []).append(row)

    return annotation_table, new_colnames
예제 #3
0
파일: db.py 프로젝트: alexramos/oncotator
def create_db_obj_from_txt_GENEINDEXED(txt_fname, title, indexColumnNames='gene'):
	"""Build an annotation table keyed by a single index column.

	The file is loaded with ``read_delim(txt_fname, return_list=False)`` and
	the resulting rows are passed through
	``read_data_and_add_title_str_to_each_header`` before being indexed.
	Rows are expected to be mappings that can be looked up both by
	``indexColumnNames`` and by the title-prefixed column names (presumably
	the helper adds the prefix -- confirm against its definition).

	Args:
		txt_fname: Path of the delimited text file to load.
		title: Prefix used to build the output column names.
		indexColumnNames: Name of the column whose value keys the table.

	Returns:
		A ``(annotation_table, new_colnames)`` tuple. ``new_colnames`` holds
		``'<title>_<column>'`` (column name stripped) for every header column
		other than the index column; ``annotation_table`` maps each index
		value to a dict of ``{new_colname: stripped value}``.

	Raises:
		SystemExit: Via ``sys.exit(1)``, after printing an error, when an
			index value occurs in more than one row.
	"""
	raw_rows, colnames = read_delim(txt_fname, return_list=False)
	rows = read_data_and_add_title_str_to_each_header(raw_rows, title, indexColumnNames)

	# Every header column except the index column, prefixed with the title.
	value_columns = [name for name in colnames if name != indexColumnNames]
	new_colnames = ['%s_%s' % (title, name.strip()) for name in value_columns]

	annotation_table = {}

	for row in rows:
		key = row[indexColumnNames]
		if key in annotation_table:
			# Index values must be unique; a repeat aborts the program.
			print("ERROR: " + indexColumnNames + " indices must be unique: %s found multiple times in %s!" % (key, txt_fname))
			sys.exit(1)

		record = annotation_table[key] = {}
		for field in new_colnames:
			record[field] = row[field].strip()

	return annotation_table, new_colnames
예제 #4
0
파일: db.py 프로젝트: alexramos/oncotator
def create_db_obj_from_txt_POSINDEXED(txt_fname, title, indexColumnNames='chr,start,end'):
	"""Build an annotation table indexed by chromosome and position bin.

	The file is loaded with ``read_delim(txt_fname, return_list=False)`` and
	the resulting rows are passed through
	``read_data_and_add_title_str_to_each_header``. Each row is then
	normalized in place: a leading ``'chr'`` is removed from the chromosome
	value, and the keys ``'bin'`` (from ``region2bin(int(start), int(end))``),
	``'start'`` and ``'end'`` are set on the row.

	Args:
		txt_fname: Path of the delimited text file to load.
		title: Prefix used to build the output column names.
		indexColumnNames: Comma-separated names of the chromosome, start and
			end columns, in that order.

	Returns:
		A ``(annotation_table, new_colnames)`` tuple. ``annotation_table``
		maps chromosome -> bin -> list of rows, with rows appended in
		(chromosome, start, end) sort order. ``new_colnames`` holds
		``'<title>_<column>'`` for every header column that is neither an
		index column nor ``'build'``.

	Raises:
		Exception: If any index column is missing from the header, or if
			fewer than three index columns are named.
	"""
	data_0, colnames = read_delim(txt_fname, return_list=False)
	# Materialize the rows: they are iterated twice below (normalize, then
	# sort), which would silently yield an empty table if the helper handed
	# back a one-shot iterator.
	rows = list(read_data_and_add_title_str_to_each_header(data_0, title, indexColumnNames))
	indexColumnList = indexColumnNames.split(',')

	if not set(indexColumnList).issubset(colnames):
		print("ERROR: Invalid header values!")
		print("If indexing by genomic position, headers must include '" + indexColumnNames + "'.")
		raise Exception("If indexing by genomic position, headers must include '" + indexColumnNames + "'.")

	if len(indexColumnList) < 3:
		# Previously 'build' was appended to this very list, so a two-column
		# index silently used 'build' as the end column.
		raise Exception("Indexing by genomic position requires three index columns (chromosome, start, end); got '" + indexColumnNames + "'.")

	# Build the exclusion set separately so indexColumnList is not mutated.
	excluded = set(indexColumnList)
	excluded.add('build')

	new_colnames = ['%s_%s' % (title, c) for c in colnames if c not in excluded]

	chrom_col, start_col, end_col = indexColumnList[:3]

	for row in rows:
		if row[chrom_col].startswith('chr'):
			row[chrom_col] = row[chrom_col][3:]

		row['bin'] = region2bin(int(row[start_col]), int(row[end_col]))
		row['start'] = row[start_col]
		row['end'] = row[end_col]

	# One multi-key sort; equivalent to stable-sorting by end, then start,
	# then chromosome.
	# NOTE(review): the raw column values are compared; if they are strings,
	# positions order lexicographically ('100' < '20') -- confirm whether
	# numeric ordering is intended.
	rows = sorted(rows, key=operator.itemgetter(chrom_col, start_col, end_col))

	annotation_table = dict()
	for row in rows:
		bins = annotation_table.setdefault(row[chrom_col], {})
		bins.setdefault(row['bin'], []).append(row)

	return annotation_table, new_colnames