def create_db_obj_from_txt_GENEINDEXED(txt_fname, title, indexColumnNames='gene'):
    """Build an annotation lookup keyed by a single index column.

    The file is loaded with ``read_delim`` and its rows are passed through
    ``read_data_and_add_title_str_to_each_header`` (presumably this renames
    every non-index header to ``<title>_<header>`` -- confirm against that
    helper; the lookups below rely on those prefixed keys existing).

    Args:
        txt_fname: Path of the delimited text file to load.
        title: Prefix used to build the output column names.
        indexColumnNames: Name of the column whose values key the table.

    Returns:
        A tuple ``(annotation_table, new_colnames)``. ``annotation_table``
        maps each index value to a dict of ``{prefixed column: stripped
        value}``; ``new_colnames`` lists the prefixed column names, in file
        order, with the index column left out.

    Side effects:
        If an index value occurs more than once, an error is printed and the
        process is terminated via ``sys.exit(1)``.
    """
    # NOTE(review): a function with this exact name is defined more than once
    # in this file; the last definition is the one that stays bound.
    raw_rows, colnames = read_delim(txt_fname, return_list=False)
    rows = read_data_and_add_title_str_to_each_header(raw_rows, title, indexColumnNames)

    # Prefixed names of every column except the index column itself.
    new_colnames = []
    for header in colnames:
        if header == indexColumnNames:
            continue
        new_colnames.append('%s_%s' % (title, header.strip()))

    annotation_table = {}
    for row in rows:
        key = row[indexColumnNames]
        if key in annotation_table:
            # Index values must be unique; a repeat is treated as fatal.
            detail = " indices must be unique: %s found multiple times in %s!" % (key, txt_fname)
            print("ERROR: " + indexColumnNames + detail)
            sys.exit(1)
        entry = annotation_table[key] = {}
        for field in new_colnames:
            entry[field] = row[field].strip()

    return annotation_table, new_colnames
def create_db_obj_from_txt_POSINDEXED(txt_fname, title, indexColumnNames='chr,start,end'):
    """Build an annotation lookup indexed by chromosome and genomic bin.

    The file is loaded with ``read_delim`` and its rows are passed through
    ``read_data_and_add_title_str_to_each_header`` (presumably this prefixes
    the non-index headers with ``title`` -- confirm against that helper).
    Each row dict is then annotated in place with ``'bin'`` (the value of
    ``region2bin(int(start), int(end))``), ``'start'`` and ``'end'`` keys, and
    a leading ``'chr'`` is removed from its chromosome value.

    Args:
        txt_fname: Path of the delimited text file to load.
        title: Prefix used to build the output column names.
        indexColumnNames: Comma-separated names of the chromosome, start and
            end columns, in that order.

    Returns:
        A tuple ``(annotation_table, new_colnames)``. ``annotation_table`` is
        ``{chromosome: {bin: [row, ...]}}`` with rows ordered by chromosome,
        then numeric start, then numeric end. ``new_colnames`` lists
        ``<title>_<column>`` for every column that is neither an index column
        nor ``'build'``.

    Raises:
        Exception: If fewer than three index columns are named, or if any of
            them is missing from the file's header.
    """
    # NOTE(review): a function with this exact name is defined more than once
    # in this file; the last definition is the one that stays bound.
    data_0, colnames = read_delim(txt_fname, return_list=False)
    data = read_data_and_add_title_str_to_each_header(data_0, title, indexColumnNames)

    index_cols = indexColumnNames.split(',')
    if len(index_cols) < 3:
        # Without this guard the end column would be undefined (previously it
        # silently became 'build' because of list aliasing).
        raise Exception(
            "indexColumnNames must name the chromosome, start and end columns; got '"
            + indexColumnNames + "'.")

    # `!=`-style subset test instead of the `<>` operator, which Python 3 rejects.
    if not set(index_cols).issubset(colnames):
        header_error = ("If indexing by genomic position, headers must include '"
                        + indexColumnNames + "'.")
        print("ERROR: Invalid header values!")
        print(header_error)
        raise Exception(header_error)

    # Build the exclusion set from a copy so the index column list is not
    # mutated by adding 'build'.
    excluded = set(index_cols)
    excluded.add('build')
    new_colnames = ['%s_%s' % (title, c) for c in colnames if c not in excluded]

    # Local names chosen so the builtin `chr` is not shadowed.
    chrom_col, start_col, end_col = index_cols[:3]

    for d in data:
        if d[chrom_col].startswith('chr'):
            d[chrom_col] = d[chrom_col][3:]
        d['bin'] = region2bin(int(d[start_col]), int(d[end_col]))
        d['start'] = d[start_col]
        d['end'] = d[end_col]

    # Compare coordinates as integers: sorting the raw text values would put
    # "100" before "20". int() already succeeded on every row in the loop above.
    data = sorted(
        data,
        key=lambda d: (d[chrom_col], int(d[start_col]), int(d[end_col])))

    annotation_table = dict()
    for d in data:
        bins = annotation_table.setdefault(d[chrom_col], {})
        bins.setdefault(d['bin'], []).append(d)

    return annotation_table, new_colnames
def create_db_obj_from_txt_GENEINDEXED(txt_fname, title, indexColumnNames='gene'):
    """Load a delimited file into a dict keyed by one index column.

    Rows come from ``read_delim`` and are post-processed by
    ``read_data_and_add_title_str_to_each_header`` (presumably that helper
    renames each non-index header to ``<title>_<header>`` -- verify; the
    per-row lookups below use those prefixed keys).

    Args:
        txt_fname: Path of the delimited text file to load.
        title: Prefix used to build the output column names.
        indexColumnNames: Name of the column whose values key the table.

    Returns:
        ``(annotation_table, new_colnames)`` where ``annotation_table`` maps
        an index value to ``{prefixed column: whitespace-stripped value}`` and
        ``new_colnames`` is the list of prefixed, non-index column names.

    Side effects:
        On a repeated index value an error message is printed and
        ``sys.exit(1)`` is called.
    """
    loaded_rows, colnames = read_delim(txt_fname, return_list=False)
    records = read_data_and_add_title_str_to_each_header(
        loaded_rows, title, indexColumnNames)

    # Every header except the index column, prefixed with the title.
    value_headers = [h for h in colnames if h != indexColumnNames]
    new_colnames = ['%s_%s' % (title, h.strip()) for h in value_headers]

    annotation_table = {}
    for record in records:
        index_value = record[indexColumnNames]
        if index_value not in annotation_table:
            stripped = {}
            annotation_table[index_value] = stripped
            for column in new_colnames:
                stripped[column] = record[column].strip()
            continue

        # A second occurrence of the same index value is fatal.
        print("ERROR: " + indexColumnNames
              + " indices must be unique: %s found multiple times in %s!"
              % (index_value, txt_fname))
        sys.exit(1)

    return annotation_table, new_colnames
def create_db_obj_from_txt_POSINDEXED(txt_fname, title, indexColumnNames='chr,start,end'):
    """Build an annotation lookup indexed by chromosome and genomic bin.

    The file is loaded with ``read_delim`` and its rows are passed through
    ``read_data_and_add_title_str_to_each_header`` (presumably this prefixes
    the non-index headers with ``title`` -- confirm against that helper).
    Each row dict is then annotated in place with ``'bin'`` (the value of
    ``region2bin(int(start), int(end))``), ``'start'`` and ``'end'`` keys, and
    a leading ``'chr'`` is removed from its chromosome value.

    Args:
        txt_fname: Path of the delimited text file to load.
        title: Prefix used to build the output column names.
        indexColumnNames: Comma-separated names of the chromosome, start and
            end columns, in that order.

    Returns:
        A tuple ``(annotation_table, new_colnames)``. ``annotation_table`` is
        ``{chromosome: {bin: [row, ...]}}`` with rows ordered by chromosome,
        then numeric start, then numeric end. ``new_colnames`` lists
        ``<title>_<column>`` for every column that is neither an index column
        nor ``'build'``.

    Raises:
        Exception: If fewer than three index columns are named, or if any of
            them is missing from the file's header.
    """
    data_0, colnames = read_delim(txt_fname, return_list=False)
    data = read_data_and_add_title_str_to_each_header(data_0, title, indexColumnNames)

    index_cols = indexColumnNames.split(',')
    if len(index_cols) < 3:
        # Without this guard the end column would be undefined (previously it
        # silently became 'build' because of list aliasing).
        raise Exception(
            "indexColumnNames must name the chromosome, start and end columns; got '"
            + indexColumnNames + "'.")

    # `!=`-style subset test instead of the `<>` operator, which Python 3 rejects.
    if not set(index_cols).issubset(colnames):
        header_error = ("If indexing by genomic position, headers must include '"
                        + indexColumnNames + "'.")
        print("ERROR: Invalid header values!")
        print(header_error)
        raise Exception(header_error)

    # Build the exclusion set from a copy so the index column list is not
    # mutated by adding 'build'.
    excluded = set(index_cols)
    excluded.add('build')
    new_colnames = ['%s_%s' % (title, c) for c in colnames if c not in excluded]

    # Local names chosen so the builtin `chr` is not shadowed.
    chrom_col, start_col, end_col = index_cols[:3]

    for d in data:
        if d[chrom_col].startswith('chr'):
            d[chrom_col] = d[chrom_col][3:]
        d['bin'] = region2bin(int(d[start_col]), int(d[end_col]))
        d['start'] = d[start_col]
        d['end'] = d[end_col]

    # Compare coordinates as integers: sorting the raw text values would put
    # "100" before "20". int() already succeeded on every row in the loop above.
    data = sorted(
        data,
        key=lambda d: (d[chrom_col], int(d[start_col]), int(d[end_col])))

    annotation_table = dict()
    for d in data:
        bins = annotation_table.setdefault(d[chrom_col], {})
        bins.setdefault(d['bin'], []).append(d)

    return annotation_table, new_colnames