def test_append_line(self):
    """Append a line to a file
    """
    new_line = 'chr3,10,9,8'
    tabfile = TabFile('test',self.fp,first_line_is_header=True,delimiter=',')
    tabfile.append(tabdata=new_line)
    # The appended data should now be the last line in the file
    self.assertEqual(str(tabfile[-1]),new_line)
def test_transpose_tab_file(self):
    """Test transposing TabFile
    """
    original = TabFile('test',self.fp,first_line_is_header=False)
    transposed = original.transpose()
    # Transposition swaps rows and columns
    self.assertEqual(len(original),transposed.nColumns())
    self.assertEqual(len(transposed),original.nColumns())
def test_append_line_as_data(self):
    """Append a line to a file with data supplied as a list
    """
    tabfile = TabFile('test',self.fp,first_line_is_header=True,delimiter=',')
    items = ['chr3','10','9','8']
    tabfile.append(data=items)
    # Last line should render as the items joined by the delimiter
    expected = ','.join([str(x) for x in items])
    self.assertEqual(str(tabfile[-1]),expected)
def test_write_data_with_header(self):
    """Write data to file-like object including a header line
    """
    tabfile = TabFile('test',self.fp,first_line_is_header=True,delimiter=',')
    out = cStringIO.StringIO()
    tabfile.write(fp=out,include_header=True)
    # Output should be header plus data, with tabs turned into commas
    expected = self.header.replace('\t',',')+self.data.replace('\t',',')
    self.assertEqual(out.getvalue(),expected)
    out.close()
def test_add_tab_data_to_new_tabfile(self):
    """Test adding data as a tab-delimited line to a new empty TabFile
    """
    tabline = 'chr1\t10000\t20000\t+'
    tabfile = TabFile()
    tabfile.append(tabdata=tabline)
    self.assertEqual(len(tabfile),1,"TabFile should now have one line")
    # Stored line should round-trip back to the input string
    self.assertEqual(str(tabfile[0]),tabline)
def test_reverse_sort_on_column(self):
    """Sort data on a numerical column into (reverse) descending order
    """
    tabfile = TabFile('test',self.fp,first_line_is_header=True)
    tabfile.sort(lambda line: line['data'],reverse=True)
    # Values should now run from largest to smallest
    expected = [6.8,5.7,3.4]
    for i,value in enumerate(expected):
        self.assertEqual(tabfile[i]['data'],value)
def test_write_data(self):
    """Write data to file-like object
    """
    tabfile = TabFile('test',self.fp)
    out = cStringIO.StringIO()
    tabfile.write(fp=out)
    # Written content should match the original input data
    self.assertEqual(out.getvalue(),self.data)
    out.close()
def test_add_data_to_new_tabfile(self):
    """Test adding data as a list of items to a new empty TabFile
    """
    items = ['chr1',10000,20000,'+']
    tabfile = TabFile()
    tabfile.append(data=items)
    self.assertEqual(len(tabfile),1,"TabFile should now have one line")
    # Each stored field should match the corresponding input item
    for i,item in enumerate(items):
        self.assertEqual(tabfile[0][i],item)
def test_unexpected_uncommented_header(self):
    """Test reading in a tab file with an unexpected uncommented header
    """
    tabfile = TabFile('test',self.fp)
    # Header line is read as data, so all 4 lines are kept
    self.assertEqual(len(tabfile),4,"Input has 4 lines of data")
    self.assertEqual(tabfile.nColumns(),4)
    self.assertEqual(tabfile.header(),[],"Wrong header")
    self.assertEqual(str(tabfile[0]),"chr\tstart\tend\tdata","Incorrect string representation")
    # With no header, lookup by column name must raise KeyError
    last_line = tabfile[3]
    self.assertRaises(KeyError,last_line.__getitem__,'chr')
def test_expected_uncommented_header(self):
    """Test reading in a tab file with an expected uncommented header
    """
    tabfile = TabFile('test',self.fp,first_line_is_header=True)
    # First line is consumed as the header, leaving 3 data lines
    self.assertEqual(tabfile.nColumns(),4)
    self.assertEqual(len(tabfile),3,"Input has 3 lines of data")
    self.assertEqual(tabfile.header(),['chr','start','end','data'],"Wrong header")
    self.assertEqual(str(tabfile[0]),"chr1\t1\t234\t4.6","Incorrect string representation")
    # Columns should be addressable by the header names
    self.assertEqual(tabfile[2]['chr'],'chr2',"Incorrect data")
def test_load_data_with_header(self):
    """Create and load Tabfile using first line as header
    """
    tabfile = TabFile('test',self.fp,first_line_is_header=True)
    expected_header = ['chr','start','end','data']
    self.assertEqual(len(tabfile),3,"Input has 3 lines of data")
    self.assertEqual(tabfile.header(),expected_header,"Wrong header")
    self.assertEqual(str(tabfile[0]),"chr1\t1\t234\t4.6","Incorrect string representation")
    # Fields should be addressable via the header names
    self.assertEqual(tabfile[2]['chr'],'chr2',"Incorrect data")
    self.assertEqual(tabfile.nColumns(),len(expected_header))
def test_load_data(self):
    """Create and load new TabFile instance
    """
    tabfile = TabFile('test',self.fp,delimiter=',')
    # Name supplied at construction should be reported back
    self.assertEqual(tabfile.filename(),'test')
    self.assertEqual(len(tabfile),3,"Input has 3 lines of data")
    self.assertEqual(tabfile.header(),[],"Header should be empty")
    first = tabfile[0]
    self.assertEqual(str(first),"chr1,1,234,4.6","Incorrect string representation")
    # Without a header, fields are accessed by integer index
    self.assertEqual(tabfile[2][0],'chr2',"Incorrect data")
    self.assertEqual(tabfile.nColumns(),4)
def test_insert_tab_data_line(self):
    """Insert a TabDataLine into a TabFile
    """
    tabfile = TabFile('test',self.fp)
    self.assertEqual(len(tabfile),3)
    new_line = TabDataLine('chr1\t10000\t20000\t+')
    inserted = tabfile.insert(2,tabdataline=new_line)
    self.assertEqual(len(tabfile),4)
    # insert should return the very object that was passed in
    self.assertTrue(inserted is new_line)
def test_insert_line_with_tab_data(self):
    """Insert line into a TabFile populated from tabbed data
    """
    tabdata = 'chr1\t10000\t20000\t+'
    tabfile = TabFile('test',self.fp)
    self.assertEqual(len(tabfile),3)
    inserted = tabfile.insert(2,tabdata=tabdata)
    self.assertEqual(len(tabfile),4)
    # The returned line should reproduce the supplied data exactly
    self.assertTrue(str(inserted) == tabdata)
def test_insert_empty_line(self):
    """Insert a blank line into a TabFile
    """
    tabfile = TabFile('test',self.fp)
    self.assertEqual(len(tabfile),3)
    blank = tabfile.insert(2)
    self.assertEqual(len(tabfile),4)
    # Every field of the newly inserted line should be empty
    for idx in range(len(blank)):
        self.assertTrue(str(blank[idx]) == '')
def test_append_tab_data_line(self):
    """Append a TabDataLine to a TabFile
    """
    tabfile = TabFile('test',self.fp)
    self.assertEqual(len(tabfile),3)
    new_line = TabDataLine('chr1\t10000\t20000\t+')
    appended = tabfile.append(tabdataline=new_line)
    self.assertEqual(len(tabfile),4)
    # append should hand back the same object it was given
    self.assertTrue(appended is new_line)
def test_append_empty_line(self):
    """Append a blank line to a TabFile
    """
    tabfile = TabFile('test',self.fp)
    self.assertEqual(len(tabfile),3)
    blank = tabfile.append()
    self.assertEqual(len(tabfile),4)
    # All fields of the appended line should be empty strings
    for idx in range(len(blank)):
        self.assertTrue(str(blank[idx]) == '')
def test_load_data_setting_explicit_header(self):
    """Create and load TabFile setting the header explicitly
    """
    # Explicit column names override those on the first (header) line
    names = ('CHROM','START','STOP','VALUES')
    tabfile = TabFile('test',self.fp,first_line_is_header=True,
                      column_names=names)
    self.assertEqual(len(tabfile),3,"Input has 3 lines of data")
    self.assertEqual(tabfile.header(),['CHROM','START','STOP','VALUES'],"Wrong header")
    self.assertEqual(str(tabfile[0]),"chr1\t1\t234\t4.6","Incorrect string representation")
    # Lookup should work via the explicit names
    self.assertEqual(tabfile[2]['CHROM'],'chr2',"Incorrect data")
    self.assertEqual(tabfile.nColumns(),4)
def test_insert_line_with_data(self):
    """Insert line into a TabFile populated with data
    """
    items = ['chr1',678,901,6.1]
    tabfile = TabFile('test',self.fp)
    self.assertEqual(len(tabfile),3)
    inserted = tabfile.insert(2,data=items)
    self.assertEqual(len(tabfile),4)
    # Each field of the returned line should match the input items
    for i,item in enumerate(items):
        self.assertTrue(inserted[i] == item)
def test_apply_operation_to_column(self):
    """Divide values in a column by 10
    """
    tabfile = TabFile('test',self.fp,first_line_is_header=True)
    # Sanity-check columns and header before transforming
    self.assertEqual(tabfile.nColumns(),4)
    self.assertEqual(tabfile.header(),['chr','start','end','data'])
    # Apply the division to every value in the 'data' column
    tabfile.transformColumn('data',lambda x: x/10)
    expected = [0.46,0.57,0.68]
    for i,value in enumerate(expected):
        self.assertEqual(tabfile[i]['data'],value)
def test_get_index_for_line_number(self):
    """Look up line numbers from a TabFile

    Checks that indexByLineNumber maps file line numbers to
    data indices, and raises IndexError for line numbers that
    have no corresponding data line.
    """
    tabfile = TabFile('test',self.fp)
    # Look for an existing line
    self.assertEqual(tabfile.indexByLineNumber(2),0)
    self.assertEqual(tabfile[tabfile.indexByLineNumber(2)].lineno(),2)
    # Look for the first line in the file (the commented header)
    self.assertRaises(IndexError,tabfile.indexByLineNumber,1)
    # Look for a negative line number
    self.assertRaises(IndexError,tabfile.indexByLineNumber,-12)
    # Look for a generally non-existant line number
    self.assertRaises(IndexError,tabfile.indexByLineNumber,99)
def test_change_delimiter_for_write(self):
    """Write data out with different delimiter to input
    """
    tabfile = TabFile('test',self.fp,delimiter=',')
    # Write with an explicitly modified delimiter (tab)
    out = cStringIO.StringIO()
    tabfile.write(fp=out,delimiter='\t')
    self.assertEqual(out.getvalue(),self.data)
    out.close()
    # Write with the default delimiter (should revert to comma)
    out = cStringIO.StringIO()
    tabfile.write(fp=out)
    self.assertEqual(out.getvalue(),self.data.replace('\t',','))
    out.close()
def test_lookup(self):
    """Look up data from a TabFile

    Checks that 'lookup' returns all and only the lines whose
    value in the named column matches the supplied value.
    """
    tabfile = TabFile('test',self.fp,first_line_is_header=True)
    # Look for lines with 'chr1' in the chr column
    matching = tabfile.lookup('chr','chr1')
    self.assertEqual(len(matching),2)
    for m in matching:
        self.assertEqual(m['chr'],'chr1',"Lookup returned bad match: '%s'" % m)
    self.assertNotEqual(matching[0],matching[1])
    # Look for lines with 'chr2' in the chr column
    matching = tabfile.lookup('chr','chr2')
    self.assertEqual(len(matching),1)
    # Bug fix: the failure message previously interpolated 'm' (the
    # stale loop variable from the chr1 loop above) instead of the
    # line actually being checked
    self.assertEqual(matching[0]['chr'],'chr2',
                     "Lookup returned bad match: '%s'" % matching[0])
    # Look for lines with 'bananas' in the chr column
    self.assertEqual(len(tabfile.lookup('chr','bananas')),0)
def _load_data(self, fp):
    """Internal: populate with data from external file

    Parses IEM sample sheet content: '[...]'-delimited section
    headers switch the current section, and subsequent lines are
    stored according to that section ('Header', 'Reads',
    'Settings' or 'Data').

    Arguments
      fp: file-like object opened for reading which contains
        sample sheet data

    Raises IlluminaDataError if content appears before any
    section header, or if an unrecognised section is found.
    """
    section = None
    for lineno, line in enumerate(fp, start=1):
        line = line.rstrip()
        logging.debug(line)
        if not line:
            # Skip blank lines
            continue
        if line.startswith('['):
            # New section e.g. '[Header]'
            try:
                # Fix: use a separate variable for the position of
                # the closing bracket rather than shadowing the
                # loop index
                end = line.index(']')
                section = line[1:end]
                continue
            except ValueError:
                # Malformed section header (no closing ']'): report
                # it and skip, rather than falling through and
                # treating it as data for the previous section
                logging.error("Bad line (#%d): %s" % (lineno, line))
                continue
        if section == 'Header':
            # Header lines are comma-separated PARAM,VALUE lines
            self._set_param_value(line, self._header)
        elif section == 'Reads':
            # Read lines are one value per line
            value = line.rstrip(',')
            if value:
                self._reads.append(value)
        elif section == 'Settings':
            # Settings lines are comma-separated PARAM,VALUE lines
            self._set_param_value(line, self._settings)
        elif section == 'Data':
            # Store data in TabFile object
            if self._data is None:
                # Initialise TabFile using this first line
                # to set the header
                self._data = TabFile.TabFile(column_names=line.split(','),
                                             delimiter=',')
            else:
                self._data.append(tabdata=line)
        elif section is None:
            # Content before any section header
            raise IlluminaDataError("Not a valid IEM sample sheet?")
        else:
            raise IlluminaDataError(
                "Unrecognised section '%s': not a valid IEM sample sheet?"
                % section)
    # Clean up data items: remove surrounding whitespace
    if self._data is not None:
        for line in self._data:
            for item in self._data.header():
                try:
                    line[item] = line[item].strip()
                except AttributeError:
                    # Non-string values are left untouched
                    pass
def test_set_column_to_constant_value(self):
    """Set a column to a constant value using transformColumn
    """
    tabfile = TabFile('test',self.fp,first_line_is_header=True)
    original_header = ['chr','start','end','data']
    # Verify the starting columns and header
    self.assertEqual(tabfile.nColumns(),4)
    self.assertEqual(tabfile.header(),original_header)
    # Append a new (empty) strand column
    tabfile.appendColumn('strand')
    self.assertEqual(tabfile.nColumns(),5)
    self.assertEqual(tabfile.header(),original_header + ['strand'])
    # Overwrite every value in the new column with '+'
    tabfile.transformColumn('strand',lambda x: '+')
    for row in tabfile:
        self.assertEqual(row['strand'],'+')
def annotate_feature_data(gff_lookup,feature_data_file,out_file): """Annotate feature data with gene information Reads in 'feature data' from a tab-delimited input file with feature IDs in the first column; outputs these data with data about the parent gene appended to each line. Arguments: gff_lookup populated GFFAnnotationLookup instance feature_data_file input data file with feature IDs in first column out_file name of output file """ # Read the feature data into a TabFile print "Reading in data from %s" % feature_data_file feature_data = TabFile.TabFile(filen=feature_data_file, first_line_is_header=True) # Append columns for annotation print "Appending columns for annotation" for colname in ('exon_parent', 'feature_type_exon_parent', 'gene_ID', 'gene_name', 'chr', 'start', 'end', 'strand', 'gene_length', 'locus', 'description'): feature_data.appendColumn(colname) for line in feature_data: feature_ID = line[0] annotation = gff_lookup.getAnnotation(feature_ID) line['exon_parent'] = annotation.parent_feature_name line['feature_type_exon_parent'] = annotation.parent_feature_type line['gene_ID'] = annotation.parent_feature_parent line['gene_name'] = annotation.parent_gene_name line['chr'] = annotation.chr line['start'] = annotation.start line['end'] = annotation.end line['strand'] = annotation.strand line['gene_length'] = annotation.gene_length line['locus'] = annotation.gene_locus line['description'] = annotation.description # Output print "Writing output file %s" % out_file feature_data.write(out_file,include_header=True,no_hash=True)
def test_append_column(self):
    """Append new column to a Tabfile
    """
    tabfile = TabFile('test',self.fp,first_line_is_header=True)
    self.assertEqual(len(tabfile.header()),4)
    tabfile.appendColumn('new')
    # Header should have grown by one, with the new name at the end
    header = tabfile.header()
    self.assertEqual(len(header),5)
    self.assertEqual(header[4],'new')
    # Values in the new column default to the empty string
    self.assertEqual(tabfile[0]['new'],'')
def test_compute_and_overwrite_existing_column_integer_index(self):
    """Compute new values for an existing column referenced using integer index
    """
    tabfile = TabFile('test',self.fp,first_line_is_header=True)
    header = ['chr','start','end','data']
    # Check number of columns and header items
    self.assertEqual(tabfile.nColumns(),4)
    self.assertEqual(tabfile.header(),header)
    # Overwrite column 3 ('data') with end minus start
    tabfile.computeColumn(3,lambda line: line['end'] - line['start'])
    # Column count and header should be unchanged
    self.assertEqual(tabfile.nColumns(),4)
    self.assertEqual(tabfile.header(),header)
    expected = [233,323,4444]
    for i,value in enumerate(expected):
        self.assertEqual(tabfile[i]['data'],value)
def test_compute_midpoint(self):
    """Compute the midpoint of the start and end columns
    """
    tabfile = TabFile('test',self.fp,first_line_is_header=True)
    # Check number of columns and header items
    self.assertEqual(tabfile.nColumns(),4)
    self.assertEqual(tabfile.header(),['chr','start','end','data'])
    # Add a new 'midpoint' column computed from start and end
    tabfile.computeColumn('midpoint',lambda line: (line['end'] + line['start'])/2.0)
    self.assertEqual(tabfile.nColumns(),5)
    self.assertEqual(tabfile.header(),['chr','start','end','data','midpoint'])
    expected = [117.5,728.5,3456]
    for i,value in enumerate(expected):
        self.assertEqual(tabfile[i]['midpoint'],value)
def test_reorder_columns(self):
    """Reorder columns in a TabFile
    """
    tabfile = TabFile('test',self.fp,first_line_is_header=True)
    # Check number of columns and header items
    self.assertEqual(tabfile.nColumns(),4)
    self.assertEqual(tabfile.header(),['chr','start','end','data'])
    # Reorder so the 'data' column comes second
    new_order = ['chr','data','start','end']
    reordered = tabfile.reorderColumns(new_order)
    self.assertEqual(reordered.nColumns(),4)
    self.assertEqual(reordered.header(),new_order)
    expected_lines = ["chr1\t4.6\t1\t234",
                      "chr1\t5.7\t567\t890",
                      "chr2\t6.8\t1234\t5678"]
    for i,expected in enumerate(expected_lines):
        self.assertEqual(str(reordered[i]),expected)
def test_reorder_columns_empty_cells(self):
    """Reorder columns where some lines have empty cells at the start
    """
    tabfile = TabFile('test',self.fp,first_line_is_header=True)
    # Check number of columns and header items
    self.assertEqual(tabfile.nColumns(),4)
    self.assertEqual(tabfile.header(),['chr','start','end','data'])
    # Blank out the 'chr' value on the first and last lines
    tabfile[0]['chr'] = ''
    tabfile[2]['chr'] = ''
    # Reorder so the 'data' column comes second
    new_order = ['chr','data','start','end']
    reordered = tabfile.reorderColumns(new_order)
    self.assertEqual(reordered.nColumns(),4)
    self.assertEqual(reordered.header(),new_order)
    # Empty leading cells should be preserved in the output
    expected_lines = ["\t4.6\t1\t234",
                      "chr1\t5.7\t567\t890",
                      "\t6.8\t1234\t5678"]
    for i,expected in enumerate(expected_lines):
        self.assertEqual(str(reordered[i]),expected)
def annotate_htseq_count_data(gff_lookup,htseq_files,out_file): """Annotate count data from htseq-count output with gene information Reads in data from one or more htseq-count output files and combines into a single tab-delimited output file where the counts for each feature have been appended to data about the parent gene. Also creates an output 'stats' file which combines the summary data from the tail of each htseq-count file. Arguments: gff_lookup: populated GFFAnnotationLookup instance htseq_files: list of output files from htseq-count to use as input out_file: name of output file """ # Output files annotated_counts_out_file = out_file tables_out_file = \ os.path.join(os.path.dirname(annotated_counts_out_file), os.path.splitext(os.path.basename(annotated_counts_out_file))[0]+\ "_stats"+os.path.splitext(annotated_counts_out_file)[1]) # Process the HTSeq-count files print "Processing HTSeq-count files" htseq_data = {} for htseqfile in htseq_files: print "\t%s" % htseqfile htseq_data[htseqfile] = HTSeqCountFile(htseqfile) # Create a TabFile for output print "Building annotated count file for output" annotated_counts = TabFile.TabFile(column_names=['exon_parent', 'feature_type_exon_parent', 'gene_ID', 'gene_name', 'chr', 'start', 'end', 'strand', 'gene_length', 'locus', 'description']) for htseqfile in htseq_files: annotated_counts.appendColumn(htseqfile) # Combine feature counts and parent feature data for feature_ID in htseq_data[htseq_files[0]].feature_IDs(): # Get annotation data annotation = gff_lookup.getAnnotation(feature_ID) # Build the data line data = [annotation.parent_feature_name, annotation.parent_feature_type, annotation.parent_feature_parent, annotation.parent_gene_name, annotation.chr, annotation.start, annotation.end, annotation.strand, annotation.gene_length, annotation.gene_locus, annotation.description] # Add the counts from each file for htseqfile in htseq_files: data.append(htseq_data[htseqfile].count(feature_ID)) # Add to the tabfile 
annotated_counts.append(data=data) # Write the file print "Writing output file %s" % annotated_counts_out_file annotated_counts.write(annotated_counts_out_file,include_header=True,no_hash=True) # Make second file for the trailing table data print "Building trailing tables data file for output" table_counts = TabFile.TabFile(column_names=['count']) for htseqfile in htseq_files: table_counts.appendColumn(htseqfile) for name in htseq_data[htseq_files[0]].table(): # Build the data line data = [name] for htseqfile in htseq_files: data.append(htseq_data[htseqfile].table()[name]) table_counts.append(data=data) print "Writing output file %s" % tables_out_file table_counts.write(tables_out_file,include_header=True,no_hash=True)