def __init__(self, **keywords):
    """
    Resolve keyword options, then either load existing data or create
    a new dataset skeleton.

    Options are parsed by ProcessOptions.process_function_arguments against
    self.option_default_dict (presumably this is what sets newGroup, h5Group
    and dataMatrixDtype on the instance -- confirm against that helper).

    Behaviour:
        - newGroup false: self._readInData() is called.
        - newGroup true: self._createDatasetSkeletonForOneGroup() is called
          with self.h5Group and self.dataMatrixDtype.

    dataMatrixDtype may be a compound numpy dtype. References:
        http://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html
        http://docs.scipy.org/doc/numpy/reference/generated/numpy.dtype.html

    Examples of compound dtypes:
        A record with a 16-character string (field "name") and a sub-array
        of two 64-bit floats (field "grades"):
            dt = numpy.dtype([('name', numpy.str_, 16),
                              ('grades', numpy.float64, (2,))])

        Mixing simple type codes with a variable-length string type:
            my_dtype = numpy.dtype([('field1', 'i'), ('field2', 'f'),
                                    ('field3', varLenStrType)])

        Array-protocol type strings (the numbers count bytes, not bits):
            numpy.dtype([('a','f8'),('b','S10')])
            gives dtype([('a', '<f8'), ('b', '|S10')])

        Tuples. An integer field of shape 3 and a flexible void field of
        size 10:
            numpy.dtype([('hello',(numpy.int64,3)),('world',numpy.void,10)])

        Dictionaries. Two fields named 'gender' and 'age':
            numpy.dtype({'names':['gender','age'],
                         'formats':['S1',numpy.uint8]})

        Offsets in bytes, here 0 and 25:
            numpy.dtype({'surname':('S25',0),'age':(numpy.uint8,25)})
    """
    self.ad = ProcessOptions.process_function_arguments(
        keywords,
        self.option_default_dict,
        error_doc=self.__doc__,
        class_to_have_attr=self)

    # Fixed names for the data-matrix, row-ID and column-ID datasets.
    self.dataMatrixDSName = "dataMatrix"
    self.rowIDListDSName = "rowIDList"
    self.colIDListDSName = "colIDList"

    if self.newGroup:
        # Brand-new group: lay out empty datasets of the requested dtype.
        self._createDatasetSkeletonForOneGroup(
            h5Group=self.h5Group, dtype=self.dataMatrixDtype)
    else:
        # Existing group: pull in what is already stored.
        self._readInData()

    # True until the first write happens (the first write sets the whole matrix).
    self.newWrite = True
    self.rowIndexCursor = 0
def __init__(self, path=None, **keywords):
    """
    Set up a delimited text file for reading or writing.

    Arguments:
        path: fallback file path, used only when the "path" option is
            still unset after option processing.
        keywords: options handed to
            ProcessOptions.process_function_arguments (presumably the source
            of self.path, self.file_handle, self.mode and self.delimiter --
            confirm against self.option_default_dict).

    Reading (mode == 'r'):
        An unset delimiter is detected with figureOutDelimiter(). Tab or
        comma delimiters are parsed through csv.reader; any other delimiter
        leaves self.csvFile as the raw file handle and isRealCSV False.

    Any other mode is treated as writing:
        The delimiter defaults to tab and a csv.writer is always used.
    """
    self.ad = ProcessOptions.process_function_arguments(
        keywords,
        self.option_default_dict,
        error_doc=self.__doc__,
        class_to_have_attr=self)

    # The path given through keyword options wins over the positional one.
    if not self.path:
        self.path = path

    # Open the file only when there is a path and no handle was supplied.
    if self.path:
        if self.file_handle is None:
            self.file_handle = utils.openGzipFile(self.path, mode=self.mode)

    # 2013.05.03 alias of path, for easy access
    self.filename = self.path

    self.csvFile = None
    self.isRealCSV = False
    if self.mode == 'r':
        # reading mode
        if self.delimiter is None:
            self.delimiter = figureOutDelimiter(self.file_handle)
        csv_compatible = self.delimiter in ('\t', ',')
        self.csvFile = (
            csv.reader(self.file_handle, delimiter=self.delimiter)
            if csv_compatible else self.file_handle)
        self.isRealCSV = csv_compatible
    else:
        # writing mode: output always goes through a csv.writer
        self.delimiter = self.delimiter or '\t'
        self.csvFile = csv.writer(self.file_handle, delimiter=self.delimiter)
        self.isRealCSV = True

    self.col_name2index = None
    # the row currently being read
    self._row = None
    # default header pattern: a line beginning with a letter
    self.headerPattern = re.compile(r'^[a-zA-Z]')
    # default comment pattern: a line beginning with '#'
    self.commentPattern = re.compile(r'^#')
    self.comment_row_list = []
def __init__(self, inputFname=None, **keywords):
    """
    Open an HDF5 file and either read its content or create a first table.

    Arguments:
        inputFname: fallback file name, used only when the "inputFname"
            option is still unset after option processing.
        keywords: options handed to
            ProcessOptions.process_function_arguments (presumably the source
            of self.mode, self.tableName, self.dtype and self.rowDefinition --
            confirm against self.option_default_dict).

    The file is opened with h5py.File(self.inputFname, self.mode). Then:
        - mode 'r': self._readInData() is called.
        - mode 'w': self.createNewTable() is called with the configured
          table name, dtype and row definition.
        - any other mode: nothing further is read or created here.
    """
    self.ad = ProcessOptions.process_function_arguments(
        keywords,
        self.option_default_dict,
        error_doc=self.__doc__,
        class_to_have_attr=self)

    # The file name given through keyword options wins over the positional one.
    self.inputFname = self.inputFname or inputFname

    self.header = None
    # same as header
    self.combinedColIDList = None
    self.combinedColID2ColIndex = None

    self.hdf5File = h5py.File(self.inputFname, self.mode)
    self.tableObjectList = []
    self.tablePath2Index = {}

    open_mode = self.mode
    if open_mode == 'w':
        self.createNewTable(
            tableName=self.tableName,
            dtype=self.dtype,
            rowDefinition=self.rowDefinition)
    elif open_mode == 'r':
        self._readInData()

    # 2012.11.16 cursor used for iteration
    self.rowIndexCursor = 0
def __init__(self, **keywords):
    """
    Reset all parsing state, then initialise the input and output sides.

    Options are parsed by ProcessOptions.process_function_arguments against
    self.option_default_dict (presumably the source of self.inputFname and
    self.outputFname -- confirm against that helper).

    Order matters: every container below is created before
    self._initializeInput() runs, and the output handles are reset before
    self._initializeOutput() runs.

    History:
        2012.9.5 set default minDepth=0
        2011-9-27
    """
    self.ad = ProcessOptions.process_function_arguments(
        keywords,
        self.option_default_dict,
        error_doc=self.__doc__,
        class_to_have_attr=self)

    # header: the list of column headers (the header line starting with #CHROM).
    # headerWithoutHash: same as header, but with "CHROM" instead of "#CHROM".
    self.header = self.headerWithoutHash = None

    # Samples. The index is the sample's column in genotype_call_matrix.
    self.sample_id_ls = []
    self.sample_id2index = {}

    # Loci and the genotype calls.
    self.locus_id_ls = []
    self.locus_id2row_index = {}
    self.locus_id2data = {}
    self.genotype_call_matrix = []

    # Column bookkeeping. These indices refer to columns of the input file,
    # not columns of the matrix.
    self.col_name2index = {}
    self.col_index_individual_name_ls = None
    self.individual_name2col_index = {}

    # Anything before the "#CHROM" line; each entry is a raw line, '\n' included.
    self.metaInfoLs = []
    # Sample column headers (from sampleStartingColumn onwards).
    self.sampleIDHeader = []

    # Input side.
    self.inf = self.reader = None
    self._initializeInput(self.inputFname)

    # Output side.
    self.outf = self.writer = None
    self._initializeOutput(self.outputFname)
raise else: if key not in dc: dc[key] = [] if len(valueColumnIndexList)>1: value = valueList else: value = valueList[0] if keyUniqueInInputFile: dc[key] = value else: dc[key].append(value) sys.stderr.write("%s unique pairs from %s rows.\n"%(len(dc), counter)) return dc def run(self): """ """ if self.debug: import pdb pdb.set_trace() if __name__ == '__main__': main_class = MatrixFile po = ProcessOptions(sys.argv, main_class.option_default_dict, error_doc=main_class.__doc__) instance = main_class(**po.long_option2value) instance.run()