示例#1
0
    def __init__(self, **keywords):
        """
        Process the keyword options, then either load the existing content
        or lay out an empty skeleton, depending on self.newGroup.

        The keywords are passed to ProcessOptions.process_function_arguments()
        along with self.option_default_dict; its return value is stored in
        self.ad.

        dataMatrixDtype may be a compound type, see:
            http://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html
            http://docs.scipy.org/doc/numpy/reference/generated/numpy.dtype.html

        A record type holding a 16-character string (field "name") and a
        sub-array of two 64-bit floats (field "grades"):
            dt = numpy.dtype([('name', numpy.str_, 16), ('grades', numpy.float64, (2,))])

        A record type mixing fixed-size fields with a variable-length string:
            my_dtype = numpy.dtype([('field1', 'i'), ('field2', 'f'), ('field3', varLenStrType)])

        Array-protocol type strings (the numbers count bytes, not bits):
        >>> numpy.dtype([('a','f8'),('b','S10')])
        dtype([('a', '<f8'), ('b', '|S10')])

        Tuples. int is a fixed type and 3 is the field's shape; void is a
        flexible type, here of size 10 (numpy.int is an alias of the builtin
        int that newer NumPy releases removed; use int there):
            numpy.dtype([('hello',(numpy.int,3)),('world',numpy.void,10)])

        Dictionaries. Two fields named 'gender' and 'age':
            numpy.dtype({'names':['gender','age'], 'formats':['S1',numpy.uint8]})

        Offsets in bytes, here 0 and 25:
            numpy.dtype({'surname':('S25',0),'age':(numpy.uint8,25)})
        """
        self.ad = ProcessOptions.process_function_arguments(
            keywords, self.option_default_dict,
            error_doc=self.__doc__, class_to_have_attr=self)

        # Fixed names of the three stored pieces; assigned before the
        # load/create step below (presumably those helpers rely on them).
        self.dataMatrixDSName = "dataMatrix"
        self.rowIDListDSName = "rowIDList"
        self.colIDListDSName = "colIDList"

        if self.newGroup:
            # brand-new group: only create the empty structure
            self._createDatasetSkeletonForOneGroup(
                h5Group=self.h5Group, dtype=self.dataMatrixDtype)
        else:
            # existing group: pull its content in
            self._readInData()

        # Marks that nothing has been written yet
        # (the first write sets the whole matrix).
        self.newWrite = True
        self.rowIndexCursor = 0
示例#2
0
 def __init__(self, path=None, **keywords):
     """
     Set up a reader or writer over a delimited text file.

     path is only used when the processed options left self.path empty.
     When a path is known and no file handle was supplied, the file is
     opened through utils.openGzipFile() using self.mode.

     mode 'r': the delimiter is guessed via figureOutDelimiter() when it
         is None. A tab or comma delimiter gets a csv.reader; any other
         delimiter leaves self.csvFile as the raw file handle and
         self.isRealCSV False.
     any other mode: treated as writing; the delimiter defaults to a tab
         and a csv.writer is always used.
     """
     self.ad = ProcessOptions.process_function_arguments(
         keywords, self.option_default_dict,
         error_doc=self.__doc__, class_to_have_attr=self)
     if not self.path:
         self.path = path

     must_open = self.path and self.file_handle is None
     if must_open:
         self.file_handle = utils.openGzipFile(self.path, mode=self.mode)

     # 2013.05.03 alias kept for easy access
     self.filename = self.path
     self.csvFile = None
     self.isRealCSV = False

     if self.mode == 'r':
         # reading
         if self.delimiter is None:
             self.delimiter = figureOutDelimiter(self.file_handle)

         handled_by_csv = self.delimiter in ('\t', ',')
         if handled_by_csv:
             self.csvFile = csv.reader(self.file_handle,
                                       delimiter=self.delimiter)
         else:
             # fall back to iterating the raw handle
             self.csvFile = self.file_handle
         self.isRealCSV = handled_by_csv
     else:
         # writing
         if not self.delimiter:
             self.delimiter = '\t'
         self.csvFile = csv.writer(self.file_handle,
                                   delimiter=self.delimiter)
         self.isRealCSV = True

     self.col_name2index = None
     # the row currently being read
     self._row = None
     # default header pattern: a line that begins with a letter
     self.headerPattern = re.compile(r'^[a-zA-Z]')
     # default comment pattern: a line that begins with '#'
     self.commentPattern = re.compile(r'^#')
     self.comment_row_list = []
示例#3
0
    def __init__(self, inputFname=None, **keywords):
        """
        Open an HDF5 file and prepare its table bookkeeping.

        inputFname is only used when the processed options left
        self.inputFname empty. The file is opened with h5py.File() in
        self.mode. In mode 'r' the content is loaded via _readInData();
        in mode 'w' one table is created via createNewTable(). Any other
        mode opens the file without doing either.
        """
        self.ad = ProcessOptions.process_function_arguments(
            keywords, self.option_default_dict,
            error_doc=self.__doc__, class_to_have_attr=self)
        if not self.inputFname:
            self.inputFname = inputFname

        # combinedColIDList is the same thing as header
        self.header = self.combinedColIDList = self.combinedColID2ColIndex = None

        self.hdf5File = h5py.File(self.inputFname, self.mode)
        self.tableObjectList, self.tablePath2Index = [], {}

        mode = self.mode
        if mode == 'r':
            self._readInData()
        elif mode == 'w':
            self.createNewTable(
                tableName=self.tableName,
                dtype=self.dtype,
                rowDefinition=self.rowDefinition)

        # 2012.11.16 cursor used for iteration
        self.rowIndexCursor = 0
示例#4
0
    def __init__(self, **keywords):
        """
        Process the keyword options, reset all per-file state, then set up
        input from self.inputFname and output to self.outputFname.

        History:
            2012.9.5 set default minDepth=0
            2011-9-27
        """
        self.ad = ProcessOptions.process_function_arguments(
            keywords, self.option_default_dict,
            error_doc=self.__doc__, class_to_have_attr=self)

        # --- header information ---
        # header: list of column headers (the header line starting with #CHROM)
        # headerWithoutHash: same as header, but "CHROM" instead of "#CHROM"
        self.header = None
        self.headerWithoutHash = None
        # every raw line (including '\n') that comes before the "#CHROM" line
        self.metaInfoLs = []
        # sample column headers (from sampleStartingColumn)
        self.sampleIDHeader = []

        # --- samples ---
        self.sample_id_ls = []
        # sample id -> index of its column in genotype_call_matrix
        self.sample_id2index = {}

        # --- loci and genotype calls ---
        self.locus_id_ls = []
        self.locus_id2row_index = {}
        self.locus_id2data = {}
        self.genotype_call_matrix = []

        # --- columns of the input file ---
        # column name -> column index in the file
        self.col_name2index = {}
        self.col_index_individual_name_ls = None
        # individual name -> column in the input file (not the matrix column)
        self.individual_name2col_index = {}

        # input side
        self.inf = None
        self.reader = None
        self._initializeInput(self.inputFname)

        # output side
        self.outf = None
        self.writer = None
        self._initializeOutput(self.outputFname)
示例#5
0
                    raise
            else:
                if key not in dc:
                    dc[key] = []
            
            if len(valueColumnIndexList)>1:
                value = valueList
            else:
                value = valueList[0]
            if keyUniqueInInputFile:
                dc[key] = value
            else:
                dc[key].append(value)
        sys.stderr.write("%s unique pairs from %s rows.\n"%(len(dc), counter))
        return dc
    
    
    def run(self):
        """
        Entry point of the program.

        Does nothing except start the pdb debugger when self.debug is set.
        """
        if not self.debug:
            return
        import pdb
        pdb.set_trace()
        

if __name__ == '__main__':
    # Command-line entry: parse sys.argv against the class's option table,
    # build the instance from the parsed long options, then run it.
    main_class = MatrixFile
    po = ProcessOptions(
        sys.argv,
        main_class.option_default_dict,
        error_doc=main_class.__doc__,
    )
    instance = main_class(**po.long_option2value)
    instance.run()