def parse_body(self, fd):
    """
    Parse the body of the stream, i.e. the data part.

    Leading blank and comment-only lines are skipped.  The field
    delimiter (a regular expression) is taken from ``self.delimiter`` or
    ``self.custom_delimiter``; if neither is set it is guessed from the
    first data line: a comma if the line contains one, otherwise a run
    of whitespace.  Every following line is stripped of its trailing
    ``#`` comment, split into fields, converted with the table's
    converters and stored row by row.

    Lines that contain no fields, whose number of fields differs from
    the number of converters, or whose fields cannot be converted are
    skipped.

    ``fd`` must support ``readline``, ``tell`` and ``seek``.

    Returns the table that was filled: ``self.table`` if it is set,
    otherwise a newly created ``Table``.
    NOTE(review): the table is allocated and extended in chunks; rows
    beyond the last one written are presumably left for the caller to
    trim -- confirm against the Table API.
    """
    # Compiled regular expressions (cr_):
    #   cr_trim     removes leading whitespace and a trailing comment
    #   cr_comment  identifies a blank or comment-only line
    #   cr_split    splits the remaining row into its fields; created
    #               below, after the delimiter has been determined
    cr_trim = re.compile(r'^\s*(.*?)(#.*)?$')
    cr_comment = re.compile(r'^\s*(#.*)?$')

    # Skip leading blank/comment lines and leave the stream positioned
    # at the start of the first line that is not comment-only.
    line = '#'
    rewind = fd.tell()
    while line and cr_comment.match(line) is not None:
        rewind = fd.tell()
        line = fd.readline()
    fd.seek(rewind)

    # Determine the delimiter.
    delimiter = self.delimiter or self.custom_delimiter
    if not delimiter:
        # Peek at the first non-comment line without consuming it.
        rewind = fd.tell()
        line = fd.readline()
        fd.seek(rewind)
        # The whitespace pattern must not match the empty string:
        # since Python 3.7 re.split() also splits on empty matches,
        # which would tear "12" apart into "1" and "2".
        delimiter = ',' if ',' in line else r'\s+'
        logger.debug("determined delimiter: %s", delimiter)
    cr_split = re.compile(delimiter)

    # If a table is given, the column count determination and the
    # creation of a new table are skipped.
    if self.table is None:
        ncols = self.ncols
        if ncols is None:
            # Count the fields of the first data line (peeked, not
            # consumed) after removing its trailing comment.
            rewind = fd.tell()
            line = fd.readline()
            fd.seek(rewind)
            trimmed = cr_trim.match(line)
            if trimmed is not None:
                line = trimmed.group(1)
            fields = [field for field in cr_split.split(line) if field]
            logger.debug("MATCHES = %s", fields)
            ncols = len(fields)
        tbl = Table(nrows=self.growth_offset, ncols=ncols,
                    typecodes=self.typecodes)
    else:
        tbl = self.table

    logger.debug("# of columns to be expected: %d", tbl.ncols)

    # Make sure the table has at least one row to write into.
    if tbl.nrows == 0:
        tbl.resize(1)

    cursor = tbl.row(0)
    converters = list(tbl.converters)

    # Assign column information from the attributes 'keys' & 'labels'.
    keys = self.keys
    labels = self.labels
    if keys:
        for n, column in enumerate(tbl.get_columns()):
            column.key = keys[n]
    if labels:
        for n, column in enumerate(tbl.get_columns()):
            column.label = labels[n]

    # Designations: "AB|CD" means "start with AB, then repeat CD";
    # without a '|' the whole string is the repeat pattern.
    designations = self.designations
    if '|' in designations:
        designations, repeat_pattern = designations.split('|')
    else:
        repeat_pattern = designations
    if len(designations) < tbl.ncols and not repeat_pattern:
        # An empty repeat pattern could never fill the remaining
        # columns; fail loudly instead of looping forever.
        raise ValueError("designations %r cannot cover %d columns"
                         % (self.designations, tbl.ncols))
    while len(designations) < tbl.ncols:
        designations += repeat_pattern
    logger.debug("Column designations: %s", designations)
    for n, column in enumerate(tbl.get_columns()):
        column.designation = designations[n]

    #
    # Read in the file line by line.  Iterating over readline() until it
    # returns '' guarantees that every path, including each 'continue',
    # advances to the next line.
    #
    logger.debug("Start reading ASCII file.")
    skipcount = 0
    for row in iter(fd.readline, ''):
        # Split off comments using a regular expression.
        # TODO: Be careful when we have string fields, then a '#'
        # might not be what it looks like -- it might be contained
        # in quotes!
        trimmed = cr_trim.match(row)
        if trimmed is None:
            logger.error("Skipped row: %s", row)
            continue

        fields = [field for field in cr_split.split(trimmed.group(1))
                  if field]
        if not fields:
            skipcount += 1
            if skipcount > 100:
                # TODO: ask the user whether to continue with this file
                # after more than 100 skipped lines.
                skipcount = 0
            continue

        # A row with the wrong number of fields cannot be stored.
        # (Python 2's map() padded the shorter sequence with None, which
        # usually ended in a TypeError; the check is now explicit.)
        if len(fields) != len(converters):
            continue
        try:
            values = [convert(field)
                      for field, convert in zip(fields, converters)]
        except (ValueError, TypeError):
            # Unconvertible field: skip the row silently.
            continue

        cursor.set(values)

        # Move to the next row.  If this was the last row, the table is
        # extended first.
        # NOTE(review): the extension size is ncols + growth_offset as
        # in the original code; nrows may have been intended -- confirm.
        try:
            cursor = cursor.next()
        except StopIteration:
            tbl.extend(tbl.ncols + self.growth_offset)
            cursor = cursor.next()

    return tbl
def read_table_from_stream(self, fd):
    """
    Read numeric columns from the stream `fd` into a table.

    After skipping ``self.header_lines`` lines and any directly
    following lines that start with '#', the field delimiter (a regular
    expression) is taken from ``self.delimiter`` or
    ``self.custom_delimiter``; if neither is set it is guessed from the
    first data line: a comma if the line contains one, otherwise a run
    of whitespace.  One regular expression describing a complete data
    line is then built from the typecodes, and every line that matches
    it is converted with the table's converters and stored.  Lines that
    do not match or cannot be converted are skipped.

    ``fd`` must support ``readline``, ``tell`` and ``seek``.

    Returns the table that was filled: ``self.table`` if it is set,
    otherwise a newly created ``Table``.
    NOTE(review): the table is allocated and extended in chunks; rows
    beyond the last one written are presumably left for the caller to
    trim -- confirm against the Table API.
    """
    typecodes = self.typecodes

    # Skip header lines if requested.
    # TODO: use given expression and re
    for _ in range(self.header_lines or 0):
        fd.readline()

    # Skip comment lines and leave the stream positioned at the start
    # of the first line that does not begin with '#'.
    line = '#'
    rewind = fd.tell()
    while line.startswith('#'):
        rewind = fd.tell()
        line = fd.readline()
    fd.seek(rewind)

    # Determine the delimiter.
    logger.debug("delimiter=%s header_lines=%s",
                 self.delimiter, self.header_lines)
    delimiter = self.delimiter or self.custom_delimiter
    if not delimiter:
        # Peek at the first non-comment line without consuming it.
        rewind = fd.tell()
        line = fd.readline()
        fd.seek(rewind)
        # Require at least one whitespace character between fields;
        # a pattern that may match nothing would let "1234" be read
        # as the two columns "123" and "4".
        delimiter = ',' if ',' in line else r'\s+'
        logger.debug("determined delimiter: %s", delimiter)

    # If a table is given, the column count determination and the
    # creation of a new table are skipped.
    if self.table is None:
        ncols = self.ncols
        if ncols is None:
            # Count the fields of the first data line (peeked, not
            # consumed).  The delimiter is a regular expression, so it
            # has to be applied with re.split(), not str.split().
            rewind = fd.tell()
            line = fd.readline()
            fd.seek(rewind)
            content = line.split('#')[0].strip()
            ncols = len([field for field in re.split(delimiter, content)
                         if field])
        logger.debug("# of columns to be expected: %d", ncols)
        tbl = Table(nrows=self.growth_offset, ncols=ncols,
                    typecodes=typecodes)
    else:
        tbl = self.table
        # The supplied table dictates how many columns a line must have.
        ncols = tbl.ncols

    # Make sure the table has at least one row to write into.
    if tbl.nrows == 0:
        tbl.resize(1)

    cursor = tbl.row(0)
    converters = list(tbl.converters)

    # Assign column information from the attributes 'keys' & 'labels'.
    keys = self.keys
    labels = self.labels
    if keys:
        for n, column in enumerate(tbl.get_columns()):
            column.key = keys[n]
    if labels:
        for n, column in enumerate(tbl.get_columns()):
            column.label = labels[n]

    # Use the given designations or, if none are given, alternate the
    # column designations X/Y.
    designations = self.designations
    if designations is None:
        designations = [('X', 'Y')[i % 2] for i in range(tbl.ncols)]
    for n, column in enumerate(tbl.get_columns()):
        column.designation = designations[n]

    #
    # Create the regular expression used to match a whole data line:
    # optional leading whitespace, one group per column separated by
    # the delimiter, optional trailing whitespace and '#' comment.
    #
    number = r'([-+]?[\d.]+)'
    tcmap = {'d': number, 'f': number}
    if len(typecodes) > 1:
        column_patterns = [tcmap[tc] for tc in typecodes]
    else:
        # A single typecode applies to every column.
        column_patterns = [tcmap[typecodes]] * ncols
    regexp = r'\s*' + delimiter.join(column_patterns) + r'\s*(?:\#+.*)?$'
    cregexp = re.compile(regexp)
    logger.info("Regular Expression is: %s", regexp)

    #
    # Read in the file line by line.  Iterating over readline() until it
    # returns '' guarantees that every path, including each 'continue',
    # advances to the next line.
    #
    skipcount = 0
    for row in iter(fd.readline, ''):
        matched = cregexp.match(row)
        if matched is None:
            skipcount += 1
            if skipcount > 100:
                Signals.emit("ask-for-confirmation", "Warning: More than 100 lines skipped recently. Should we continue with this file?")
                skipcount = 0
            continue

        groups = matched.groups()
        # A row with the wrong number of fields cannot be stored.
        # (Python 2's map() padded the shorter sequence with None, which
        # usually ended in a TypeError; the check is now explicit.)
        if len(groups) != len(converters):
            continue
        try:
            values = [convert(field)
                      for field, convert in zip(groups, converters)]
        except (ValueError, TypeError):
            # Unconvertible field: skip the row silently.
            continue

        cursor.set(values)

        # Move to the next row.  If this was the last row, the table is
        # extended first.
        # NOTE(review): the extension size is ncols + growth_offset as
        # in the original code; nrows may have been intended -- confirm.
        try:
            cursor = cursor.next()
        except StopIteration:
            tbl.extend(tbl.ncols + self.growth_offset)
            cursor = cursor.next()

    return tbl
def read_table_from_stream(self, fd):
    """
    Read delimited columns from the stream `fd` into a table.

    After skipping ``self.header_lines`` lines and any directly
    following lines that start with '#', the field delimiter (a regular
    expression) is taken from ``self.delimiter`` or
    ``self.custom_delimiter``; if neither is set it is guessed from the
    first data line: a comma if the line contains one, otherwise a run
    of whitespace.  Every following line is cut at the first '#', split
    into fields, converted with the table's converters and stored row
    by row.

    Lines that contain no fields, whose number of fields differs from
    the number of converters, or whose fields cannot be converted are
    skipped.

    ``fd`` must support ``readline``, ``tell`` and ``seek``.

    Returns the table that was filled: ``self.table`` if it is set,
    otherwise a newly created ``Table``.
    NOTE(review): the table is allocated and extended in chunks; rows
    beyond the last one written are presumably left for the caller to
    trim -- confirm against the Table API.
    """
    # Skip header lines if requested.
    # TODO: use given expression and re
    for _ in range(self.header_lines or 0):
        fd.readline()

    # Skip comment lines and leave the stream positioned at the start
    # of the first line that does not begin with '#'.
    line = '#'
    rewind = fd.tell()
    while line.startswith('#'):
        rewind = fd.tell()
        line = fd.readline()
    fd.seek(rewind)

    # Determine the delimiter.
    delimiter = self.delimiter or self.custom_delimiter
    if not delimiter:
        # Peek at the first non-comment line without consuming it.
        rewind = fd.tell()
        line = fd.readline()
        fd.seek(rewind)
        # The whitespace pattern must not match the empty string:
        # since Python 3.7 re.split() also splits on empty matches,
        # which would tear "12" apart into "1" and "2".
        delimiter = ',' if ',' in line else r'\s+'
        logger.debug("determined delimiter: %s", delimiter)

    # Regular expression used to split a line into its fields.
    cregexp = re.compile(delimiter)

    # If a table is given, the column count determination and the
    # creation of a new table are skipped.
    if self.table is None:
        ncols = self.ncols
        if ncols is None:
            # Count the fields of the first data line (peeked, not
            # consumed) after splitting off the comment.
            # TODO: This will not work for text entries "Example #Test"
            rewind = fd.tell()
            line = fd.readline()
            fd.seek(rewind)
            fields = [field
                      for field in cregexp.split(line.split('#')[0])
                      if field]
            logger.debug("MATCHES = %s", fields)
            ncols = len(fields)
        tbl = Table(nrows=self.growth_offset, ncols=ncols,
                    typecodes=self.typecodes)
    else:
        tbl = self.table

    # Report the table's own column count: self.ncols may be None when
    # a table was supplied, which would break the "%d" formatting.
    logger.debug("# of columns to be expected: %d", tbl.ncols)

    # Make sure the table has at least one row to write into.
    if tbl.nrows == 0:
        tbl.resize(1)

    cursor = tbl.row(0)
    converters = list(tbl.converters)

    # Assign column information from the attributes 'keys' & 'labels'.
    keys = self.keys
    labels = self.labels
    if keys:
        for n, column in enumerate(tbl.get_columns()):
            column.key = keys[n]
    if labels:
        for n, column in enumerate(tbl.get_columns()):
            column.label = labels[n]

    # Use the given designations or, if none are given, alternate the
    # column designations X/Y.
    designations = self.designations
    if designations is None:
        designations = [('X', 'Y')[i % 2] for i in range(tbl.ncols)]
    for n, column in enumerate(tbl.get_columns()):
        column.designation = designations[n]

    #
    # Read in the file line by line.  Iterating over readline() until it
    # returns '' guarantees that every path, including each 'continue',
    # advances to the next line.
    #
    logger.debug("Start reading ASCII file.")
    skipcount = 0
    for row in iter(fd.readline, ''):
        # Split off comments.
        # TODO: This will not work for text entries "Example #Test"
        content = row.split('#')[0]
        fields = [field for field in cregexp.split(content) if field]
        logger.debug("MATCHES = %s", fields)

        if not fields:
            skipcount += 1
            if skipcount > 100:
                Signals.emit("ask-for-confirmation", "Warning: More than 100 lines skipped recently. Should we continue with this file?")
                skipcount = 0
            continue

        # A row with the wrong number of fields cannot be stored.
        # (Python 2's map() padded the shorter sequence with None, which
        # usually ended in a TypeError; the check is now explicit.)
        if len(fields) != len(converters):
            continue
        try:
            values = [convert(field)
                      for field, convert in zip(fields, converters)]
        except (ValueError, TypeError):
            # Unconvertible field: skip the row silently.
            continue

        cursor.set(values)

        # Move to the next row.  If this was the last row, the table is
        # extended first.
        # NOTE(review): the extension size is ncols + growth_offset as
        # in the original code; nrows may have been intended -- confirm.
        try:
            cursor = cursor.next()
        except StopIteration:
            tbl.extend(tbl.ncols + self.growth_offset)
            cursor = cursor.next()

    return tbl