def fixup(self):
    """
    Trim off extra cells to the right of # symbols and make sure every
    row is of the same length.

    Mutates self.rows in place and returns None.

    Raises FileError if a row starts with '#' (unexpected position) or if
    the blueprint turns out to be empty/zero-width.
    """
    maxwidth = 0

    # Find max width and trim off unwanted trailing cells.
    for i, cells in enumerate(self.rows):
        try:
            endat = cells.index('#')  # find first '#' (row ender) in row
        except ValueError:
            # No '#' in this row: trim off empty cells at end of row.
            # BUGFIX: was a bare 'except:', which also swallowed the
            # FileError raised below for a leading '#'.
            while cells and cells[-1] == '':
                cells = cells[:-1]
            endat = len(cells)
        else:
            if endat == 0:
                raise FileError("Blueprint has '#' in unexpected cell.")
            # Trim off stuff from the found '#' to the right.
            cells = cells[0:endat]
        self.rows[i] = cells
        # Update maxwidth with this row's effective width.
        maxwidth = max(maxwidth, endat)

    if maxwidth == 0:
        raise FileError("Blueprint appears to be empty or zero-width.")

    # Conform all rows to the same width by padding with empty cells.
    for row in self.rows:
        if len(row) < maxwidth:
            row.extend(['' for x in range(maxwidth - len(row))])
    return
def read_sheet(filename, sheetid):
    """
    Read the specified sheet from the specified file.

    CSV files are considered a single sheet; .xls/.xlsx files dispatch to
    the xls/xlsx helper modules using sheetid.

    Returns a 2d list of cell values.

    Raises FileError if the file is missing or has an unsupported
    extension.
    """
    # Verify file exists.
    if not os.path.isfile(filename):
        raise FileError('File not found "%s"' % filename)

    # Read contents of the file into lines, dispatching on extension.
    ext = os.path.splitext(filename)[1].lower()
    if ext == '.csv':
        lines = read_csv_file(filename)
    elif ext == '.xls':
        lines = xls.read_xls_file(filename, sheetid)
    elif ext == '.xlsx':
        lines = xlsx.read_xlsx_file(filename, sheetid)
    else:
        raise FileError("Invalid file type '%s' (csv, xls, xlsx accepted)" \
            % ext)

    # If there's a line whose first cell is exactly '#', treat it as the
    # end of the blueprint and trim off everything from there onwards.
    for i, line in enumerate(lines):
        if line and line[0] == '#':
            lines = lines[0:i]
            # BUGFIX: stop here; the original kept iterating and
            # re-slicing the already-truncated list with stale indices.
            break
    return lines
def read_xlsx_file(filename, sheetid):
    """
    Read contents of specified sheet in Excel 2007 (.xlsx) workbook file.
    .xlsx files are actually zip files containing xml files.

    sheetid is 0-based (or None for the first sheet); internally xlsx
    worksheets are numbered from 1.

    Returns a 2d list of cell values.

    Raises FileError if the worksheet or shared-strings xml can't be read.
    """
    if sheetid is None:
        sheetid = 1
    else:
        sheetid += 1  # sheets are numbered starting from 1 in xlsx files

    # Get cell data from specified worksheet.
    try:
        zf = zipfile.ZipFile(filename)
        sheetdata = zf.read('xl/worksheets/sheet%s.xml' % sheetid)
        xml = xml2obj(sheetdata)
        rows = xml.sheetData.row
    except Exception:
        # Narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit
        # still propagate; report the caller's 0-based worksheet id.
        raise FileError("Could not read xlsx file %s, worksheet id %s" % (
            filename, sheetid - 1))

    # Get shared strings xml. Cell values are given as ordinal index
    # references into sharedStrings.xml:ssi.si elements, whose
    # string-value is found in the node's .t element.
    try:
        stringdata = zf.read('xl/sharedStrings.xml')
        xml = xml2obj(stringdata)
        strings = xml.si
    except Exception:
        raise FileError("Could not parse sharedStrings.xml of xlsx file")

    # Map strings to row values and return result.
    return extract_xlsx_lines(rows, strings)
def load_aliases(filename):
    """
    Loads aliases.txt-formatted file and returns a dict.

    Accepted formats:
        aliasname,keystrokes  (QF1.x style)
        aliasname:keystrokes  (QF2.x style)

    Raises FileError if the file cannot be opened.
    """
    aliases = {}

    # Load the file contents.
    try:
        with open(filename) as f:
            data = f.read()
    except (IOError, OSError):
        # Narrowed from a bare 'except:', which also hid programming
        # errors; only I/O failures mean "could not open".
        raise FileError("Could not open aliases file " + filename)

    data = util.convert_line_endings(data)
    lines = data.split('\n')

    # Strip out comment and empty lines.
    lines = [line for line in lines if line != '' and line[0] != '#']

    # Break into {aliasname: keystrokes} pairs.
    for line in lines:
        match = re.match(r'([\w\d]+)(,|:) *(.+)\s*\n*', line)
        if match is not None:
            aliases[match.group(1)] = match.group(3)
    return aliases
def parse_file(filename, sheetid):
    """
    Parse the specified file/sheet into FileLayers and associated other
    bits of information.

    Returns (filelayers, details).

    Raises FileError if the sheet is empty.
    """
    # Read lines in.
    lines = read_sheet(filename, sheetid)

    # Raise error on completely empty sheets.
    if len(lines) == 0:
        raise FileError("Sheet is empty, file %s, worksheet id %s" % (filename, sheetid))

    # Break off the '#...' header line if present.
    # BUGFIX: guard against an empty first row (lines[0] == []), which
    # previously raised IndexError on lines[0][0].
    if lines[0] and lines[0][0] and lines[0][0][0] == '#':
        (top_line, lines) = (','.join(lines[0]), lines[1:])
    else:
        # Top line missing, assume #dig.
        top_line = '#dig'

    # Parse top line details.
    details = parse_sheet_details(top_line)

    # Break up lines into z-layers separated by #> or #<.
    filelayers = split_zlayers(lines)

    # Tidy up the layers.
    for fl in filelayers:
        fl.fixup()
        fl.clean_cells()

    return filelayers, details
def read_xlsx_sheet_names(filename):
    """
    Get a list of sheets and their ids from xlsx file.

    Returns a list of (sheet_name, 0-based_id) tuples.

    Raises FileError if the workbook xml can't be read or a relationship
    id doesn't match the expected 'rId<n>' form.
    """
    try:
        zf = zipfile.ZipFile(filename)
        sheetsdata = zf.read('xl/workbook.xml')
        xml = xml2obj(sheetsdata)
        sheets = xml.sheets.sheet
    except Exception:
        # Narrowed from a bare 'except:' so interrupts still propagate.
        raise FileError("Could not open '%s' for sheet listing." % filename)

    output = []
    for sheet in sheets:
        # r_id looks like 'rId<n>'; expose the id 0-based.
        # FIX: use a raw string for the regex ('\d' in a plain string is
        # an invalid escape sequence in newer Pythons).
        m = re.match(r'rId(\d+)', sheet.r_id)
        if not m:
            raise FileError("Could not read list of xlsx's worksheets.")
        output.append((sheet.name, int(m.group(1)) - 1))
    return output
def __init__(self, supplied_binary='', tmpdir='/tmp'):
    """
    Remember the path to the gtp.jar binary and a working directory.

    Uses supplied_binary when it can be located, otherwise falls back to
    searching via GTP_PATH / the local directory; raises FileError when
    neither yields a usable binary.
    """
    if can_locate(supplied_binary):
        self.binary = supplied_binary
    else:
        # Fall back to the standard search locations.
        self.binary = locate_file('gtp.jar', 'GTP_PATH', local_dir)
    if self.binary is None:
        raise FileError(supplied_binary)
    self.tmpdir = tmpdir.rstrip('/')
def load_phyml_results(
    self,
    tree_file,
    stats_file,
    name=None,
    program='phyml',
):
    """ Loads phyml results into existing tree object - returns None """
    # NOTE(review): this looks like a truncated duplicate of
    # Tree.load_phyml_results defined later in this file -- as written it
    # only checks that both input files exist (printing any FileError) and
    # never loads anything. Confirm whether it should be removed.
    exit = False  # set when either input file is missing
    for f in (tree_file, stats_file):
        try:
            if not os.path.isfile(f):
                raise FileError(f)
        except FileError, e:  # Python 2 'except X, e' syntax
            print e
            exit = True
def delete(filename):
    """Remove filename from disk; raise FileError if it can't be located."""
    if not can_locate(filename):
        raise FileError(filename)
    return os.remove(filename)
class Tree(object):

    """ Class for storing the results of phylogenetic inference """

    # Extracts the log-likelihood value from a phyml stats file.
    score_regex = re.compile('(?<=Log-likelihood: ).+')
    # Extracts a name from a phyml tree filename.
    # NOTE(review): the '.' between the groups is an unescaped regex dot
    # (matches any character) -- presumably a literal '.' was intended;
    # confirm before changing.
    name_regex = \
        re.compile('([A-Za-z0-9\-_]+).([A-Za-z0-9\-_]+)(?=_phyml_)')

    def __init__(
        self,
        newick=None,
        score=0,
        program=None,
        name=None,
        output=None,
        rooted=None,
    ):
        """
        Store a newick tree plus metadata about how it was inferred.

        NOTE(review): the 'rooted' parameter is accepted but ignored;
        self.rooted is always recomputed from the newick string via
        check_rooted().
        """
        self.newick = newick
        self.score = score
        self.program = program
        self.name = name
        self.output = output
        self.rooted = self.check_rooted(self.newick)

    def __str__(self):
        """ Represents the object's information inside a newick comment,
        so is still interpretable by a (good) newick parser """
        s = '[Tree Object:\n'
        if self.name:
            s += 'Name:\t' + self.name + '\n'
        # The closing ']' before the tree ends the newick comment, so the
        # tree itself remains parseable.
        s += 'Program:\t{0}\n'.format(self.program) \
            + 'Score:\t{0}\n'.format(self.score) \
            + 'Rooted:\t{0}\n'.format(self.rooted) \
            + 'Tree:\t]{0}\n'.format(self.newick)
        return s

    def __eq__(self, other):
        """Field-by-field equality over all stored attributes."""
        # NOTE(review): no matching __hash__ or __ne__ is defined here.
        equal = True
        if not self.name == other.name:
            return False
        if not self.newick == other.newick:
            return False
        if not self.program == other.program:
            return False
        if not self.score == other.score:
            return False
        if not self.rooted == other.rooted:
            return False
        if not self.output == other.output:
            return False
        return equal

    def pam2sps(self, multiplier=0.01):
        """ Scales branch lengths by an order of `multiplier`.
        Default is 0.01, converting PAM units to substitutions per site.
        multiplier = 'sps2pam' scales by 100, performing the opposite
        operation. multiplier = 'strip' removes branch lengths entirely """
        # Matches a branch length (digits/dots) immediately after a ':'.
        reg_ex = re.compile('(?<=:)[0-9.]+')

        # converter closes over 'multiplier', so the numeric value set
        # below (for the 'pam2sps'/'sps2pam' keywords) is what gets used.
        converter = lambda a: str(multiplier * float(a.group()))
        strip_lengths = lambda d: ''

        input_string = self.newick

        if multiplier == 'pam2sps':
            multiplier = 0.01
        elif multiplier == 'sps2pam':
            multiplier = 100

        # Set the output string according to selection.
        if multiplier == 'strip':
            # Remove the lengths, then the now-dangling ':' separators.
            output_string = reg_ex.sub(strip_lengths,
                                       input_string).replace(':', '')
        else:
            output_string = reg_ex.sub(converter, input_string)

        # Return a new Tree carrying over all other metadata unchanged.
        return Tree(
            output_string,
            self.score,
            self.program,
            self.name,
            self.output,
            self.rooted,
        )

    def read_from_file(self, infile, name=None):
        """ This and the write_to_file function allow the class to be
        easily stored and reconstituted without using a pickle or JSON """
        # NOTE(review): these three locals are never read afterwards.
        program = None
        tree = None
        score = None
        self.name = name
        reader = open(infile)  # NOTE(review): never explicitly closed
        try:
            for line in reader:
                # Split on whitespace, dropping the ']' that terminates
                # the newick comment written by __str__.
                line = [l.rstrip().replace(']', '') for l in line.split()]
                if not name and line[0] == 'Name:':
                    self.name = line[1]
                elif line[0] == 'Program:':
                    self.program = line[1]
                elif line[0] == 'Tree:':
                    self.newick = line[1]
                elif line[0] == 'Score:':
                    # NOTE(review): stored as a string here, but as a
                    # float by load_phyml_results -- confirm which is
                    # expected by callers.
                    self.score = line[1]
        except IndexError:
            # A blank line produces an empty list; treated as
            # end-of-input. Returns None in this case.
            return
        return self

    def write_to_file(
        self,
        outfile,
        metadata=False,
        suppress_NHX=False,
        ):
        """ Writes a string representation of the object's contents to
        file. This can be read using read_from_file to reconstruct the
        Tree object, if metadata is included (i.e. metadata=True) """
        writer = open(outfile, 'w')
        if metadata:
            writer.write(str(self))
        else:
            writeable = self.newick
            if suppress_NHX:
                # Drop a leading '[&R] ' rooted-tree annotation.
                if writeable.startswith('[&R] '):
                    writeable = writeable[5:]
            if not writeable.endswith('\n'):
                writeable += '\n'
            writer.write(writeable)
        writer.close()
        return outfile

    @classmethod
    def check_rooted(cls, newick):
        """Return True if the newick string describes a rooted (degree-2
        root) tree, False otherwise, or None for empty/missing input."""
        if newick is None:
            return None
        if newick == '':
            return None
        t = dpy.Tree()
        t.read_from_string(newick, 'newick')
        # A rooted tree's seed node has exactly two children.
        root_degree = len(t.seed_node.child_nodes())
        return root_degree == 2

    @classmethod
    def deroot_tree(cls, newick):
        """Return the newick string with its root collapsed (via dendropy)."""
        t = dpy.Tree()
        t.read_from_string(newick, 'newick')
        t.deroot()
        return t.as_newick_string() + ';'

    def reroot_newick(self):
        """Resolve polytomies in self.newick and return the resulting
        newick string (with trailing ';\\n')."""
        dpy_tree = dpy.Tree()
        dpy_tree.read_from_string(self.newick, 'newick')
        dpy_tree.resolve_polytomies()
        newick_string = dpy_tree.as_newick_string() + ';\n'
        return newick_string

    def load_phyml_results(
        self,
        tree_file,
        stats_file,
        name=None,
        program='phyml',
        ):
        """ Loads phyml results into existing tree object - returns None """
        # First verify both input files exist; report every missing one
        # before bailing out.
        exit = False
        for f in (tree_file, stats_file):
            try:
                if not os.path.isfile(f):
                    raise FileError(f)
            except FileError, e:  # Python 2 'except X, e' syntax
                print e
                exit = True
        if exit:
            print 'Results were not loaded'
            raise FileError()

        # Derive a name from the tree filename unless one was supplied.
        if not name:
            name = self.name_regex.search(tree_file).group(1)

        # NOTE(review): file handles opened here are never closed.
        newick = open(tree_file).read()
        stats = open(stats_file).read()
        score = float(self.score_regex.search(stats).group())

        self.program = program
        self.newick = newick
        self.output = stats
        self.score = score
        self.name = name
        self.rooted = self.check_rooted(newick)