def string_to_records(string):
    """Parse a sequence string into Biopython records and detect its format.

    The string is tried, in order, as a raw ATGC sequence, as FASTA, as
    Genbank (after being passed through ``fix_ice_genbank``) and finally as
    Snapgene content.

    Parameters
    ----------
    string
      Text to interpret.

    Returns
    -------
    (records, fmt)
      A list of records and the detected format name: "ATGC", "fasta",
      "genbank" or "snapgene".

    Raises
    ------
    ValueError
      If none of the parsers could read the string.
    """
    if re.fullmatch("[ATGC]+", string) is not None:
        return [SeqRecord(Seq(string, DNAAlphabet()))], "ATGC"

    for fmt in ("fasta", "genbank"):
        if fmt == "genbank":
            # The fixed string is also what the Snapgene attempt below sees.
            string = fix_ice_genbank(string)
        try:
            records = list(SeqIO.parse(StringIO(string), fmt))
        except Exception:
            # Best effort: a parser failure just means "not this format".
            continue
        if records:
            return records, fmt

    try:
        record = snapgene_file_to_seqrecord(filecontent=StringIO(string))
    except Exception:
        pass
    else:
        # Return the same (records, fmt) shape as every other branch so that
        # callers can always unpack two values.
        return [record], "snapgene"
    raise ValueError("Invalid sequence format")
def load_record(
    filename,
    topology="auto",
    default_topology="linear",
    id="auto",
    upperize=True,
):
    """Load a Genbank/Fasta/Snapgene record from a path or file-like object.

    Parameters
    ----------
    filename
      Path ending in "gb", "gbk", "fa", "fasta" or ".dna" (case-insensitive),
      or an object with a ``read`` attribute, which is parsed as Genbank.
    topology
      Passed to ``set_record_topology``. When "auto", ``default_topology``
      is passed instead, with ``pass_if_already_set=True``.
    default_topology
      Topology used when ``topology`` is "auto".
    id
      If "auto", the record id is kept, unless it is a placeholder (None,
      "", "<unknown id>", ".", " "), in which case the file's base name is
      used for the id and the name. Any other non-None value is used as the
      id and (spaces replaced, cut to 20 characters) as the name. "auto" is
      not allowed for file-like objects.
    upperize
      If true, the record is replaced by ``record.upper()``.

    Raises
    ------
    ValueError
      For an unknown file extension, or a file-like object with id "auto".
    """
    is_filelike = hasattr(filename, "read")
    if is_filelike and id == "auto":
        # Checked before parsing so an invalid call fails fast and does not
        # consume the stream first.
        raise ValueError("Can't have id == 'auto' when reading filelikes.")

    if is_filelike:
        record = SeqIO.read(filename, "genbank")
    elif filename.lower().endswith(("gb", "gbk")):
        record = SeqIO.read(filename, "genbank")
    elif filename.lower().endswith(("fa", "fasta")):
        record = SeqIO.read(filename, "fasta")
    elif filename.lower().endswith(".dna"):
        record = snapgene_file_to_seqrecord(filename)
    else:
        raise ValueError("Unknown format for file: %s" % filename)

    if upperize:
        record = record.upper()

    if topology == "auto":
        set_record_topology(record, default_topology, pass_if_already_set=True)
    else:
        set_record_topology(record, topology)

    if id == "auto":
        id = record.id
        if id in [None, "", "<unknown id>", ".", " "]:
            id = os.path.splitext(os.path.basename(filename))[0]
            record.name = id.replace(" ", "_")[:20]
        record.id = id
    elif id is not None:
        record.id = id
        record.name = id.replace(" ", "_")[:20]
    return record
def get_description(self, file):
    '''
    Return the description stored in a sequence file.

    Supported extensions: .fasta, .gb, .dna, .txt. For .txt files, and for
    any extension not handled here, '<unknown description>' is returned.
    A fasta file holding more than one record returns the string
    'Multiple line fastas not supported'.
    '''
    self.__check_valid_file(file)
    extension = file.split('.')[-1]
    # Default keeps the function from failing with an unbound local when the
    # extension is not one of the parsed formats.
    desc = '<unknown description>'

    if extension == 'fasta':
        fastas = list(SeqIO.parse(file, 'fasta'))
        if len(fastas) > 1:
            return 'Multiple line fastas not supported'
        desc = str(fastas[0].description)
    elif extension == 'gb':
        # Context manager so the handle is closed once parsing is done.
        with open(file, "r") as handle:
            gb_record = SeqIO.read(handle, "genbank")
        desc = str(gb_record.description)
    elif extension == 'dna':
        try:
            seq_record = snapgene_file_to_seqrecord(file)
        except Exception:
            print('To read .dna files please install snapegenereader: pip install snapgene_reader - https://github.com/IsaacLuo/SnapGeneFileReader' )
            # Re-raise the real error instead of failing on an unbound
            # ``seq_record`` just below.
            raise
        desc = seq_record.description
    return desc
def string_to_record(string):
    """Parse a sequence string into Biopython record(s) and detect its format.

    The string is tried, in order, as a raw ATGC sequence, as FASTA, as
    Genbank and finally as Snapgene content.

    Parameters
    ----------
    string
      Text to interpret.

    Returns
    -------
    (records, fmt)
      For a raw ATGC string, a single SeqRecord and "ATGC". For FASTA and
      Genbank, a list of records and "fasta" or "genbank". For Snapgene, a
      single record and "snapgene".

    Raises
    ------
    ValueError
      If none of the parsers could read the string.
    """
    if re.fullmatch("[ATGC]+", string) is not None:
        if has_dna_alphabet:
            # Biopython <1.78
            sequence = Seq(string, alphabet=DNAAlphabet())
        else:
            sequence = Seq(string)
        seqrecord = SeqRecord(sequence)
        seqrecord.annotations["molecule_type"] = "DNA"
        return seqrecord, "ATGC"

    for fmt in ("fasta", "genbank"):
        try:
            records = list(SeqIO.parse(StringIO(string), fmt))
        except Exception:
            # Best effort: a parser failure just means "not this format".
            continue
        if records:
            return records, fmt

    try:
        record = snapgene_file_to_seqrecord(filecontent=StringIO(string))
    except Exception:
        pass
    else:
        # Return a (record, fmt) pair like the other branches; a bare record
        # cannot be unpacked into two values by callers.
        return record, "snapgene"
    raise ValueError("Invalid sequence format")
def get_name(self, file):
    '''
    Return the record name/id stored in a sequence file.

    Supported extensions: .fasta and .gb (record id), .txt (delegated to
    ``__get_name_from_text``) and .dna (record name). Any other extension
    yields 'unknown'. A fasta file holding more than one record returns the
    string 'Multiple line fastas not supported'.
    '''
    self.__check_valid_file(file)
    name = 'unknown'
    extension = file.split('.')[-1]

    if extension == 'fasta':
        fastas = list(SeqIO.parse(file, 'fasta'))
        if len(fastas) > 1:
            return 'Multiple line fastas not supported'
        name = str(fastas[0].id)
    elif extension == 'gb':
        # Context manager so the handle is closed once parsing is done.
        with open(file, "r") as handle:
            gb_record = SeqIO.read(handle, "genbank")
        name = str(gb_record.id)
    elif extension == 'txt':
        name = self.__get_name_from_text(file)
    elif extension == 'dna':
        try:
            seq_record = snapgene_file_to_seqrecord(file)
        except Exception:
            print('To read .dna files please install snapegenereader: pip install snapgene_reader - https://github.com/IsaacLuo/SnapGeneFileReader' )
            # Re-raise the real error instead of failing on an unbound
            # ``seq_record`` just below.
            raise
        name = seq_record.name
    return name
def records_from_file(filepath):
    """Autodetect file format and load Biopython records from it.

    The file content is tried, in order, as text (``string_to_record``), as
    Snapgene, as a document (Crazydoc) and as a spreadsheet with
    (name, sequence) rows.

    Parameters
    ----------
    filepath
      Path to the file to read.

    Returns
    -------
    (records, fmt)
      A list of records and the detected format. ``fmt`` is whatever
      ``string_to_record`` reported, or "snapgene", "doc" or "spreadsheet".

    Raises
    ------
    ValueError
      If no parser could read the file; the last parser error is attached
      as the cause.
    """
    with open(filepath, "rb") as f:
        content = f.read()

    def _from_text():
        return string_to_record(content.decode("utf-8"))

    def _from_snapgene():
        record = snapgene_file_to_seqrecord(fileobject=BytesIO(content))
        return [record], "snapgene"

    def _from_doc():
        parser = crazydoc.CrazydocParser(
            ["highlight_color", "bold", "underline"])
        return parser.parse_doc_file(BytesIO(content)), "doc"

    def _from_spreadsheet():
        df = spreadsheet_file_to_dataframe(filepath, header=None)
        rows = [
            sequence_to_biopython_record(sequence=seq, id=name, name=name)
            for name, seq in df.values
        ]
        return rows, "spreadsheet"

    last_error = None
    for parse in (_from_text, _from_snapgene, _from_doc, _from_spreadsheet):
        try:
            records, fmt = parse()
            break
        except Exception as err:
            # Best effort: any failure means "try the next format".
            last_error = err
    else:
        raise ValueError(
            "Format not recognized for file " + filepath) from last_error

    if not isinstance(records, list):
        records = [records]
    return records, fmt
def get_sequence(self, file):
    '''
    Return the sequence from several different file types

    Supported: .txt, .dna, .gb, .fasta

    The raw sequence string is passed through ``self.clean_seq`` before
    being returned. A fasta file holding more than one record returns the
    string 'Multiple line fastas not supported'. An unsupported extension
    raises ValueError.
    '''
    self.__check_valid_file(file)
    extension = file.split('.')[-1]

    if extension == 'dna':
        try:
            seq_record = snapgene_file_to_seqrecord(file)
        except Exception:
            print('To read .dna files please install snapegenereader: pip install snapgene_reader - https://github.com/IsaacLuo/SnapGeneFileReader' )
            # Re-raise the real error instead of failing on an unbound
            # ``seq_record`` just below.
            raise
        sequence_str = str(seq_record.seq)
    elif extension == 'txt':
        sequence_str = self.__get_seq_from_txt(file)
    elif extension == 'gb':
        # Context manager so the handle is closed once parsing is done.
        with open(file, "r") as handle:
            gb_record = SeqIO.read(handle, "genbank")
        sequence_str = str(gb_record.seq)
    elif extension == 'fasta':
        fastas = list(SeqIO.parse(file, 'fasta'))
        if len(fastas) > 1:
            return 'Multiple line fastas not supported'
        sequence_str = str(fastas[0].seq)
    else:
        # Explicit error instead of an unbound-local failure further down.
        raise ValueError('Unsupported file extension: %s' % extension)

    return self.clean_seq(sequence_str)
def records_from_data_file(data_file):
    """Autodetect the format of an uploaded file and load its records.

    Parameters
    ----------
    data_file
      Object with a ``content`` attribute holding a string whose part after
      "base64," is the base64-encoded file content, and a ``name``
      attribute used in error messages.

    Returns
    -------
    (records, fmt)
      A list of records and the detected format. ``fmt`` is whatever
      ``string_to_records`` reported, or "snapgene", "doc" or "spreadsheet".

    Raises
    ------
    ValueError
      If no parser could read the content; the last parser error is
      attached as the cause.
    """
    content = b64decode(data_file.content.split("base64,")[1])

    def _from_text():
        return string_to_records(content.decode("utf-8"))

    def _from_snapgene():
        record = snapgene_file_to_seqrecord(fileobject=BytesIO(content))
        return [record], "snapgene"

    def _from_doc():
        parser = crazydoc.CrazydocParser(
            ["highlight_color", "bold", "underline"])
        return parser.parse_doc_file(BytesIO(content)), "doc"

    def _from_spreadsheet():
        df = spreadsheet_file_to_dataframe(data_file, header=None)
        rows = [
            sequence_to_biopython_record(sequence=seq, id=name, name=name)
            for name, seq in df.values
        ]
        return rows, "spreadsheet"

    last_error = None
    for parse in (_from_text, _from_snapgene, _from_doc, _from_spreadsheet):
        try:
            records, fmt = parse()
            break
        except Exception as err:
            # ``Exception`` rather than a bare except, so KeyboardInterrupt
            # and SystemExit are not swallowed by the format detection.
            last_error = err
    else:
        raise ValueError(
            "Format not recognized for file " + data_file.name
        ) from last_error

    if not isinstance(records, list):
        records = [records]
    return records, fmt
def test_snapgene_file_to_seqrecord(tmpdir):
    """Parse every .dna file of TEST_DIR and write each one out as Genbank."""
    dna_names = list(
        filter(lambda name: name.endswith('.dna'), os.listdir(TEST_DIR))
    )
    assert len(dna_names)
    output_dir = str(tmpdir)
    for dna_name in dna_names:
        record = snapgene_file_to_seqrecord(os.path.join(TEST_DIR, dna_name))
        assert len(record.seq) > 10
        target = os.path.join(output_dir, dna_name + '.gb')
        # Writing the record checks it can be serialized as Genbank.
        with open(target, 'w', encoding='utf-8') as fwrite:
            SeqIO.write([record], fwrite, 'genbank')
def test_parse(tmpdir):
    """Parse every .dna file of TEST_DIR and write each one out as Genbank."""
    dna_names = list(
        filter(lambda name: name.endswith('.dna'), os.listdir(TEST_DIR))
    )
    assert len(dna_names)
    output_dir = str(tmpdir)
    for dna_name in dna_names:
        record = snapgene_file_to_seqrecord(os.path.join(TEST_DIR, dna_name))
        assert len(record.seq) > 10
        genbank_path = os.path.join(output_dir, dna_name + '.gb')
        with open(genbank_path, 'w') as f:
            SeqIO.write([record], f, 'genbank')
def load_record(
    filepath,
    topology="default_to_linear",
    id="auto",
    upperize=True,
    max_name_length=20,
):
    """Return a Biopython record read from a Fasta/Genbank/Snapgene file.

    Parameters
    ----------
    filepath
      Path to a Genbank, Fasta, or Snapgene (.dna) file. The format is
      chosen from the (case-insensitive) end of the path.

    topology
      Passed as-is to ``set_record_topology``. Can be "circular", "linear",
      "default_to_circular" (will default to circular if
      ``annotations['topology']`` is not already set) or "default_to_linear".

    id
      Sets the record.id. If "auto", the original record.id is kept, unless
      it is a placeholder (None, "", "<unknown id>", ".", " "), in which case
      the file name without extension is used, with spaces replaced by
      underscores and cut to ``max_name_length`` characters. Any other
      non-None value is used after the same replacement and truncation.

    upperize
      If true, the record is replaced by ``record.upper()`` (recommended in
      this library, as the mix of upper and lower case can cause problems in
      Biopython's enzyme site search).

    max_name_length
      Maximum length of an id set by this function.
    """
    lowered = filepath.lower()
    readers = (
        (("gb", "gbk"), lambda: SeqIO.read(filepath, "genbank")),
        (("fa", "fasta"), lambda: SeqIO.read(filepath, "fasta")),
        (".dna", lambda: snapgene_file_to_seqrecord(filepath)),
    )
    for suffixes, read in readers:
        if lowered.endswith(suffixes):
            record = read()
            break
    else:
        raise ValueError("Unknown format for file: %s" % filepath)

    if upperize:
        record = record.upper()
    set_record_topology(record, topology)

    if id is None:
        return record
    if id != "auto":
        record.id = id.replace(" ", "_")[:max_name_length]
        return record

    new_id = record.id
    if new_id in [None, "", "<unknown id>", ".", " "]:
        base_name = os.path.splitext(os.path.basename(filepath))[0]
        new_id = base_name.replace(" ", "_")[:max_name_length]
    record.id = new_id
    return record
def _load_records_from_zip_file(zip_file, use_file_names_as_ids=False):
    """Return all fasta/genbank/snapgene in a zip as biopython records.

    Only entries with extension gb, gbk, fa or dna are read. Each record
    gets a ``source_file`` attribute (the entry's ``_path``) and a
    ``file_name`` attribute (the entry's name without extension).

    Used via "load_records_from_files".

    Parameters
    ----------
    zip_file
      Passed to ``flametree.file_tree``.
    use_file_names_as_ids
      If true, a file holding exactly one record gives that record the
      file's base name (without extension) as id.

    Raises
    ------
    ValueError
      If a file can be read neither as text nor as Snapgene.
    """
    zip_file = flametree.file_tree(zip_file)
    placeholder_ids = [None, "", "<unknown id>", ".", " ", "<unknown name>"]
    records = []
    for f in zip_file._all_files:
        if f._extension.lower() not in ["gb", "gbk", "fa", "dna"]:
            continue
        try:
            new_records, _ = string_to_records(f.read())
            if not isinstance(new_records, list):
                new_records = [new_records]
        except Exception:
            content_stream = BytesIO(f.read("rb"))
            try:
                record = snapgene_file_to_seqrecord(fileobject=content_stream)
            except Exception as err:
                raise ValueError(
                    "Format not recognized for file " + f._path) from err
            new_records = [record]

        single_record = len(new_records) == 1
        for i, record in enumerate(new_records):
            # Set first: the ``use_file_names_as_ids`` branch below reads it.
            record.source_file = f._path
            name = record.id
            if name in placeholder_ids:
                number = "" if single_record else ("%04d" % i)
                name = f._name_no_extension.replace(" ", "_") + number
            # NOTE(review): sibling zip loaders also set ``record.name`` here;
            # confirm whether that is wanted for this one too.
            record.id = name
            record.file_name = f._name_no_extension
            if use_file_names_as_ids and single_record:
                basename = os.path.basename(record.source_file)
                record.id = os.path.splitext(basename)[0]
        records += new_records
    return records
def load_record(filename, linear=True, name="unnamed", capitalize=True):
    """Load a single record, picking the parser from the file extension.

    The extension is looked up in ``formats_dict``; "snapgene" files go
    through ``snapgene_file_to_seqrecord``, all others through
    ``SeqIO.read``. The record gets ``linear`` as attribute, ``name`` as id,
    and ``name`` (spaces replaced by underscores, cut to 20 characters) as
    name. If ``capitalize`` is true the sequence is upper-cased.
    """
    extension = os.path.splitext(filename)[1]
    fmt = formats_dict[extension]
    record = (
        snapgene_file_to_seqrecord(filename)
        if fmt == "snapgene"
        else SeqIO.read(filename, fmt)
    )
    if capitalize:
        record.seq = record.seq.upper()
    record.linear = linear
    record.id = name
    record.name = name.replace(" ", "_")[:20]
    return record
def records_from_zip_file(zip_file, use_file_names_as_ids=False):
    """Return the records of all gb/gbk/fa/dna files found in a zip.

    Each file is tried as text (``string_to_records``), then as Snapgene,
    then as a document (Crazydoc). Records with a placeholder id are named
    after their file. Each record gets ``file_name`` and ``zip_file_name``
    attributes.

    Parameters
    ----------
    zip_file
      Object with a ``name`` attribute, converted with
      ``file_to_filelike_object`` before being opened by flametree.
    use_file_names_as_ids
      If true, a file holding exactly one record gives that record the
      file's name (without extension) as id.

    Raises
    ------
    ValueError
      If a file could not be read by any parser.
    """
    zip_name = zip_file.name
    zip_file = flametree.file_tree(file_to_filelike_object(zip_file))
    placeholder_ids = [None, "", "<unknown id>", ".", " ", "<unknown name>"]
    records = []
    for f in zip_file._all_files:
        if f._extension.lower() not in ["gb", "gbk", "fa", "dna"]:
            continue
        try:
            new_records, _ = string_to_records(f.read())
            if not isinstance(new_records, list):
                new_records = [new_records]
        except Exception:
            content_stream = BytesIO(f.read("rb"))
            try:
                record = snapgene_file_to_seqrecord(fileobject=content_stream)
                new_records = [record]
            except Exception:
                try:
                    # The Snapgene attempt may have consumed the stream.
                    content_stream.seek(0)
                    parser = crazydoc.CrazydocParser(
                        ["highlight_color", "bold", "underline"])
                    new_records = parser.parse_doc_file(content_stream)
                except Exception as err:
                    raise ValueError(
                        "Format not recognized for file " + f._path) from err

        single_record = len(new_records) == 1
        for i, record in enumerate(new_records):
            name = record.id
            if name in placeholder_ids:
                number = "" if single_record else ("%04d" % i)
                name = f._name_no_extension.replace(" ", "_") + number
            record.id = name
            record.name = name
            record.file_name = f._name_no_extension
            record.zip_file_name = zip_name
            if use_file_names_as_ids and single_record:
                basename = os.path.basename(record.file_name)
                record.id = os.path.splitext(basename)[0]
        records += new_records
    return records
def load_records(path, capitalize=True):
    """Load all records of one file, or of a list/tuple of files.

    Parameters
    ----------
    path
      A file path whose extension is a key of ``formats_dict``, or a list
      or tuple of such paths (possibly nested), whose records are
      concatenated.
    capitalize
      If true, each record's sequence is upper-cased.

    Returns
    -------
    records
      List of records. Records with a placeholder id get an id derived from
      the path (slashes and backslashes replaced by underscores), with a
      "_0000"-style index appended when the file holds several records.
    """
    if isinstance(path, (list, tuple)):
        # Forward ``capitalize``: it used to be dropped here, so
        # capitalize=False had no effect on lists of paths.
        return [
            record
            for p in path
            for record in load_records(p, capitalize=capitalize)
        ]

    _, extension = os.path.splitext(path)
    fmt = formats_dict[extension]
    if fmt == "snapgene":
        records = [snapgene_file_to_seqrecord(path)]
    else:
        records = list(SeqIO.parse(path, fmt))

    for i, record in enumerate(records):
        if capitalize:
            record.seq = record.seq.upper()
        if str(record.id) in ["None", "", "<unknown id>", ".", " "]:
            record.id = path.replace("/", "_").replace("\\", "_")
            if len(records) > 1:
                record.id += "_%04d" % i
    return records
def get_description(self, file_path):
    '''
    Attempt to find the text description from a file

    Parameters
    ----------
    file_path : str
        string path to file.

    Returns
    -------
    str
        The record description for .fasta, .gb and .dna files;
        '<unknown description>' for .txt files and any other extension;
        'Multiple line fastas not supported' for a fasta file holding more
        than one record.

    Raises
    ------
    SnapGeneMissingError
        If the .dna file could not be read; the underlying error is
        attached as the cause.

    '''
    self.__check_valid_file(file_path)
    extension = file_path.split('.')[-1]
    # Default keeps the function from failing with an unbound local when the
    # extension is not one of the parsed formats.
    desc = '<unknown description>'

    if extension == 'fasta':
        fastas = list(SeqIO.parse(file_path, 'fasta'))
        if len(fastas) > 1:
            return 'Multiple line fastas not supported'
        desc = str(fastas[0].description)
    elif extension == 'gb':
        # Context manager so the handle is closed once parsing is done.
        with open(file_path, "r") as handle:
            gb_record = SeqIO.read(handle, "genbank")
        desc = str(gb_record.description)
    elif extension == 'dna':
        try:
            seq_record = snapgene_file_to_seqrecord(file_path)
        except Exception as err:
            msg = 'To read .dna files please install snapegenereader: '\
                  ' pip install snapgene_reader - '\
                  'https://github.com/IsaacLuo/SnapGeneFileReader'
            # Chain the cause so a parse failure is not mistaken for a
            # missing installation.
            raise SnapGeneMissingError(msg) from err
        desc = seq_record.description
    return desc
def load_records_from_file(filepath):
    """Read a file as text-based records or Snapgene, whichever works first.

    Returns ``(records, fmt)`` where ``records`` is a list whose items each
    carry ``source_file = filepath``, and ``fmt`` is the format reported by
    ``string_to_records`` or "snapgene". Raises ValueError when neither
    parser can read the file.
    """
    with open(filepath, "rb") as handle:
        raw = handle.read()

    try:
        parsed = string_to_records(raw.decode("utf-8"))
        records, fmt = parsed
    except Exception:
        # Not readable as text: fall back to the binary Snapgene parser.
        try:
            records = [snapgene_file_to_seqrecord(fileobject=BytesIO(raw))]
            fmt = "snapgene"
        except Exception:
            raise ValueError("Format not recognized for file " + filepath)

    records = records if isinstance(records, list) else [records]
    for item in records:
        item.source_file = filepath
    return records, fmt
def get_sequence(self, file):
    '''
    get a nucleotide sequence

    Parameters
    ----------
    file : path
        Path to the file to open. Supported extensions: .dna, .txt, .gb,
        .fasta.

    Returns
    -------
    str
        The sequence string after ``self.clean_seq``, or
        'Multiple line fastas not supported' for a fasta file holding more
        than one record.

    Raises
    ------
    SnapGeneMissingError
        If the .dna file could not be read; the underlying error is
        attached as the cause.
    ValueError
        If the extension is not supported.

    '''
    self.__check_valid_file(file)
    extension = file.split('.')[-1]

    if extension == 'dna':
        try:
            seq_record = snapgene_file_to_seqrecord(file)
        except Exception as err:
            msg = 'To read .dna files please install snapegenereader: '\
                  ' pip install snapgene_reader - '\
                  'https://github.com/IsaacLuo/SnapGeneFileReader'
            # Chain the cause so a parse failure is not mistaken for a
            # missing installation.
            raise SnapGeneMissingError(msg) from err
        sequence_str = str(seq_record.seq)
    elif extension == 'txt':
        sequence_str = self.__get_seq_from_txt(file)
    elif extension == 'gb':
        # Context manager so the handle is closed once parsing is done.
        with open(file, "r") as handle:
            gb_record = SeqIO.read(handle, "genbank")
        sequence_str = str(gb_record.seq)
    elif extension == 'fasta':
        fastas = list(SeqIO.parse(file, 'fasta'))
        if len(fastas) > 1:
            return 'Multiple line fastas not supported'
        sequence_str = str(fastas[0].seq)
    else:
        # Explicit error instead of an unbound-local failure further down.
        raise ValueError('Unsupported file extension: %s' % extension)

    return self.clean_seq(sequence_str)
def records_from_zip_file(zip_file):
    """Return all fasta/genbank/snapgene in a zip as Biopython records.

    Only entries with extension gb, gbk, fa or dna are read. Each file is
    tried as text (``string_to_record``), then as Snapgene, then as a
    document (Crazydoc). Records with a placeholder id are named after
    their file, and every record gets a ``file_name`` attribute.

    Raises
    ------
    ValueError
      If a file could not be read by any parser.
    """
    zip_file = flametree.file_tree(zip_file)
    placeholder_ids = [None, "", "<unknown id>", ".", " ", "<unknown name>"]
    records = []
    for f in zip_file._all_files:
        if f._extension.lower() not in ["gb", "gbk", "fa", "dna"]:
            continue
        try:
            new_records, _ = string_to_record(f.read())
        except Exception:
            content_stream = BytesIO(f.read("rb"))
            try:
                record = snapgene_file_to_seqrecord(fileobject=content_stream)
                new_records = [record]
            except Exception:
                try:
                    # The Snapgene attempt may have consumed the stream.
                    content_stream.seek(0)
                    parser = crazydoc.CrazydocParser(
                        ["highlight_color", "bold", "underline"])
                    new_records = parser.parse_doc_file(content_stream)
                except Exception as err:
                    raise ValueError(
                        "Format not recognized for file " + f._path) from err
        # ``string_to_record`` can return a single record (raw ATGC input);
        # without this, the loop below would iterate over its letters.
        if not isinstance(new_records, list):
            new_records = [new_records]

        single_record = len(new_records) == 1
        for i, record in enumerate(new_records):
            name = record.id
            if name in placeholder_ids:
                number = "" if single_record else ("%04d" % i)
                name = f._name_no_extension.replace(" ", "_") + number
            record.id = name
            record.name = name
            record.file_name = f._name_no_extension
        records += new_records
    return records
def load_records(path):
    """Load all records of one file, or of a list/tuple of files.

    The parser is picked from the file extension (.fa, .gb, .gbk, .dna);
    any other extension raises KeyError. Records with a placeholder id get
    an id derived from the path (slashes and backslashes replaced by
    underscores), with a "_0000"-style index appended when the file holds
    several records.
    """
    if isinstance(path, (list, tuple)):
        collected = []
        for single_path in path:
            collected.extend(load_records(single_path))
        return collected

    format_by_extension = {
        '.fa': 'fasta',
        '.gb': 'genbank',
        '.gbk': 'genbank',
        '.dna': 'snapgene'
    }
    extension = os.path.splitext(path)[1]
    fmt = format_by_extension[extension]

    if fmt != 'snapgene':
        records = list(SeqIO.parse(path, fmt))
    else:
        records = [snapgene_file_to_seqrecord(path)]

    several = len(records) > 1
    for index, record in enumerate(records):
        if str(record.id) not in ['None', '', "<unknown id>", '.', ' ']:
            continue
        record.id = path.replace("/", "_").replace("\\", "_")
        if several:
            record.id += "_%04d" % index
    return records
def records_from_data_file(data_file):
    """Autodetect the format of an uploaded file and load its records.

    Parameters
    ----------
    data_file
      Object with a ``content`` attribute holding a string whose part after
      "base64," is the base64-encoded file content, and a ``name``
      attribute used in error messages.

    Returns
    -------
    (records, fmt)
      A list of records and the detected format. ``fmt`` is whatever
      ``string_to_record`` reported, or 'snapgene' or 'doc'.

    Raises
    ------
    ValueError
      If no parser could read the content; the last parser error is
      attached as the cause.
    """
    content = b64decode(data_file.content.split("base64,")[1])

    def _from_text():
        return string_to_record(content.decode("utf-8"))

    def _from_snapgene():
        record = snapgene_file_to_seqrecord(fileobject=BytesIO(content))
        return [record], 'snapgene'

    def _from_doc():
        parser = crazydoc.CrazydocParser(
            ['highlight_color', 'bold', 'underline'])
        return parser.parse_doc_file(BytesIO(content)), 'doc'

    last_error = None
    for parse in (_from_text, _from_snapgene, _from_doc):
        try:
            records, fmt = parse()
            break
        except Exception as err:
            # ``Exception`` rather than a bare except, so KeyboardInterrupt
            # and SystemExit are not swallowed by the format detection.
            last_error = err
    else:
        raise ValueError(
            "Format not recognized for file " + data_file.name
        ) from last_error

    if not isinstance(records, list):
        records = [records]
    return records, fmt
def load_record(filepath, linear=True, name="unnamed", file_format="auto"):
    """Load a FASTA/Genbank/Snapgene record.

    With ``file_format`` left to "auto" the parser is chosen from the
    (case-insensitive) end of ``filepath``; otherwise ``file_format`` is
    handed to ``SeqIO.read``. The record gets ``linear`` as attribute and,
    unless ``name`` is "unnamed", ``name`` as id and (spaces replaced, cut
    to 20 characters) as name.

    Note that reading Snapgene records requires the library snapgene_reader
    installed.
    """
    if file_format == "auto":
        lowered = filepath.lower()
        if lowered.endswith(("gb", "gbk")):
            record = SeqIO.read(filepath, "genbank")
        elif lowered.endswith(("fa", "fasta")):
            record = SeqIO.read(filepath, "fasta")
        elif lowered.endswith(".dna"):
            record = snapgene_file_to_seqrecord(filepath)
        else:
            raise ValueError("Unknown format for file: %s" % filepath)
    else:
        record = SeqIO.read(filepath, file_format)

    record.linear = linear
    if name == "unnamed":
        return record
    record.id = name
    record.name = name.replace(" ", "_")[:20]
    return record
def load_record(filename, record_id="auto", upperize=False, id_cutoff=20):
    """Load a Fasta/Genbank/Snapgene file as a Biopython record.

    Parameters
    ==========

    filename
      Path to the file containing the record. The parser is chosen from the
      (case-insensitive) end of the path.

    record_id
      Id of the record (leave to "auto" to keep the record's original Id,
      which will default to the file name if the record has no Id). None
      leaves id and name untouched.

    upperize
      If true, the record is replaced by ``record.upper()``.

    id_cutoff
      Maximum length of the record *name* derived from the id (spaces
      replaced by underscores); the id itself is not truncated.
    """
    lowered = filename.lower()
    readers = (
        (("gb", "gbk"), lambda: SeqIO.read(filename, "genbank")),
        (("fa", "fasta"), lambda: SeqIO.read(filename, "fasta")),
        (".dna", lambda: snapgene_file_to_seqrecord(filename)),
    )
    for suffixes, read in readers:
        if lowered.endswith(suffixes):
            record = read()
            break
    else:
        raise ValueError("Unknown format for file: %s" % filename)

    if upperize:
        record = record.upper()

    if record_id is None:
        return record
    if record_id != "auto":
        record.id = record_id
        record.name = record_id.replace(" ", "_")[:id_cutoff]
        return record

    new_id = record.id
    if new_id in [None, "", "<unknown id>", ".", " "]:
        # No usable id in the file: fall back to the file's base name.
        new_id = os.path.splitext(os.path.basename(filename))[0]
        record.name = new_id.replace(" ", "_")[:id_cutoff]
    record.id = new_id
    return record
def records_from_zip_file(zip_file):
    """Return the records of all gb/fa/dna files found in a zip.

    Each file is tried as text (``string_to_record``), then as Snapgene,
    then as a document (Crazydoc). Records with a placeholder id are named
    after their file; the name is then cut at its first "." and used as
    both id and name. Every record gets a ``file_name`` attribute.

    Parameters
    ----------
    zip_file
      Converted with ``file_to_filelike_object`` before being opened by
      flametree.

    Raises
    ------
    ValueError
      If a file could not be read by any parser.
    """
    zip_file = flametree.file_tree(file_to_filelike_object(zip_file))
    placeholder_ids = [None, '', "<unknown id>", '.', ' ', "<unknown name>"]
    records = []
    for f in zip_file._all_files:
        if f._extension.lower() not in ['gb', 'fa', 'dna']:
            continue
        try:
            new_records, _ = string_to_record(f.read())
        except Exception:
            content_stream = BytesIO(f.read('rb'))
            try:
                record = snapgene_file_to_seqrecord(fileobject=content_stream)
                new_records = [record]
            except Exception:
                try:
                    # The Snapgene attempt may have consumed the stream.
                    content_stream.seek(0)
                    parser = crazydoc.CrazydocParser(
                        ['highlight_color', 'bold', 'underline'])
                    new_records = parser.parse_doc_file(content_stream)
                except Exception as err:
                    raise ValueError(
                        "Format not recognized for file " + f._path) from err
        # ``string_to_record`` can return a single record (raw ATGC input);
        # without this, the loop below would iterate over its letters.
        if not isinstance(new_records, list):
            new_records = [new_records]

        single_record = len(new_records) == 1
        for i, record in enumerate(new_records):
            name = record.id
            if name in placeholder_ids:
                number = ('' if single_record else ("%04d" % i))
                name = f._name_no_extension.replace(" ", "_") + number
            # NOTE(review): applied to every name, not only file-derived
            # ones — confirm this matches the intended nesting.
            name = name.split(".")[0]
            record.id = name
            record.name = name
            record.file_name = f._name_no_extension
        records += new_records
    # The leftover debug print of (name, id) pairs was removed.
    return records
def string_to_records(string):
    """Parse a sequence string into Biopython records and detect its format.

    The string is tried, in order, as a raw ATGC sequence, as FASTA, as
    Genbank and finally as Snapgene content.

    Parameters
    ----------
    string
      Text to interpret.

    Returns
    -------
    (records, fmt)
      A list of records and the detected format name: "ATGC", "fasta",
      "genbank" or "snapgene".

    Raises
    ------
    ValueError
      If none of the parsers could read the string.
    """
    if re.fullmatch("[ATGC]+", string) is not None:
        return [sequence_to_biopython_record(string)], "ATGC"

    for fmt in ("fasta", "genbank"):
        try:
            records = list(SeqIO.parse(StringIO(string), fmt))
        except Exception:
            # Best effort: a parser failure just means "not this format".
            continue
        if records:
            return records, fmt

    try:
        record = snapgene_file_to_seqrecord(filecontent=StringIO(string))
    except Exception:
        pass
    else:
        # Return the same (records, fmt) shape as every other branch so that
        # callers can always unpack two values.
        return [record], "snapgene"
    raise ValueError("Invalid sequence format")
def load_record(filename, linear=True, id='auto', upperize=True):
    """Load a Genbank/Fasta/Snapgene record from a file path.

    The parser is chosen from the (case-insensitive) end of ``filename``.
    The record gets ``linear`` as attribute. With ``id`` left to 'auto' the
    record id is kept, unless it is a placeholder, in which case the file's
    base name becomes the id and (spaces replaced, cut to 20 characters)
    the name. Any other non-None ``id`` sets both id and name the same way.
    If ``upperize`` is true the record is replaced by ``record.upper()``.
    """
    lowered = filename.lower()
    readers = (
        (("gb", "gbk"), lambda: SeqIO.read(filename, "genbank")),
        (('fa', 'fasta'), lambda: SeqIO.read(filename, "fasta")),
        ('.dna', lambda: snapgene_file_to_seqrecord(filename)),
    )
    for suffixes, read in readers:
        if lowered.endswith(suffixes):
            record = read()
            break
    else:
        raise ValueError('Unknown format for file: %s' % filename)

    if upperize:
        record = record.upper()
    record.linear = linear

    if id is None:
        return record
    if id != 'auto':
        record.id = id
        record.name = id.replace(" ", "_")[:20]
        return record

    new_id = record.id
    if new_id in [None, '', "<unknown id>", '.', ' ']:
        # No usable id in the file: fall back to the file's base name.
        new_id = os.path.splitext(os.path.basename(filename))[0]
        record.name = new_id.replace(" ", "_")[:20]
    record.id = new_id
    return record
def get(fname):
    """Return what ``snapgene_file_to_seqrecord`` reads from ``fname``."""
    record = snapgene_file_to_seqrecord(fname)
    return record
def load_record(
    record_file,
    topology="default_linear",
    id="auto",
    upperize=True,
    max_name_length=20,
    file_format=None,
):
    """Read a record (from many different input formats).

    Parameters
    ----------
    record_file
      A genbank file, a fasta file, a snapgene file, or a filelike object
      (at which case the format, genbank or fasta, must be given with
      ``file_format``).

    topology
      Passed as-is to ``set_record_topology``.
      NOTE(review): the accepted values are defined by that helper; the
      default here is "default_linear" — confirm it is a recognized value.

    id
      Will be used for the record ID and name. If auto, the record id will
      be unchanged unless it is ".", " ", etc. at which case it will be
      replaced by the file name (for a filelike object, by the base name of
      its ``name`` attribute when that is a string).

    upperize
      If true, the record is replaced by ``record.upper()``.

    max_name_length
      The name of the record will be truncated if too long to avoid
      Biopython exceptions being raised.

    file_format
      Indicates the file format for the parser, when record_file is a
      filelike object.

    Raises
    ------
    ValueError
      If ``file_format`` is None and the file extension is not recognized.
    """
    if file_format is not None:
        record = SeqIO.read(record_file, file_format)
    elif record_file.lower().endswith(("gb", "gbk")):
        record = SeqIO.read(record_file, "genbank")
    elif record_file.lower().endswith(("fa", "fasta")):
        record = SeqIO.read(record_file, "fasta")
    elif record_file.lower().endswith(".dna"):
        record = snapgene_file_to_seqrecord(record_file)
    else:
        raise ValueError("Unknown format for file: %s" % record_file)

    if upperize:
        record = record.upper()
    set_record_topology(record, topology)

    if id == "auto":
        id = record.id
        if id in [None, "", "<unknown id>", ".", " "]:
            # A filelike object is not a path: os.path.basename would raise
            # on it, so use its ``name`` attribute when there is one.
            if isinstance(record_file, str):
                source = record_file
            else:
                source = getattr(record_file, "name", None)
            if isinstance(source, str):
                id = os.path.splitext(os.path.basename(source))[0]
                record.name = id.replace(" ", "_")[:max_name_length]
        record.id = id
    elif id is not None:
        record.id = id
        record.name = id.replace(" ", "_")[:max_name_length]
    return record