예제 #1
0
파일: tools.py 프로젝트: jeqka24/CUBA
def string_to_records(string):
    """Parse a sequence string into a list of records and a format name.

    The string is tried, in order, as a raw sequence made only of the
    letters A, T, G and C, as Fasta text, as Genbank text (after being passed
    through ``fix_ice_genbank``) and finally as Snapgene content.

    Can also be used to detect a format.

    Returns
    -------
    (records, fmt)
      ``records`` is a list of records and ``fmt`` is one of "ATGC",
      "fasta", "genbank" or "snapgene".

    Raises
    ------
    ValueError
      If the string could not be parsed in any of these formats.
    """
    if re.fullmatch("[ATGC]+", string) is not None:
        return [SeqRecord(Seq(string, DNAAlphabet()))], "ATGC"

    for fmt in ("fasta", "genbank"):
        if fmt == "genbank":
            string = fix_ice_genbank(string)
        try:
            records = list(SeqIO.parse(StringIO(string), fmt))
        except Exception:
            # Not parseable in this format: try the next candidate.
            continue
        if records:
            return records, fmt
    try:
        record = snapgene_file_to_seqrecord(filecontent=StringIO(string))
    except Exception:
        # Best effort: fall through to the generic error below.
        pass
    else:
        # Same (records, fmt) shape as every other successful path.
        return [record], "snapgene"
    raise ValueError("Invalid sequence format")
예제 #2
0
def load_record(
    filename,
    topology="auto",
    default_topology="linear",
    id="auto",
    upperize=True,
):
    """Load one record from a Genbank/Fasta/Snapgene file or a filelike.

    Parameters
    ----------

    filename
      Path to a file, or an object with a ``read`` method. Filelike objects
      are parsed as Genbank. Paths are dispatched on their lowercased ending:
      "gb"/"gbk" (Genbank), "fa"/"fasta" (Fasta) or ".dna" (Snapgene).

    topology
      If "auto", ``set_record_topology`` is called with ``default_topology``
      and ``pass_if_already_set=True``. Any other value is passed directly
      to ``set_record_topology``.

    default_topology
      Topology passed to ``set_record_topology`` when ``topology`` is "auto".

    id
      If "auto", the record keeps its own id unless that id is blank (None,
      "", "<unknown id>", "." or " "), in which case the file name without
      extension is used. If None, id and name are left untouched. Any other
      value becomes the record id and the record name is derived from it
      (spaces replaced by underscores, truncated to 20 characters).
      "auto" is rejected for filelike objects.

    upperize
      If true, the record is replaced by ``record.upper()``.

    Raises
    ------
    ValueError
      If a filelike is given together with ``id="auto"``, or if the path
      ending is not one of the supported ones.
    """
    if hasattr(filename, "read"):
        # A filelike has no file name to fall back on, so reject id="auto"
        # before the stream gets consumed by the parser.
        if id == "auto":
            raise ValueError("Can't have id == 'auto' when reading filelikes.")
        record = SeqIO.read(filename, "genbank")
    elif filename.lower().endswith(("gb", "gbk")):
        record = SeqIO.read(filename, "genbank")
    elif filename.lower().endswith(("fa", "fasta")):
        record = SeqIO.read(filename, "fasta")
    elif filename.lower().endswith(".dna"):
        record = snapgene_file_to_seqrecord(filename)
    else:
        raise ValueError("Unknown format for file: %s" % filename)
    if upperize:
        record = record.upper()
    if topology == "auto":
        set_record_topology(record, default_topology, pass_if_already_set=True)
    else:
        set_record_topology(record, topology)
    if id == "auto":
        id = record.id
        if id in [None, "", "<unknown id>", ".", " "]:
            id = os.path.splitext(os.path.basename(filename))[0]
            record.name = id.replace(" ", "_")[:20]
        record.id = id
    elif id is not None:
        record.id = id
        record.name = id.replace(" ", "_")[:20]
    return record
예제 #3
0
    def get_description(self,file):
        '''
        Return the description stored in a sequence file.

        The format is chosen from the text after the last '.' in ``file``:
        'fasta', 'gb' (Genbank) or 'dna' (Snapgene). For 'txt' files and any
        other extension, '<unknown description>' is returned.

        If a fasta file holds more than one record, the string
        'Multiple line fastas not supported' is returned instead.

        Any error raised while reading a .dna file is re-raised after an
        installation hint for snapgene_reader has been printed.
        '''
        self.__check_valid_file(file)

        extension = file.split('.')[-1]
        # Fallback for 'txt' files and unrecognized extensions.
        desc = '<unknown description>'

        if extension == 'fasta':
            fastas = list(SeqIO.parse(file,'fasta'))
            if len(fastas) > 1:
                return 'Multiple line fastas not supported'
            desc = str(fastas[0].description)

        elif extension == 'gb':
            # Close the handle as soon as the record has been parsed.
            with open(file, "r") as handle:
                gb_record = SeqIO.read(handle, "genbank")
            desc = str(gb_record.description)

        elif extension == 'dna':
            try:
                seq_record = snapgene_file_to_seqrecord(file)
            except Exception:
                print('To read .dna files please install snapegenereader: pip install snapgene_reader - https://github.com/IsaacLuo/SnapGeneFileReader' )
                # Without a record there is nothing to return: propagate the
                # real error instead of failing later on an unbound name.
                raise
            desc = seq_record.description

        return desc
예제 #4
0
def string_to_record(string):
    """Parse a sequence string and return ``(parsed, fmt)``.

    The string is tried, in order, as a raw sequence made only of the
    letters A, T, G and C, as Fasta text, as Genbank text and finally as
    Snapgene content.

    Can also be used to detect a format.

    Returns
    -------
    (parsed, fmt)
      For a raw sequence, ``parsed`` is a single record (annotated with
      ``molecule_type`` "DNA") and ``fmt`` is "ATGC". For Fasta and Genbank,
      ``parsed`` is the list of parsed records and ``fmt`` is "fasta" or
      "genbank". For Snapgene, ``parsed`` is a single record and ``fmt`` is
      "snapgene".

    Raises
    ------
    ValueError
      If the string could not be parsed in any of these formats.
    """
    if re.fullmatch("[ATGC]+", string) is not None:
        if has_dna_alphabet:  # Biopython <1.78
            sequence = Seq(string, alphabet=DNAAlphabet())
        else:
            sequence = Seq(string)
        seqrecord = SeqRecord(sequence)
        seqrecord.annotations["molecule_type"] = "DNA"

        return seqrecord, "ATGC"

    for fmt in ("fasta", "genbank"):
        try:
            records = list(SeqIO.parse(StringIO(string), fmt))
        except Exception:
            # Not parseable in this format: try the next candidate.
            continue
        if records:
            return records, fmt
    try:
        record = snapgene_file_to_seqrecord(filecontent=StringIO(string))
    except Exception:
        # Best effort: fall through to the generic error below.
        pass
    else:
        # Return a (parsed, fmt) pair like every other successful path.
        return record, "snapgene"
    raise ValueError("Invalid sequence format")
예제 #5
0
 def get_name(self,file):
     '''
     Return the name or id stored in a sequence file.

     The format is chosen from the text after the last '.' in ``file``:
     'fasta' and 'gb' give the record id, 'txt' is delegated to
     ``__get_name_from_text`` and 'dna' (Snapgene) gives the record name.
     Any other extension gives 'unknown'.

     If a fasta file holds more than one record, the string
     'Multiple line fastas not supported' is returned instead.

     Any error raised while reading a .dna file is re-raised after an
     installation hint for snapgene_reader has been printed.
     '''
     self.__check_valid_file(file)

     name = 'unknown'
     extension = file.split('.')[-1]
     if extension == 'fasta':
         fastas = list(SeqIO.parse(file,'fasta'))
         if len(fastas) > 1:
             return 'Multiple line fastas not supported'
         name = str(fastas[0].id)

     elif extension == 'gb':
         # Close the handle as soon as the record has been parsed.
         with open(file, "r") as handle:
             gb_record = SeqIO.read(handle, "genbank")
         name = str(gb_record.id)

     elif extension == 'txt':
         name = self.__get_name_from_text(file)

     elif extension == 'dna':
         try:
             seq_record = snapgene_file_to_seqrecord(file)
         except Exception:
             print('To read .dna files please install snapegenereader: pip install snapgene_reader - https://github.com/IsaacLuo/SnapGeneFileReader' )
             # Without a record there is nothing to return: propagate the
             # real error instead of failing later on an unbound name.
             raise
         name = seq_record.name

     return name
예제 #6
0
def records_from_file(filepath):
    """Autodetect file format and load Biopython records from it.

    The file content is tried, in order, as sequence text (via
    ``string_to_record``), Snapgene, a Crazydoc document and a two-column
    spreadsheet. The first loader that succeeds wins.

    Returns a ``(records, fmt)`` pair where ``records`` is always a list.
    Raises ValueError if no loader succeeds.
    """
    with open(filepath, "rb") as handle:
        raw = handle.read()

    def _from_text():
        return string_to_record(raw.decode("utf-8"))

    def _from_snapgene():
        snap = snapgene_file_to_seqrecord(fileobject=BytesIO(raw))
        return [snap], "snapgene"

    def _from_doc():
        doc_parser = crazydoc.CrazydocParser(
            ["highlight_color", "bold", "underline"])
        return doc_parser.parse_doc_file(BytesIO(raw)), "doc"

    def _from_spreadsheet():
        table = spreadsheet_file_to_dataframe(filepath, header=None)
        rows = [
            sequence_to_biopython_record(sequence=seq, id=name, name=name)
            for name, seq in table.values
        ]
        return rows, "spreadsheet"

    for loader in (_from_text, _from_snapgene, _from_doc, _from_spreadsheet):
        try:
            records, fmt = loader()
        except Exception:
            continue
        break
    else:
        raise ValueError("Format not recognized for file " + filepath)

    if not isinstance(records, list):
        records = [records]
    return records, fmt
예제 #7
0
    def get_sequence(self,file):
        '''
        Return the sequence from several different file types

        Supported: .txt, .dna, .gb, .fasta

        The format is chosen from the text after the last '.' in ``file``.
        The raw sequence string is passed through ``self.clean_seq`` before
        being returned.

        If a fasta file holds more than one record, the string
        'Multiple line fastas not supported' is returned instead.

        Raises ValueError for any other extension. Any error raised while
        reading a .dna file is re-raised after an installation hint for
        snapgene_reader has been printed.
        '''
        self.__check_valid_file(file)
        extension = file.split('.')[-1]
        if extension == 'dna':
            try:
                seq_record = snapgene_file_to_seqrecord(file)
            except Exception:
                print('To read .dna files please install snapegenereader: pip install snapgene_reader - https://github.com/IsaacLuo/SnapGeneFileReader' )
                # Without a record there is nothing to return: propagate the
                # real error instead of failing later on an unbound name.
                raise
            sequence_str = str(seq_record.seq)

        elif extension == 'txt':
            sequence_str = self.__get_seq_from_txt(file)

        elif extension == 'gb':
            # Close the handle as soon as the record has been parsed.
            with open(file, "r") as handle:
                gb_record = SeqIO.read(handle, "genbank")
            sequence_str = str(gb_record.seq)

        elif extension == 'fasta':
            fastas = list(SeqIO.parse(file,'fasta'))
            if len(fastas) > 1:
                return 'Multiple line fastas not supported'
            sequence_str = str(fastas[0].seq)

        else:
            # Fail with a clear message rather than on an unbound variable.
            raise ValueError('Unsupported file extension: %s' % extension)

        return self.clean_seq(sequence_str)
예제 #8
0
파일: tools.py 프로젝트: jeqka24/CUBA
def records_from_data_file(data_file):
    """Autodetect the format of an uploaded file and load its records.

    ``data_file.content`` must contain the marker "base64," followed by the
    base64-encoded file content. The decoded content is tried, in order, as
    sequence text (via ``string_to_records``), Snapgene, a Crazydoc document
    and a two-column spreadsheet. The first loader that succeeds wins.

    Returns
    -------
    (records, fmt)
      ``records`` is always a list; ``fmt`` is the format reported by
      ``string_to_records``, or "snapgene", "doc" or "spreadsheet".

    Raises
    ------
    ValueError
      If no loader succeeds (the message includes ``data_file.name``).
    """
    content = b64decode(data_file.content.split("base64,")[1])

    def _from_text():
        return string_to_records(content.decode("utf-8"))

    def _from_snapgene():
        record = snapgene_file_to_seqrecord(fileobject=BytesIO(content))
        return [record], "snapgene"

    def _from_doc():
        parser = crazydoc.CrazydocParser(
            ["highlight_color", "bold", "underline"])
        return parser.parse_doc_file(BytesIO(content)), "doc"

    def _from_spreadsheet():
        df = spreadsheet_file_to_dataframe(data_file, header=None)
        rows = [
            sequence_to_biopython_record(sequence=seq, id=name, name=name)
            for name, seq in df.values
        ]
        return rows, "spreadsheet"

    for loader in (_from_text, _from_snapgene, _from_doc, _from_spreadsheet):
        try:
            records, fmt = loader()
        except Exception:
            # ``Exception`` rather than a bare except, so KeyboardInterrupt
            # and SystemExit are not swallowed while probing formats.
            continue
        break
    else:
        raise ValueError("Format not recognized for file " + data_file.name)

    if not isinstance(records, list):
        records = [records]
    return records, fmt
예제 #9
0
def test_snapgene_file_to_seqrecord(tmpdir):
    """Parse every .dna file in TEST_DIR and write each one out as Genbank.

    Fails if TEST_DIR holds no .dna file or if a parsed sequence has 10
    letters or fewer.
    """
    dna_names = [name for name in os.listdir(TEST_DIR) if name.endswith('.dna')]
    assert len(dna_names)
    for dna_name in dna_names:
        parsed = snapgene_file_to_seqrecord(os.path.join(TEST_DIR, dna_name))
        assert len(parsed.seq) > 10
        gb_path = os.path.join(str(tmpdir), dna_name + '.gb')
        with open(gb_path, 'w', encoding='utf-8') as gb_handle:
            SeqIO.write([parsed], gb_handle, 'genbank')
예제 #10
0
def test_parse(tmpdir):
    """Parse every .dna file in TEST_DIR and write each one out as Genbank.

    Fails if TEST_DIR holds no .dna file or if a parsed sequence has 10
    letters or fewer.
    """
    dna_names = [name for name in os.listdir(TEST_DIR) if name.endswith('.dna')]
    assert len(dna_names)
    for dna_name in dna_names:
        source_path = os.path.join(TEST_DIR, dna_name)
        parsed = snapgene_file_to_seqrecord(source_path)
        assert len(parsed.seq) > 10
        gb_path = os.path.join(str(tmpdir), dna_name + '.gb')
        with open(gb_path, 'w') as gb_handle:
            SeqIO.write([parsed], gb_handle, 'genbank')
예제 #11
0
def load_record(
    filepath,
    topology="default_to_linear",
    id="auto",
    upperize=True,
    max_name_length=20,
):
    """Read one Biopython record from a Genbank, Fasta or Snapgene file.

    Parameters
    ----------

    filepath
      Path to the file. The parser is picked from the lowercased ending:
      "gb"/"gbk" (Genbank), "fa"/"fasta" (Fasta) or ".dna" (Snapgene).

    topology
      Passed to ``set_record_topology``. Can be "circular", "linear",
      "default_to_circular" (will default to circular if
      ``annotations['topology']`` is not already set) or "default_to_linear".

    id
      If "auto", the record keeps its id, unless that id is blank, in which
      case the file name (without extension) is used, with spaces replaced
      by underscores and truncated to ``max_name_length``. If None, the id
      is left untouched. Any other value is used as the id after the same
      replacement and truncation.

    upperize
      If true, the record is replaced by ``record.upper()`` (recommended in
      this library, as the mix of upper and lower case can cause problems in
      Biopython's enzyme site search).

    max_name_length
      Maximum length of an id set by this function.

    Raises
    ------
    ValueError
      If the file ending is not one of the supported ones.
    """
    lowered = filepath.lower()
    parsers = (
        (("gb", "gbk"), lambda path: SeqIO.read(path, "genbank")),
        (("fa", "fasta"), lambda path: SeqIO.read(path, "fasta")),
        ((".dna",), lambda path: snapgene_file_to_seqrecord(path)),
    )
    for endings, parse in parsers:
        if lowered.endswith(endings):
            record = parse(filepath)
            break
    else:
        raise ValueError("Unknown format for file: %s" % filepath)

    if upperize:
        record = record.upper()
    set_record_topology(record, topology)

    if id == "auto":
        chosen = record.id
        if chosen in [None, "", "<unknown id>", ".", " "]:
            stem = os.path.splitext(os.path.basename(filepath))[0]
            chosen = stem.replace(" ", "_")[:max_name_length]
        record.id = chosen
    elif id is not None:
        record.id = id.replace(" ", "_")[:max_name_length]

    return record
예제 #12
0
def _load_records_from_zip_file(zip_file, use_file_names_as_ids=False):
    """Return all fasta/genbank/snapgene in a zip as biopython records.

    Only files with extension gb, gbk, fa or dna (case-insensitive) are
    read. Each file is first parsed with ``string_to_records`` and, if that
    fails, as a binary Snapgene file.

    Each record gets a ``source_file`` attribute (the file's path in the
    archive tree) and a ``file_name`` attribute (the file's name without
    extension). Records with a blank id get an id built from the file name,
    plus a 4-digit index when the file holds several records.

    If ``use_file_names_as_ids`` is true, the id of a record that is alone
    in its file is replaced by the base name of ``source_file`` without
    extension.

    Raises ValueError if a file cannot be parsed.

    Used via "load_records_from_files".
    """
    zip_file = flametree.file_tree(zip_file)
    blank_ids = [None, "", "<unknown id>", ".", " ", "<unknown name>"]
    records = []
    for f in zip_file._all_files:
        if f._extension.lower() not in ["gb", "gbk", "fa", "dna"]:
            continue
        try:
            new_records, _ = string_to_records(f.read())
            if not isinstance(new_records, list):
                new_records = [new_records]
        except Exception:
            content_stream = BytesIO(f.read("rb"))
            try:
                new_records = [
                    snapgene_file_to_seqrecord(fileobject=content_stream)
                ]
            except Exception:
                raise ValueError("Format not recognized for file " + f._path)

        single_record = len(new_records) == 1
        for i, record in enumerate(new_records):
            # Set source_file first: it is read below when file names are
            # used as ids.
            record.source_file = f._path
            name = record.id
            if name in blank_ids:
                number = "" if single_record else ("%04d" % i)
                name = f._name_no_extension.replace(" ", "_") + number
            record.id = name
            record.file_name = f._name_no_extension
            if use_file_names_as_ids and single_record:
                basename = os.path.basename(record.source_file)
                record.id = os.path.splitext(basename)[0]
        records += new_records
    return records
예제 #13
0
def load_record(filename, linear=True, name="unnamed", capitalize=True):
    """Load one record, choosing the parser from the file extension.

    The extension is looked up in ``formats_dict``. The "snapgene" format is
    read with ``snapgene_file_to_seqrecord``, any other format with
    ``SeqIO.read``. If ``capitalize`` is true the sequence is upper-cased.
    The record then gets ``linear`` as an attribute, ``name`` as its id, and
    a name derived from ``name`` (spaces replaced by underscores, truncated
    to 20 characters).
    """
    extension = os.path.splitext(filename)[1]
    fmt = formats_dict[extension]
    record = (
        snapgene_file_to_seqrecord(filename)
        if fmt == "snapgene"
        else SeqIO.read(filename, fmt)
    )
    if capitalize:
        record.seq = record.seq.upper()
    record.linear = linear
    record.id = name
    record.name = name.replace(" ", "_")[:20]
    return record
예제 #14
0
파일: tools.py 프로젝트: jeqka24/CUBA
def records_from_zip_file(zip_file, use_file_names_as_ids=False):
    """Return all records found in the sequence files of a zip archive.

    Only files with extension gb, gbk, fa or dna (case-insensitive) are
    read. Each file is first parsed with ``string_to_records``; if that
    fails it is tried as a binary Snapgene file, then as a Crazydoc
    document.

    Records with a blank id get an id built from the file name, plus a
    4-digit index when the file holds several records. Each record's name is
    set to that id, and the record gets ``file_name`` (file name without
    extension) and ``zip_file_name`` (``zip_file.name``) attributes.

    If ``use_file_names_as_ids`` is true, the id of a record that is alone
    in its file is replaced by the base name of ``file_name`` without
    extension.

    Raises ValueError if a file cannot be parsed.
    """
    zip_name = zip_file.name
    zip_file = flametree.file_tree(file_to_filelike_object(zip_file))
    blank_ids = [None, "", "<unknown id>", ".", " ", "<unknown name>"]
    records = []
    for f in zip_file._all_files:
        if f._extension.lower() not in ["gb", "gbk", "fa", "dna"]:
            continue
        try:
            new_records, _ = string_to_records(f.read())
            if not isinstance(new_records, list):
                new_records = [new_records]
        except Exception:
            content_stream = BytesIO(f.read("rb"))
            try:
                new_records = [
                    snapgene_file_to_seqrecord(fileobject=content_stream)
                ]
            except Exception:
                # The failed Snapgene attempt may have consumed part of the
                # stream, so rewind before handing it to the next parser.
                content_stream.seek(0)
                try:
                    parser = crazydoc.CrazydocParser(
                        ["highlight_color", "bold", "underline"])
                    new_records = parser.parse_doc_file(content_stream)
                except Exception:
                    raise ValueError("Format not recognized for file " +
                                     f._path)

        single_record = len(new_records) == 1
        for i, record in enumerate(new_records):
            name = record.id
            if name in blank_ids:
                number = "" if single_record else ("%04d" % i)
                name = f._name_no_extension.replace(" ", "_") + number
            record.id = name
            record.name = name
            record.file_name = f._name_no_extension
            record.zip_file_name = zip_name
            if use_file_names_as_ids and single_record:
                basename = os.path.basename(record.file_name)
                record.id = os.path.splitext(basename)[0]
        records += new_records
    return records
예제 #15
0
def load_records(path, capitalize=True):
    """Load all records from one file, or from a list/tuple of files.

    Parameters
    ----------

    path
      A file path, or a list/tuple of paths (which may themselves be
      nested lists/tuples). The parser is picked by looking the file
      extension up in ``formats_dict``; the "snapgene" format is read with
      ``snapgene_file_to_seqrecord``, any other with ``SeqIO.parse``.

    capitalize
      If true, each record's sequence is upper-cased.

    Returns
    -------
    list
      The loaded records. A record whose id is blank gets an id built from
      the file path (path separators replaced by underscores), plus a
      4-digit index when the file holds several records.
    """
    if isinstance(path, (list, tuple)):
        # Forward ``capitalize`` so the caller's choice also applies when a
        # collection of paths is given.
        return [
            record
            for p in path
            for record in load_records(p, capitalize=capitalize)
        ]
    no_extension, extension = os.path.splitext(path)
    fmt = formats_dict[extension]
    if fmt == "snapgene":
        records = [snapgene_file_to_seqrecord(path)]
    else:
        records = list(SeqIO.parse(path, fmt))
    for i, record in enumerate(records):
        if capitalize:
            record.seq = record.seq.upper()
        if str(record.id) in ["None", "", "<unknown id>", ".", " "]:
            record.id = path.replace("/", "_").replace("\\", "_")
            if len(records) > 1:
                record.id += "_%04d" % i
    return records
예제 #16
0
    def get_description(self, file_path):
        '''
        Attempt to find the text description from a file

        The format is chosen from the text after the last '.' in
        ``file_path``: 'fasta', 'gb' (Genbank) or 'dna' (Snapgene).

        Parameters
        ----------
        file_path : str
            string path to file.

        Returns
        -------
        str
            The record description; '<unknown description>' for 'txt' files
            and any other extension; or the string
            'Multiple line fastas not supported' if a fasta file holds more
            than one record.

        Raises
        ------
        SnapGeneMissingError
            If reading a .dna file fails. The original error is chained as
            the cause.

        '''
        self.__check_valid_file(file_path)

        extension = file_path.split('.')[-1]
        # Fallback for 'txt' files and unrecognized extensions.
        desc = '<unknown description>'

        if extension == 'fasta':
            fastas = list(SeqIO.parse(file_path, 'fasta'))
            if len(fastas) > 1:
                return 'Multiple line fastas not supported'
            desc = str(fastas[0].description)

        elif extension == 'gb':
            # Close the handle as soon as the record has been parsed.
            with open(file_path, "r") as handle:
                gb_record = SeqIO.read(handle, "genbank")
            desc = str(gb_record.description)

        elif extension == 'dna':
            try:
                seq_record = snapgene_file_to_seqrecord(file_path)
            except Exception as err:
                msg = 'To read .dna files please install snapegenereader: '\
                      ' pip install snapgene_reader - '\
                          'https://github.com/IsaacLuo/SnapGeneFileReader'
                # Keep the underlying failure visible in the traceback.
                raise SnapGeneMissingError(msg) from err

            desc = seq_record.description

        return desc
예제 #17
0
def load_records_from_file(filepath):
    """Autodetect file format and load biopython records from it.

    The file content is first parsed as text with ``string_to_records``; if
    that fails it is read as a binary Snapgene file. Every record gets a
    ``source_file`` attribute set to ``filepath``.

    Returns a ``(records, fmt)`` pair where ``records`` is always a list.
    Raises ValueError if neither parser succeeds.
    """
    with open(filepath, "rb") as stream:
        raw = stream.read()
    try:
        records, fmt = string_to_records(raw.decode("utf-8"))
    except Exception:
        try:
            snap = snapgene_file_to_seqrecord(fileobject=BytesIO(raw))
        except Exception:
            raise ValueError("Format not recognized for file " + filepath)
        records, fmt = [snap], "snapgene"
    if not isinstance(records, list):
        records = [records]
    for loaded in records:
        loaded.source_file = filepath
    return records, fmt
예제 #18
0
    def get_sequence(self, file):
        '''
        get a nucleotide sequence

        The format is chosen from the text after the last '.' in ``file``:
        'dna' (Snapgene), 'txt', 'gb' (Genbank) or 'fasta'. The raw sequence
        string is passed through ``self.clean_seq`` before being returned.

        Parameters
        ----------
        file : path
            Path to the file to open.

        Returns
        -------
        str
            The cleaned sequence string, or the string
            'Multiple line fastas not supported' if a fasta file holds more
            than one record.

        Raises
        ------
        SnapGeneMissingError
            If reading a .dna file fails. The original error is chained as
            the cause.
        ValueError
            If the extension is not one of the supported ones.

        '''
        self.__check_valid_file(file)
        extension = file.split('.')[-1]
        if extension == 'dna':
            try:
                seq_record = snapgene_file_to_seqrecord(file)
            except Exception as err:
                msg = 'To read .dna files please install snapegenereader: '\
                      ' pip install snapgene_reader - '\
                          'https://github.com/IsaacLuo/SnapGeneFileReader'
                # Keep the underlying failure visible in the traceback.
                raise SnapGeneMissingError(msg) from err
            sequence_str = str(seq_record.seq)

        elif extension == 'txt':
            sequence_str = self.__get_seq_from_txt(file)

        elif extension == 'gb':
            # Close the handle as soon as the record has been parsed.
            with open(file, "r") as handle:
                gb_record = SeqIO.read(handle, "genbank")
            sequence_str = str(gb_record.seq)

        elif extension == 'fasta':
            fastas = list(SeqIO.parse(file, 'fasta'))
            if len(fastas) > 1:
                return 'Multiple line fastas not supported'
            sequence_str = str(fastas[0].seq)

        else:
            # Fail with a clear message rather than on an unbound variable.
            raise ValueError('Unsupported file extension: %s' % extension)

        return self.clean_seq(sequence_str)
예제 #19
0
def records_from_zip_file(zip_file):
    """Return all fasta/genbank/snapgene in a zip as Biopython records.

    Only files with extension gb, gbk, fa or dna (case-insensitive) are
    read. Each file is first parsed with ``string_to_record``; if that fails
    it is tried as a binary Snapgene file, then as a Crazydoc document.

    Records with a blank id get an id built from the file name, plus a
    4-digit index when the file holds several records. Each record's name is
    set to that id and the record gets a ``file_name`` attribute (file name
    without extension).

    Raises ValueError if a file cannot be parsed.
    """
    zip_file = flametree.file_tree(zip_file)
    blank_ids = [None, "", "<unknown id>", ".", " ", "<unknown name>"]
    records = []
    for f in zip_file._all_files:
        if f._extension.lower() not in ["gb", "gbk", "fa", "dna"]:
            continue
        try:
            new_records, _ = string_to_record(f.read())
            # string_to_record returns a single record for raw ATGC input:
            # wrap it so the code below always works on a list.
            if not isinstance(new_records, list):
                new_records = [new_records]
        except Exception:
            content_stream = BytesIO(f.read("rb"))
            try:
                new_records = [
                    snapgene_file_to_seqrecord(fileobject=content_stream)
                ]
            except Exception:
                # The failed Snapgene attempt may have consumed part of the
                # stream, so rewind before handing it to the next parser.
                content_stream.seek(0)
                try:
                    parser = crazydoc.CrazydocParser(
                        ["highlight_color", "bold", "underline"])
                    new_records = parser.parse_doc_file(content_stream)
                except Exception:
                    raise ValueError("Format not recognized for file " +
                                     f._path)

        single_record = len(new_records) == 1
        for i, record in enumerate(new_records):
            name = record.id
            if name in blank_ids:
                number = "" if single_record else ("%04d" % i)
                name = f._name_no_extension.replace(" ", "_") + number
            record.id = name
            record.name = name
            record.file_name = f._name_no_extension
        records += new_records
    return records
예제 #20
0
def load_records(path):
    """Load all records from one file, or from a list/tuple of files.

    The parser is picked from the file extension: '.fa' (fasta), '.gb' and
    '.gbk' (genbank) or '.dna' (snapgene). Any other extension raises
    KeyError. A record whose id is blank gets an id built from the file path
    (path separators replaced by underscores), plus a 4-digit index when the
    file holds several records.
    """
    if isinstance(path, (list, tuple)):
        collected = []
        for item in path:
            collected.extend(load_records(item))
        return collected

    formats = {
        '.fa': 'fasta',
        '.gb': 'genbank',
        '.gbk': 'genbank',
        '.dna': 'snapgene'
    }
    fmt = formats[os.path.splitext(path)[1]]
    if fmt == 'snapgene':
        records = [snapgene_file_to_seqrecord(path)]
    else:
        records = list(SeqIO.parse(path, fmt))

    several = len(records) > 1
    blank_ids = ['None', '', "<unknown id>", '.', ' ']
    for index, record in enumerate(records):
        if str(record.id) not in blank_ids:
            continue
        record.id = path.replace("/", "_").replace("\\", "_")
        if several:
            record.id += "_%04d" % index
    return records
예제 #21
0
def records_from_data_file(data_file):
    """Autodetect the format of an uploaded file and load its records.

    ``data_file.content`` must contain the marker "base64," followed by the
    base64-encoded file content. The decoded content is tried, in order, as
    sequence text (via ``string_to_record``), Snapgene and a Crazydoc
    document. The first loader that succeeds wins.

    Returns
    -------
    (records, fmt)
      ``records`` is always a list; ``fmt`` is the format reported by
      ``string_to_record``, or 'snapgene' or 'doc'.

    Raises
    ------
    ValueError
      If no loader succeeds (the message includes ``data_file.name``).
    """
    content = b64decode(data_file.content.split("base64,")[1])
    try:
        records, fmt = string_to_record(content.decode("utf-8"))
    except Exception:
        # ``Exception`` rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not swallowed while probing formats.
        try:
            record = snapgene_file_to_seqrecord(fileobject=BytesIO(content))
            records, fmt = [record], 'snapgene'
        except Exception:
            try:
                parser = crazydoc.CrazydocParser(
                    ['highlight_color', 'bold', 'underline'])
                records = parser.parse_doc_file(BytesIO(content))
                fmt = 'doc'
            except Exception:
                raise ValueError("Format not recognized for file " +
                                 data_file.name)
    if not isinstance(records, list):
        records = [records]
    return records, fmt
예제 #22
0
def load_record(filepath, linear=True, name="unnamed", file_format="auto"):
    """Load a FASTA/Genbank/Snapgene record.

    If ``file_format`` is not "auto", it is passed to ``SeqIO.read`` as the
    format. Otherwise the parser is picked from the lowercased ending of
    ``filepath``: "gb"/"gbk" (Genbank), "fa"/"fasta" (Fasta) or ".dna"
    (Snapgene); any other ending raises ValueError.

    The record gets ``linear`` as an attribute. Unless ``name`` is
    "unnamed", it becomes the record id, and the record name is derived
    from it (spaces replaced by underscores, truncated to 20 characters).

    Note that reading Snapgene records requires the library snapgene_reader
    installed.
    """
    if file_format != "auto":
        record = SeqIO.read(filepath, file_format)
    else:
        lowered = filepath.lower()
        if lowered.endswith(("gb", "gbk")):
            record = SeqIO.read(filepath, "genbank")
        elif lowered.endswith(("fa", "fasta")):
            record = SeqIO.read(filepath, "fasta")
        elif lowered.endswith(".dna"):
            record = snapgene_file_to_seqrecord(filepath)
        else:
            raise ValueError("Unknown format for file: %s" % filepath)

    record.linear = linear
    if name == "unnamed":
        return record
    record.id = name
    record.name = name.replace(" ", "_")[:20]
    return record
예제 #23
0
def load_record(filename, record_id="auto", upperize=False, id_cutoff=20):
    """Load a Fasta/Genbank/Snapgene file as a Biopython record.

    Parameters
    ==========
    filename
      Path to the file containing the record. The parser is picked from the
      lowercased ending: "gb"/"gbk", "fa"/"fasta" or ".dna". Any other
      ending raises ValueError.

    record_id
      Id of the record (leave to "auto" to keep the record's original Id,
      which will default to the file name if the record has no Id). If
      None, the Id and name are left untouched.

    upperize
      If true, the record is replaced by ``record.upper()``.

    id_cutoff
      Whenever this function sets the record's name, the name is truncated
      at this cutoff.
    """
    has_ending = filename.lower().endswith
    if has_ending(("gb", "gbk")):
        record = SeqIO.read(filename, "genbank")
    elif has_ending(("fa", "fasta")):
        record = SeqIO.read(filename, "fasta")
    elif has_ending(".dna"):
        record = snapgene_file_to_seqrecord(filename)
    else:
        raise ValueError("Unknown format for file: %s" % filename)

    if upperize:
        record = record.upper()

    if record_id == "auto":
        resolved = record.id
        if resolved in [None, "", "<unknown id>", ".", " "]:
            resolved = os.path.splitext(os.path.basename(filename))[0]
            record.name = resolved.replace(" ", "_")[:id_cutoff]
        record.id = resolved
    elif record_id is not None:
        record.id = record_id
        record.name = record_id.replace(" ", "_")[:id_cutoff]
    return record
예제 #24
0
def records_from_zip_file(zip_file):
    """Return all records found in the sequence files of a zip archive.

    Only files with extension gb, fa or dna (case-insensitive) are read.
    Each file is first parsed with ``string_to_record``; if that fails it is
    tried as a binary Snapgene file, then as a Crazydoc document.

    Records with a blank id get an id built from the file name, plus a
    4-digit index when the file holds several records. The id is then cut at
    its first "." and also used as the record name. Each record gets a
    ``file_name`` attribute (file name without extension).

    Raises ValueError if a file cannot be parsed.
    """
    zip_file = flametree.file_tree(file_to_filelike_object(zip_file))
    blank_ids = [None, '', "<unknown id>", '.', ' ', "<unknown name>"]
    records = []
    for f in zip_file._all_files:
        if f._extension.lower() not in ['gb', 'fa', 'dna']:
            continue
        try:
            new_records, _ = string_to_record(f.read())
            # string_to_record returns a single record for raw ATGC input:
            # wrap it so the code below always works on a list.
            if not isinstance(new_records, list):
                new_records = [new_records]
        except Exception:
            content_stream = BytesIO(f.read('rb'))
            try:
                new_records = [
                    snapgene_file_to_seqrecord(fileobject=content_stream)
                ]
            except Exception:
                # The failed Snapgene attempt may have consumed part of the
                # stream, so rewind before handing it to the next parser.
                content_stream.seek(0)
                try:
                    parser = crazydoc.CrazydocParser(
                        ['highlight_color', 'bold', 'underline'])
                    new_records = parser.parse_doc_file(content_stream)
                except Exception:
                    raise ValueError("Format not recognized for file " +
                                     f._path)

        single_record = len(new_records) == 1
        for i, record in enumerate(new_records):
            name = record.id
            if name in blank_ids:
                number = ('' if single_record else ("%04d" % i))
                name = f._name_no_extension.replace(" ", "_") + number
            name = name.split(".")[0]
            record.id = name
            record.name = name
            record.file_name = f._name_no_extension
        records += new_records
    return records
예제 #25
0
def string_to_records(string):
    """Parse a sequence string into a list of records and a format name.

    The string is tried, in order, as a raw sequence made only of the
    letters A, T, G and C, as Fasta text, as Genbank text and finally as
    Snapgene content.

    Can also be used to detect a format.

    Returns
    -------
    (records, fmt)
      ``records`` is a list of records and ``fmt`` is one of "ATGC",
      "fasta", "genbank" or "snapgene".

    Raises
    ------
    ValueError
      If the string could not be parsed in any of these formats.
    """
    if re.fullmatch("[ATGC]+", string) is not None:
        return [sequence_to_biopython_record(string)], "ATGC"

    for fmt in ("fasta", "genbank"):
        try:
            records = list(SeqIO.parse(StringIO(string), fmt))
        except Exception:
            # Not parseable in this format: try the next candidate.
            continue
        if records:
            return records, fmt
    try:
        record = snapgene_file_to_seqrecord(filecontent=StringIO(string))
    except Exception:
        # Best effort: fall through to the generic error below.
        pass
    else:
        # Same (records, fmt) shape as every other successful path.
        return [record], "snapgene"
    raise ValueError("Invalid sequence format")
예제 #26
0
def load_record(filename, linear=True, id='auto', upperize=True):
    """Load one record from a Genbank, Fasta or Snapgene file.

    The parser is picked from the lowercased ending of ``filename``:
    "gb"/"gbk" (Genbank), "fa"/"fasta" (Fasta) or ".dna" (Snapgene). Any
    other ending raises ValueError.

    If ``upperize`` is true the record is replaced by ``record.upper()``.
    The record gets ``linear`` as an attribute.

    If ``id`` is 'auto', the record keeps its id unless that id is blank, in
    which case the file name without extension is used. If ``id`` is None,
    id and name are left untouched. Any other value becomes the record id.
    Whenever this function sets the id to a new value, the record name is
    derived from it (spaces replaced by underscores, truncated to 20
    characters).
    """
    lowered = filename.lower()
    parsers = (
        (("gb", "gbk"), lambda path: SeqIO.read(path, "genbank")),
        (('fa', 'fasta'), lambda path: SeqIO.read(path, "fasta")),
        (('.dna',), lambda path: snapgene_file_to_seqrecord(path)),
    )
    for endings, parse in parsers:
        if lowered.endswith(endings):
            record = parse(filename)
            break
    else:
        raise ValueError('Unknown format for file: %s' % filename)

    if upperize:
        record = record.upper()
    record.linear = linear

    if id == 'auto':
        resolved = record.id
        if resolved in [None, '', "<unknown id>", '.', ' ']:
            resolved = os.path.splitext(os.path.basename(filename))[0]
            record.name = resolved.replace(" ", "_")[:20]
        record.id = resolved
    elif id is not None:
        record.id = id
        record.name = id.replace(" ", "_")[:20]

    return record
예제 #27
0
def get(fname):
    """Return the result of ``snapgene_file_to_seqrecord`` for ``fname``."""
    record = snapgene_file_to_seqrecord(fname)
    return record
예제 #28
0
def load_record(
    record_file,
    topology="default_linear",
    id="auto",
    upperize=True,
    max_name_length=20,
    file_format=None,
):
    """Read a record (from many different input formats).

    Parameters
    ----------

    record_file
      A genbank file, a fasta file, a snapgene file, or a filelike object
      (at which case the format, genbank or fasta, must be given with
      ``file_format``)

    topology
      Passed unchanged to ``set_record_topology``.
      NOTE(review): the previous docstring listed "circular", "linear" and
      "default_to_circular" but was cut off mid-sentence, while the default
      here is "default_linear" -- confirm the accepted values against
      ``set_record_topology``.

    id
      Will be used for the record ID and name. If auto, the record id will
      be unchanged unless it is ".", " ", etc. at which case it will be
      replaced by the file name (when one is available). If None, id and
      name are left untouched.

    upperize
      If true, the sequence will get upperized.

    max_name_length
      The name of the record will be truncated if too long to avoid Biopython
      exceptions being raised.

    file_format
      Indicates the file format for the parser, when record_file is a filelike
      object. When given, it is passed to ``SeqIO.read`` and the file ending
      is not inspected.

    Raises
    ------
    ValueError
      If ``file_format`` is None and the file ending is not one of
      "gb"/"gbk", "fa"/"fasta" or ".dna".
    """
    if file_format is not None:
        record = SeqIO.read(record_file, file_format)
    elif record_file.lower().endswith(("gb", "gbk")):
        record = SeqIO.read(record_file, "genbank")
    elif record_file.lower().endswith(("fa", "fasta")):
        record = SeqIO.read(record_file, "fasta")
    elif record_file.lower().endswith(".dna"):
        record = snapgene_file_to_seqrecord(record_file)
    else:
        raise ValueError("Unknown format for file: %s" % record_file)
    if upperize:
        record = record.upper()
    set_record_topology(record, topology)
    if id == "auto":
        id = record.id
        if id in [None, "", "<unknown id>", ".", " "]:
            # A filelike object is not a path: use its ``name`` attribute if
            # it has a string one, otherwise keep the id as parsed.
            if isinstance(record_file, str):
                source_name = record_file
            else:
                source_name = getattr(record_file, "name", None)
            if isinstance(source_name, str):
                id = os.path.splitext(os.path.basename(source_name))[0]
                record.name = id.replace(" ", "_")[:max_name_length]
        record.id = id
    elif id is not None:
        record.id = id
        record.name = id.replace(" ", "_")[:max_name_length]

    return record