def generate_files4blast_xen(dict_in, fasta_in, path_for_files):
    """
    Write one single-entry FASTA file for every key of dict_in that is also
    a sequence id in fasta_in.

    Inputs:
        dict_in - the dictionary where the keys are the xenopus reference
            names that have a phosphorylated residue
        fasta_in - the fasta file that has the xenopus reference information
        path_for_files - directory in which the "<key>.fa" files are created
    Outputs:
        Nothing is returned; the files are written as a side effect
    """
    # Load the whole reference file into memory, keyed by sequence id.
    with open(fasta_in) as reference_handle:
        sequences_by_id = {
            record.id: record.sequence_as_string()
            for record in fastaparser.Reader(reference_handle)
        }

    for key in dict_in.keys():
        # Keys that have no matching record in the reference are skipped.
        if key not in sequences_by_id:
            continue

        out_path = pathlib.Path(path_for_files, key + ".fa")
        # Hand fastaparser the (key, sequence) tuple for this single entry.
        with open(out_path, 'w') as out_handle:
            fastaparser.Writer(out_handle).writefasta(
                (key, sequences_by_id[key])
            )
def read_fasta_file(self, fasta_file):
    """Return the sequence string of the last record in a FASTA file.

    Every record is read; each one overwrites the previous, so only the
    final record's sequence is returned.

    Args:
        fasta_file: Path of the FASTA file to open.

    Returns:
        The value of ``sequence_as_string()`` for the last record.

    Raises:
        ValueError: If the file yields no records. Previously this case
            surfaced as an ``UnboundLocalError`` on the ``return``.
    """
    sequence = None
    # A separate name for the handle keeps ``fasta_file`` usable in the
    # error message below.
    with open(fasta_file) as handle:
        for record in fastaparser.Reader(handle):
            sequence = record.sequence_as_string()

    if sequence is None:
        raise ValueError(f"no FASTA records found in {fasta_file!r}")
    return sequence
def open_genome_file():
    """Return the sequences of every record in the genome file, concatenated.

    The file opened is ``genome_directory + genome_file_name``; both names
    are defined outside this function and are joined by plain ``+``, so no
    path separator is inserted between them.

    Returns:
        One string made of each record's ``sequence_as_string()`` in file
        order; the empty string when the file yields no records.
    """
    with open(genome_directory + genome_file_name) as fasta_file:
        # str.join builds the result in one pass instead of growing a
        # string with += once per record.
        return "".join(
            seq.sequence_as_string()
            for seq in fastaparser.Reader(fasta_file)
        )
def test_design_crispr_grnas(
    self,
    discover_client: DISCOVERClient,
):
    """Check the shape of the result returned by ``design_crispr_grnas``.

    The first record of the dummy organism FASTA file is used as the input
    sequence, with target indexes (500, 600). The result must be a dict
    containing 'guides' and 'target_indexes', with exactly 7 guides.
    """
    fasta_path = (
        get_project_root()
        / 'teselagen/examples/pytested/dummy_organism.fasta'
    )

    # Only the first record is needed, so stop reading right after it.
    with open(fasta_path) as handle:
        for record in fastaparser.Reader(handle):
            fasta_seq: str = cast(FastaSequence, record).sequence_as_string()
            break

    # Call the method under test.
    result = discover_client.design_crispr_grnas(
        sequence=fasta_seq,
        target_indexes=(500, 600),
    )

    assert isinstance(result, dict)
    assert 'guides' in result
    assert 'target_indexes' in result
    assert len(result['guides']) == 7
def generate_sequences(path, chr_='', genome_fasta="data/genomes/hg19_hard_50.fa"):
    """Run ``main_funct`` for ``path``, optionally on one genome record.

    Args:
        path: Passed through unchanged as the first argument of
            ``main_funct`` (defined elsewhere in this file).
        chr_: Header to select. When it is the empty string, ``main_funct``
            is called once with ``path`` only. Otherwise ``genome_fasta`` is
            scanned and ``main_funct(path, sequence)`` is called for every
            record whose header equals ``chr_`` exactly.
            NOTE(review): with ``parse_method='quick'`` the header is
            compared as fastaparser returns it -- confirm whether that
            includes the leading '>'.
        genome_fasta: FASTA file to scan. Defaults to the path that was
            previously hard-coded, so existing callers are unaffected.
    """
    if chr_ == '':
        main_funct(path)
        return

    # Imported here so the dependency is only required when a genome file is
    # actually read.
    import fastaparser

    with open(genome_fasta) as fasta_file:
        parser = fastaparser.Reader(fasta_file, parse_method='quick')
        for seq in parser:
            if seq.header == chr_:
                print(seq.header)
                main_funct(path, seq.sequence)
def fasta_for(path_to_zipfile):
    '''
    Return the RefSeq CDS sequences of a zipped data report as a dict.

    path_to_zipfile: The relative path to the zipfile containing the virus
        data report

    The archive member 'ncbi_dataset/data/cds.fna' is read, and the result
    maps each record id to its sequence string.
    '''
    with zipfile.ZipFile(path_to_zipfile, 'r') as archive:
        # ZipFile.open gives a binary stream; wrap it so it is read as text.
        with io.TextIOWrapper(archive.open('ncbi_dataset/data/cds.fna')) as cds_handle:
            sequences_by_id = {
                record.id: record.sequence_as_string()
                for record in fastaparser.Reader(cds_handle)
            }
    return sequences_by_id
def fasta_to_json(fasta_path: Path) -> None:
    """Convert a FASTA file into the benchmark JSON file.

    Each record is keyed by the second ``|``-separated field of its header
    and mapped to its sequence. The mapping is written to
    ``BENCHMARK_FILEPATH`` (defined elsewhere) as ``{"inputs": {...}}``.

    Args:
        fasta_path: FASTA file to read.

    Raises:
        IndexError: If a header contains no ``|`` separator.
    """
    with fasta_path.open() as fasta_file:
        records = fastaparser.Reader(fasta_file, parse_method="quick")
        sequences_by_protein = {
            record.header.split("|")[1]: record.sequence
            for record in records
        }

    payload = {"inputs": sequences_by_protein}
    with BENCHMARK_FILEPATH.open("w") as fp:
        json.dump(payload, fp)
def fasta_parser(path):
    """Load a gzip-compressed FASTA file into a pandas DataFrame.

    Args:
        path: Path of the gzip-compressed FASTA file, opened in text mode.

    Returns:
        A DataFrame with one row per record and the columns 'ID',
        'Description' and 'Sequence'. The frame has no rows when the file
        yields no records.
    """
    ids = []
    descriptions = []
    sequences = []

    print("Done")
    with gzip.open(path, 'rt') as fasta_file:
        reader = fastaparser.Reader(fasta_file)
        print("File loaded")
        # The reader streams from the open file handle, so it can only be
        # traversed once. Collect all three fields in a single pass:
        # iterating the reader again per column would leave the later
        # columns empty and make the column assignment fail on length.
        for seq in reader:
            ids.append(seq.id)
            descriptions.append(seq.description)
            sequences.append(seq.sequence_as_string())

    fasta_df = pd.DataFrame()
    fasta_df['ID'] = ids
    print("Column 1 of 3 added")
    fasta_df['Description'] = descriptions
    print("Column 2 of 3 added")
    fasta_df['Sequence'] = sequences
    print("Column 3 of 3 added")
    return fasta_df
def filter_fasta(fasta_in, rem_str, fasta_out):
    """
    Copy a fasta file, leaving out every record whose id matches rem_str.

    INPUTS:
        fasta_in - name with path from current folder for the input fasta
            file
        rem_str - the regular expression string that identifies the
            sequence ids to remove (e.g. isoform entries)
        fasta_out - name with path from current folder for the output fasta
            file
    OUTPUTS:
        kept - the dictionary {sequence id: sequence} of the records that
            were kept, used to define the human motif
        ** fasta_out is written
    """
    kept = {}

    # First pass: read the input and keep only the ids that do not match.
    with open(fasta_in) as source:
        for record in fastaparser.Reader(source):
            if re.search(rem_str, record.id):
                continue
            kept[record.id] = record.sequence_as_string()

    # Second pass: write the surviving records to the output file.
    with open(fasta_out, 'w') as sink:
        writer = fastaparser.Writer(sink)
        for seq_id, sequence in kept.items():
            writer.writefasta((seq_id, sequence))

    # Returned so that the caller can search the kept sequences for motifs.
    return kept
def winnow_fasta(fasta_fp, base_empress_df, out_stringent_fp):
    """Write the FASTA records flagged for stringent use to a new file.

    For each record in ``fasta_fp``, the rows of ``base_empress_df`` whose
    ``CONS_NAME`` column equals the record id are looked up. Records with no
    matching row are skipped. A record is written to ``out_stringent_fp``
    when its row's ``STRINGENT_TEST_COL`` value equals
    ``STRINGENT_INCLUDE_VAL``. The three constants are defined elsewhere.

    Args:
        fasta_fp: Path of the input FASTA file.
        base_empress_df: Metadata table indexed with ``.loc`` and ``.iat``
            (presumably a pandas DataFrame -- confirm against the caller).
        out_stringent_fp: Path of the output FASTA file. It is opened for
            writing before the input file is opened.

    Raises:
        ValueError: If more than one metadata row matches a record id.
    """
    with open(out_stringent_fp, 'w') as stringent_file, \
            open(fasta_fp) as fasta_file:
        for seq in fastaparser.Reader(fasta_file):
            # Match the record id against the consensus-name column.
            seq_metadata_df = base_empress_df.loc[
                base_empress_df[CONS_NAME] == seq.id]

            match_count = len(seq_metadata_df)
            if match_count == 0:
                continue
            if match_count > 1:
                # The original adjacent f-strings had no separating space
                # and produced "...row withconsensus sequence name...".
                raise ValueError(f"More than one metadata row with "
                                 f"consensus sequence name "
                                 f"'{seq.id}' found")

            seq_stringent_test_val = \
                seq_metadata_df.loc[:, STRINGENT_TEST_COL].iat[0]
            if seq_stringent_test_val == STRINGENT_INCLUDE_VAL:
                # Format the record only when it is actually written.
                stringent_file.write(seq.formatted_fasta() + '\n')
#!/usr/bin/env python3
# https://github.com/mor16fsu/bch5884
# Mitchell Roth
#
# Print the DNA sequence stored in mEGFP2.fa, then run the DNALib checks on
# it: validity check, nucleotide frequencies and transcription.
#
# NOTE(review): this layout was reconstructed from a flattened source. It
# assumes the read loop ends after the bare print() and that the analysis
# below runs once, after the loop -- confirm against the original file.

from DNALib import *
import fastaparser
from matplotlib import pyplot as plt

print("")
print("The DNA input sequence is:")

# Parse the FASTA file and echo every sequence it contains.
with open("mEGFP2.fa") as fasta_file:
    reader = fastaparser.Reader(fasta_file, parse_method='quick')
    for seq in reader:
        print(seq.sequence)
        print()

# From here on ``seq`` is the last record read by the loop above, so the
# analysis covers that record only (NameError if the file had no records).
print("")
print("If error in sequence print False, if no errors print None")
dna = seq.sequence
print(checkseq(dna))

print("")
print("The frequency of each nucleotide is:")
print(countnuc(dna))

print("")
print("The transcript of the sequence is:")
print(transcribe(dna))
def memGraph(seqeunce, N):
    """Plot the values of ``memFunction(seqeunce, N)`` against 1..N.

    Args:
        seqeunce: Passed unchanged to ``memFunction`` (defined elsewhere).
            The parameter keeps its original spelling so keyword callers
            continue to work.
        N: Upper bound of the x axis; the x values are 1, 2, ..., N.

    The x axis is labelled "r" and the y axis "F". The figure is shown with
    ``plt.show()``.
    """
    memFuncArray = memFunction(seqeunce, N)
    x = list(range(1, N + 1))
    plt.plot(x, memFuncArray)
    plt.xlabel("r")
    plt.ylabel("F")
    plt.show()


# Ask for the input file with a file dialog; the empty Tk root window is
# hidden so that only the dialog appears.
root = tk.Tk()
root.withdraw()
filepath = filedialog.askopenfilename()
if not filepath:
    # The dialog returns an empty value when it is cancelled; stop here
    # instead of failing inside open().
    raise SystemExit("No FASTA file selected")

# NOTE(review): the loop extent was reconstructed from a flattened source.
# It assumes the graph is drawn once per record -- confirm.
with open(filepath) as fasta_file:
    reader = fp.Reader(fasta_file)
    for seq in reader:
        sequence = seq.sequence_as_string()
        # Alternative analyses, kept disabled as in the original:
        # corGraph("A", "G", 20, seq.sequence_as_string())
        # dispGraph(len(sequence), subsum(sequenceToBinary("A", "G", "", sequence)))
        # memGraph(sequenceToBinary("A", "G", "", sequence), 60)
        corGraph("1", "1", 40, sequenceToBinary("A", "G", "", sequence))
        print("\n")
# No explicit close() is needed: the with-block has closed the file.
import sys
from collections import defaultdict

import fastaparser

# Print every ordered pair of distinct sequence ids "s k" where the last
# three characters of sequence s equal the first three characters of
# sequence k. The FASTA file is named by the first command-line argument.

seq_dict = {}
with open(sys.argv[1]) as fasta_file:
    parser = fastaparser.Reader(fasta_file)
    for seq in parser:
        seq_dict[seq.id] = seq.sequence_as_string()

# Group ids by their 3-character prefix, so each suffix is matched with one
# lookup instead of a scan over every other sequence. Insertion order is
# kept, so the pairs come out in the same order as an all-pairs scan.
ids_by_prefix = defaultdict(list)
for seq_id, sequence in seq_dict.items():
    ids_by_prefix[sequence[:3]].append(seq_id)

pairs = []
for s, sequence in seq_dict.items():
    # .get() avoids adding empty entries to the index for unmatched suffixes.
    for k in ids_by_prefix.get(sequence[-3:], ()):
        if s != k:
            pairs.append([s, k])

for first, second in pairs:
    print(first + " " + second)