def __init__(self, pdb_code, sequence_annotations, SurfRes=False, pocket=False, lpocket=False):
    """Locate per-structure result files for a PDB entry.

    pdb_code: PDB identifier used as the file-name stem.
    sequence_annotations: annotation data stored as-is on the instance.
    SurfRes/pocket/lpocket: when True, search the working directory for the
        matching '<pdb_code>_SurfRes.txt' / '_pocketres.txt' / '_lpocket.txt'
        file and store its path on the corresponding attribute ('' when absent).
    """
    self.filename = pdb_code
    self.sequence_annotations = sequence_annotations
    # One search pass per requested file kind.  The original repeated this
    # loop three times and reset the attribute to '' on every non-matching
    # file, so a hit could be clobbered by a later miss; defaulting to ''
    # before the loop and breaking on the first match fixes that.
    searches = ((SurfRes, '_SurfRes.txt', 'surfres_file'),
                (pocket, '_pocketres.txt', 'pocketres_file'),
                (lpocket, '_lpocket.txt', 'lpocket_file'))
    for enabled, suffix, attr in searches:
        if not enabled:
            continue
        setattr(self, attr, '')
        file_handlers = FileHandlers()
        file_paths = file_handlers.search_directory()
        for txt_file in file_handlers.find_files(file_paths, 'txt'):
            if (self.filename + suffix) == file_handlers.get_file_name(txt_file):
                setattr(self, attr, txt_file)
                break
def _get_outfile(self): file_handlers = FileHandlers() file_paths = file_handlers.search_directory() out_files = file_handlers.find_files(file_paths, 'out') if (self.pdb_code != '' and self.psiblast != ''): for out_file in out_files: if (self.pdb_code + '_mutants') == \ file_handlers.get_file_name(out_file).split('.')[0]: self.ddG_results_filepath = out_file elif (self.psiblast + '_mutants') == \ file_handlers.get_file_name(out_file).split('.')[0]: self.llikelihood_filepath = out_file elif (self.pdb_code != '' and self.psiblast == ''): for out_file in out_files: if (self.pdb_code + '_mutants') == \ file_handlers.get_file_name(out_file).split('.')[0]: self.ddG_results_filepath = out_file print "Fetching data from %s ....." % \ file_handlers.get_file_name(out_file) elif (self.pdb_code == '' and self.psiblast != ''): for out_file in out_files: if (self.psiblast + '_mutants') == \ file_handlers.get_file_name(out_file).split('.')[0]: self.llikelihood_filepath = out_file else: print "You have not specified any results data to parse." exit(1)
def _get_file_path(self):
    """Find the Rosetta-minimized '<filename>_0001' PDB and derive the POPS
    output-file name and full path from it.

    Side effect: changes the working directory to ./database/pdbs/pdb.
    """
    os.chdir("./database/pdbs/pdb")
    handlers = FileHandlers()
    candidates = handlers.find_files(handlers.search_directory(), 'pdb')
    wanted = self.filename + '_0001'
    for candidate in candidates:
        stem = handlers.get_file_name(candidate).split('.')[0]
        if stem == wanted:
            self.file_path = candidate
            self.out_file = stem + '_pops.out'
            self.out_file_path = self.dir_path + '/' + self.out_file
def _get_data(self, file_tag):
    """Load the lines of '<filename>_<tag>.txt' into self.data.

    file_tag: underscore-separated tag; only its second field is used to
    select the file suffix.
    """
    handlers = FileHandlers()
    text_files = handlers.find_files(handlers.search_directory(), 'txt')
    wanted_suffix = file_tag.split('_')[1] + '.txt'
    for text_file in text_files:
        name_fields = handlers.get_file_name(text_file).split('_')
        if name_fields[0] == self.filename and name_fields[1] == wanted_suffix:
            handle = open(text_file)
            self.data = handle.readlines()
            handle.close()
def _get_data(self, file_tag):
    """Read '<filename>_<tag>.txt' into self.data (a list of raw lines)."""
    fh = FileHandlers()
    for candidate in fh.find_files(fh.search_directory(), 'txt'):
        fields = fh.get_file_name(candidate).split('_')
        if self.filename != fields[0]:
            continue
        if fields[1] == file_tag.split('_')[1] + '.txt':
            source = open(candidate)
            try:
                self.data = source.readlines()
            finally:
                source.close()
def _get_file_path(self):
    """Locate '<filename>.pdb', record its path, and derive the POPS
    out-file name and full path.

    Side effect: changes the working directory to ./database/pdbs/pdb.
    """
    os.chdir("./database/pdbs/pdb")
    fh = FileHandlers()
    for candidate in fh.find_files(fh.search_directory(), 'pdb'):
        stem = fh.get_file_name(candidate).split('.')[0]
        if stem != self.filename:
            continue
        self.file_path = candidate
        self.out_file = stem + '_pops.out'
        self.out_file_path = self.dir_path + '/' + self.out_file
def _get_filename(self):
    """Return the stem of the PDB file matching '<filename><tag>'.

    Returns None (implicitly, as in the original) when no file matches.
    """
    file_handlers = FileHandlers()
    file_paths = file_handlers.search_directory()
    pdb_files = file_handlers.find_files(file_paths, "pdb")
    # The original branched on tag == "" / tag != "", but because
    # filename + "" == filename the two branches were identical; collapsed.
    target = self.filename + self.tag
    for pdb_file in pdb_files:
        stem = file_handlers.get_file_name(pdb_file).split(".")[0]
        if stem == target:
            return stem
def _get_file_path(self, ligand=False, pdb=False):
    """Set self.file_path to the matching PDB file.

    ligand=True -> match '<filename>.pdb'
    pdb=True    -> match the Rosetta-minimized '<filename>_0001.pdb'
    Leaves self.file_path as '' when nothing matches or neither flag is set.
    """
    self.file_path = ''
    fh = FileHandlers()
    pdb_paths = fh.find_files(fh.search_directory(), 'pdb')
    if ligand == True:
        wanted = self.filename
    elif pdb == True:
        wanted = self.filename + '_0001'
    else:
        return
    for pdb_path in pdb_paths:
        if fh.get_file_name(pdb_path).split('.')[0] == wanted:
            self.file_path = pdb_path
def _get_filepath(self, data_file=False, pdb_file=False): file_handlers = FileHandlers() file_paths = file_handlers.search_directory() if data_file == True: files = file_handlers.find_files(file_paths, 'txt') for path in files: if (self.filename + '_mutant_list') == file_handlers.get_file_name(path).split('.')[0]: return path elif pdb_file == True: files = file_handlers.find_files(file_paths, 'pdb') for path in files: if (self.filename + '_0001') == file_handlers.get_file_name(path).split('.')[0]: return path else: print "Specify file type"
class PickleFasta: def __init__(self): self.file_handlers = FileHandlers() def _get_fasta_files(self): file_paths = self.file_handlers.search_directory() fasta_files = self.file_handlers.find_files(file_paths, 'faa') print "There are %d .faa files in this directory" % len(fasta_files) return fasta_files def pickle_organism_fasta(self): fasta_files = self._get_fasta_files() fasta_dictionary = {} for fasta_file in fasta_files: file_name = self.file_handlers.get_file_name(fasta_file) name_list = file_name.split('.') Data = open(fasta_file) D = Data.readlines() Data.close() for d in D: if d.startswith('>'): d_list = d.split(' ') if name_list[0] in fasta_dictionary: fasta_dictionary[name_list[0]].append(d_list[0].lstrip('>')) else: fasta_dictionary[name_list[0]] = [d_list[0].lstrip('>')] else: pass return fasta_dictionary
def main(): ## THIS WORKED, DON'T ERASE ## Save organism data to pickled dictionary #pickle_fasta = PickleFasta() #fasta_dictionary = pickle_fasta.pickle_organism_fasta() #print "There are %d entries in the fasta_dictionary" % len(fasta_dictionary) #pickle.dump(fasta_dictionary, open('organism_dictionary.pkl', 'wb')) file_handlers = FileHandlers() dm_files = get_dm_files() # Load the dictionary back from the pickle file print "Loading fasta_dictionary..." open_fasta = open('organism_dictionary.pkl', 'rb') fasta_dictionary = pickle.load(open_fasta) open_fasta.close() print "Length of fasta dictionary: ", len(fasta_dictionary) ## THIS WORKED, DON'T ERASE ## Build mapping dictionary and pickle dm_processing = dmDictionary() dm_processing.init_mapping_dictionary(fasta_dictionary) for path in dm_files: file_name = file_handlers.get_file_name(path) print "Opening %s..." % file_name dm_dictionary = dm_processing.individual_dm_dictionary(path, file_name) print "Length of dm_dictionary for %s is %d" % (file_name, len(dm_dictionary)) mapping_dictionary = dm_processing.build_mapping_dictionary(fasta_dictionary, dm_dictionary) open_mapping = open('mapping_dictionary.pkl', 'wb') pickle.dump(mapping_dictionary, open_mapping) open_mapping.close() print mapping_dictionary
def _get_pdb_file_path(self):
    """Point self.file_path at the Rosetta-minimized '<filename>_0001.pdb'."""
    fh = FileHandlers()
    target = self.filename + '_0001'
    for candidate in fh.find_files(fh.search_directory(), 'pdb'):
        if fh.get_file_name(candidate).split('.')[0] == target:
            self.file_path = candidate
def main():
    """Run get_dm_raxml (4 threads) on every .fasta file found."""
    handlers = FileHandlers()
    fasta_paths = handlers.find_files(handlers.search_directory(), 'fasta')
    for fasta_path in fasta_paths:
        get_dm_raxml(fasta_path, handlers.get_file_name(fasta_path), 4)
def build_SASA_dict(out_files):
    """Parse POPS residue-level .out files into a nested dictionary.

    Returns {file_name: {position: [aa, tot_SA, SASA, frac_SASA, phob, phil]}}
    built from the 9-field, tab-separated residue lines of each file.
    """
    SASA_dict = {}
    # One FileHandlers instance suffices; the original constructed a new
    # one for every file AND again for every line.
    file_handlers = FileHandlers()
    for path in out_files:
        file_name = file_handlers.get_file_name(path)
        SASA_dict[file_name] = {}
        # 'with' guarantees the handle is closed (the original leaked it).
        with open(path) as handle:
            for line in handle:
                cleaned = file_handlers.clean(line.split('\t'))
                if len(cleaned) == 9:  # residue data row
                    position = cleaned[2]
                    SASA_dict[file_name][position] = [
                        cleaned[0],  # aa
                        cleaned[8],  # tot_SA
                        cleaned[5],  # SASA
                        cleaned[6],  # frac_SASA
                        cleaned[3],  # phob
                        cleaned[4],  # phil
                    ]
    return SASA_dict
def main():
    """Align every .pep file in the working directory with muscle."""
    handlers = FileHandlers()
    for pep_path in handlers.find_files(handlers.search_directory(), "pep"):
        run_muscle(pep_path, handlers.get_file_name(pep_path))
def _get_downloaded_file_path(self, pdb_code):
    """Return the path of the downloaded 'pdbXXXX.ent' entry whose code
    matches pdb_code (uppercase comparison); None when nothing matches.
    """
    file_handlers = FileHandlers()
    file_paths = file_handlers.search_directory()
    ent_files = file_handlers.find_files(file_paths, 'ent')
    for ent_file in ent_files:
        stem = file_handlers.get_file_name(ent_file).split('.')[0]
        # BUG FIX: lstrip('pdb') strips any leading run of the characters
        # p/d/b, not the literal 'pdb' prefix; strip the prefix explicitly.
        if stem.startswith('pdb'):
            stem = stem[3:]
        if pdb_code == stem.upper():
            return ent_file
def _get_downloaded_file_path(self, pdb_code):
    """Find the downloaded PDB entry file ('pdbXXXX.ent') for pdb_code.

    Comparison is uppercase on the code portion; returns None on no match.
    """
    file_handlers = FileHandlers()
    file_paths = file_handlers.search_directory()
    for ent_file in file_handlers.find_files(file_paths, 'ent'):
        stem = file_handlers.get_file_name(ent_file).split('.')[0]
        # BUG FIX: the original's lstrip('pdb') removes a *character set*
        # (any of p, d, b), not the 'pdb' prefix; slice the prefix off.
        code = stem[3:] if stem.startswith('pdb') else stem
        if pdb_code == code.upper():
            return ent_file
def _open_file(self):
    """Read '<filename>.pdb' into self.data as a list of raw lines."""
    file_handlers = FileHandlers()
    file_paths = file_handlers.search_directory()
    pdb_files = file_handlers.find_files(file_paths, 'pdb')
    for pdb_file in pdb_files:
        if self.filename == file_handlers.get_file_name(pdb_file).split('.')[0]:
            # BUG FIX: the original wrote 'Data.close' without parentheses,
            # so the handle was never closed; 'with' closes it reliably.
            with open(pdb_file) as handle:
                self.data = handle.readlines()
def _get_pdb(self):
    """Load the lines of '<filename>.pdb' into self.pdb."""
    fh = FileHandlers()
    for candidate in fh.find_files(fh.search_directory(), 'pdb'):
        if fh.get_file_name(candidate).split('.')[0] != self.filename:
            continue
        source = open(candidate)
        self.pdb = source.readlines()
        source.close()
def _get_data(self, filename):
    """Return the lines of the .txt file whose full name equals
    `filename`; returns None when no such file exists."""
    fh = FileHandlers()
    for txt_path in fh.find_files(fh.search_directory(), 'txt'):
        if fh.get_file_name(txt_path) == filename:
            handle = open(txt_path)
            contents = handle.readlines()
            handle.close()
            return contents
def _get_gb_record(self): file_handlers = FileHandlers() file_paths = file_handlers.search_directory() gb_files = file_handlers.find_files(file_paths, 'gb') print gb_files print self.genbank_id for gb_file in gb_files: if self.genbank_id == file_handlers.get_file_name(gb_file).split( '.')[0]: self.gb_file_path = gb_file print self.gb_file_path
def _get_pdb(self):
    """Read '<filename>.pdb' into self.pdb (list of raw lines)."""
    handlers = FileHandlers()
    all_paths = handlers.search_directory()
    for pdb_path in handlers.find_files(all_paths, 'pdb'):
        stem = handlers.get_file_name(pdb_path).split('.')[0]
        if stem == self.filename:
            handle = open(pdb_path)
            self.pdb = handle.readlines()
            handle.close()
def _get_pdb(self, rosetta_min=False, refined_pocket=False): file_handlers = FileHandlers() file_paths = file_handlers.search_directory() pdb_files = file_handlers.find_files(file_paths, 'pdb') for pdb_file in pdb_files: if rosetta_min == True and refined_pocket == True: print "Invalid input" elif rosetta_min == True: if (self.filename + '_0001') == file_handlers.get_file_name(pdb_file).split('.')[0]: print "Found ", (self.filename + '_0001.pdb') filepath = pdb_file elif refined_pocket == True: if ('pocket0') == file_handlers.get_file_name(pdb_file).split('.')[0]: print "Found pocket0.pdb" filepath = pdb_file else: if self.filename == file_handlers.get_file_name(pdb_file).split('.')[0]: print "Found ", (self.filename + '.pdb') filepath = pdb_file return filepath
def find_indices(path):
    """Scan a FASTA-like file and return (file_name, lines, interval),
    where interval is the line count between the first two '>' headers.

    Raises IndexError when the file has fewer than two header lines
    (as the original effectively did).
    """
    file_handlers = FileHandlers()
    indices = []
    open_file = open(path, 'rU')
    file_list = open_file.readlines()
    open_file.close()  # the original never closed the handle
    # BUG FIX: enumerate() replaces file_list.index(line), which was O(n)
    # per header and returned the FIRST occurrence -- wrong whenever two
    # header lines are textually identical.
    for line_number, line in enumerate(file_list):
        if '>' in line:
            indices.append(line_number)
    interval = indices[1] - indices[0]
    file_name = file_handlers.get_file_name(path)
    return file_name, file_list, interval
def build_distance_dict(dm_files, inverse_mapping_dict, organism_pairs): file_handlers = FileHandlers() for dm_file in dm_files: file_name = file_handlers.get_file_name(dm_file) print "Opening %s...." % file_name data = open(dm_file) D = data.readlines() data.close() for d in D: data = d.strip().split(' ') enz1 = data[0] enz2 = data[1] distance = data[-1] for key in organism_pairs: if enz1 in inverse_mapping_dict and enz2 in inverse_mapping_dict: if inverse_mapping_dict[enz1] in key and inverse_mapping_dict[enz2] in key: #print inverse_mapping_dict[enz1], inverse_mapping_dict[enz2], key organism_pairs[key].append(distance) #print organism_pairs[key] #print "Length of distance list is %d" % len(organism_pairs[key]) else: print "Could not find %s and %s in mapping_dict" % (enz1, enz2) print "Finished parsing %s...." % file_name return organism_pairs
"""Driver script: align marker .pep files with muscle, one output per input."""
import subprocess
from util import FileHandlers
from Bio.Phylo.TreeConstruction import DistanceCalculator
from Bio import AlignIO

file_handlers = FileHandlers()

## Earlier step (kept for reference): extract bacterial markers via AMPHORA.
#file_paths = file_handlers.search_directory()
#fasta_files = file_handlers.find_files(file_paths, 'faa')
#for path in fasta_files:
#    cmd = ['perl ./Scripts/MarkerScanner.pl -Bacteria ' + path]
#    subprocess.call(cmd, shell=True)

# BUG FIX: the original had a duplicated 'file_paths = file_paths = ...'
file_paths = file_handlers.search_directory()
pep_files = file_handlers.find_files(file_paths, 'pep')
for path in pep_files:
    file_name = file_handlers.get_file_name(path)
    name_list = file_name.split('.')
    # ''.join([<single string>]) was a no-op wrapper; plain concatenation.
    out_file = name_list[0] + '_out.' + name_list[1]
    # Argument list with the default shell=False avoids shell
    # interpretation of file paths (the original built a shell string).
    subprocess.call(['muscle', '-in', path, '-out', out_file])

#aln = AlignIO.read('path/to/alignnment/file', 'format (i.e. phylip)')
#calculator = DistanceCalculator('identity')  # default model; works for DNA and protein
#dm = calculator.get_distance(aln)
# sequences = open_file.readlines() # i = 0 # while i < len(sequences): # if sequences[i] in temp_dict: # i += 14 # else: # temp_dict[sequences[i]] = sequences[i + 1 : i + 13] # new_file.write(str(i) + "\n") # for item in temp_dict[sequences[i]]: # new_file.write(item) # i += 14 fasta_files = file_handlers.find_files(file_paths, "fasta") for path in fasta_files: file_name = file_handlers.get_file_name(path) print file_name name_list = file_name.split(".") # derep_out_file = ''.join(name_list[0] + '_uniques.fasta') dm_out_file = "".join(name_list[0] + "_dm.txt") # cmd = ['usearch -derep_fulllength ' + path + ' -fastaout ' + derep_out_file] # subprocess.call(cmd, shell=True) new_file = open("/Users/andrea/repositories/AMPHORA2/muscle_alignments/" + dm_out_file, "w") aln = AlignIO.read(path, "fasta") calculator = DistanceCalculator( "identity" ) # identity is the name of the model(scoring matrix) to calculate the distance. The identity model is the default one and can be used both for DNA and protein sequence. dm = calculator.get_distance(aln) new_file.write(dm) new_file.close()