import random
import time
from itertools import compress
from multiprocessing import Lock, Manager, Pool, Value

import numpy as np

# IO (sequence/pickle I/O) and FX (descriptor extraction) are project-local
# modules; reduce_by_similarity, sc_1, sc_2, minlength, posfile, negfile and
# numcores are configuration globals assumed to be defined elsewhere in the
# project.


# --- Serial implementation ---
def extract_descriptors_from_file_to_pickle(inputfile, outputfile, num_pos_sample=0):
    print("Working on: " + str(inputfile))
    print(" ")
    s_read_seq = time.time()
    # Optionally reduce the input so that no two sequences are more than
    # 90 percent identical before descriptors are extracted.
    if reduce_by_similarity == 1:
        if "_reduced" in inputfile:
            print("File already reduced to be at most 90 percent identical! Clear reduce_by_similarity!")
            input()
        elif ".txt" in inputfile:
            name = inputfile.replace('.txt', '')
            with open(inputfile) as file_to_reduce:
                lines = file_to_reduce.readlines()
            if num_pos_sample != 0:
                lines = lines[:round(sc_1 * num_pos_sample)]
            line_number = len(lines)
        elif ".fasta" in inputfile:
            name = inputfile.replace('.fasta', '')
            lines = [str(line) for line in IO.read_fasta_file(inputfile)]
            if num_pos_sample != 0:
                lines = lines[:round(2 * num_pos_sample)]
            line_number = len(lines)
        else:
            print("Unknown file format! Use .fasta or .txt! Press CTRL-C to exit")
            input()
        out = name + "_reduced.txt"
        deleted = []
        # Pairwise similarity matrix: entry (i, j) is 1 when sequences i and j
        # are at least 90 percent similar.
        sim_array = np.zeros((line_number, line_number))
        for i in range(line_number):
            print("Doing line %d out of %d" % (i, line_number))
            string1 = lines[i].strip()
            for j in range(i + 1, line_number):
                string2 = lines[j].strip()
                if similar(string1, string2) >= 0.9:
                    sim_array[i, j] = 1
                    sim_array[j, i] = 1
        # Greedily delete the sequence with the most similar partners until no
        # similar pair remains.
        while np.sum(sim_array) != 0:
            sum_arr = np.sum(sim_array, 0)
            idx_to_be_deleted = np.argmax(sum_arr)
            sim_array = np.delete(sim_array, idx_to_be_deleted, 0)
            sim_array = np.delete(sim_array, idx_to_be_deleted, 1)
            deleted.append(lines[idx_to_be_deleted])
            del lines[idx_to_be_deleted]
        print("Deleted items:")
        for item in deleted:
            print(item)
        with open(out, "w+") as f:
            for line in lines:
                f.write(line)
                f.write("\n")
        inputfile = out
    # Read the (possibly reduced) sequences.
    if ".txt" in inputfile:
        seqs = []
        with open(inputfile) as f:
            for line in f:
                seqs.append(line.strip())  # strip is important, otherwise the trailing "\n" causes issues
        inputfile = inputfile.replace("_reduced.txt", "")
    elif ".fasta" in inputfile:
        seqs = IO.read_fasta_file(inputfile)
        inputfile = inputfile.replace("_reduced.fasta", "")
    else:
        print("Unknown file format! Use .fasta or .txt! Press CTRL-C to exit")
        input()
    e_read_seq = time.time()
    print("Total time to read sequences: " + str(e_read_seq - s_read_seq))
    print(str(len(seqs)))
    chars = set('ARNDCQEGHILKMFPSTWYV')  # the 20 standard amino-acid letters
    # For the negative set, randomly downsample to match the positive count.
    if inputfile in negfile:
        if num_pos_sample == 0:
            print("Error, use CTRL-C to quit")
            input()
        print(num_pos_sample)
        if num_pos_sample > len(seqs):
            print("Warning: Class balance may not be achieved! Press any key to accept or CTRL-C to exit")
            input()
        # If total_samples is big, you may want to divide total_samples (by 18) and round it.
        a = random.sample(range(len(seqs)), round(sc_2 * num_pos_sample))
        newseqs = []
        i = 1
        for number in a:
            print(i)
            if len(seqs[number]) > minlength and all((c in chars) for c in seqs[number].upper()):
                newseqs.append(seqs[number])
                print(seqs[number])
                i = i + 1
            if i > num_pos_sample:
                break
        if i <= num_pos_sample:  # fewer than num_pos_sample valid sequences were accepted
            print("The negative set does not contain enough valid inputs to make the classifier balanced. Reduce downsampling! Use CTRL-C to quit!")
            input()
        seqs = newseqs
    total_samples = len(seqs)
    s_x_desc = time.time()
    dvecs = []
    current_seq = 1
    dropped = 0
    for s in seqs:
        s = s.upper()
        # The negative set was already validated during downsampling, so skip
        # the re-check for it (purely for efficiency).
        if inputfile not in negfile:
            if not all((c in chars) for c in s) or len(s) < (minlength + 1):
                dropped = dropped + 1
                continue
        print("Extracting descriptors for sequence: " + str(current_seq) + "/" + str(total_samples))
        s_x_seq = time.time()
        dvec = FX.extract_named_descriptors_of_seq(s)
        dvecs.append(dvec)
        if inputfile in posfile:
            num_pos_sample = len(dvecs)
            print("Number of positive samples: %d" % (num_pos_sample))
        print("Number of samples dropped due to meaningless characters: %d" % (dropped))
        e_x_seq = time.time()
        print("Took: " + str(e_x_seq - s_x_seq))
        print(" ")
        current_seq += 1
    e_x_desc = time.time()
    print("Total time to extract descriptors: " + str(e_x_desc - s_x_desc))
    IO.serialize_descriptor_vector(dvecs, o_file=outputfile)
    return num_pos_sample
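
# The similarity reduction above relies on a helper named similar() that is
# not defined in this file. A minimal sketch of what it likely looks like,
# assuming the common difflib recipe (so the 0.9 threshold reads as
# "at least 90 percent identical"):
from difflib import SequenceMatcher


def similar(a, b):
    # Ratio of matching characters between the two strings, in [0.0, 1.0].
    return SequenceMatcher(None, a, b).ratio()
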
# --- Parallelised implementation ---
# Identical to the serial version above up to the descriptor-extraction step,
# which is distributed over a multiprocessing Pool. It reuses the same name,
# so this definition shadows the serial one when both live in one module.
def extract_descriptors_from_file_to_pickle(inputfile, outputfile, num_pos_sample=0):
    print("Working on: " + str(inputfile))
    print(" ")
    s_read_seq = time.time()
    # Optionally reduce the input so that no two sequences are more than
    # 90 percent identical before descriptors are extracted.
    if reduce_by_similarity == 1:
        if "_reduced" in inputfile:
            print("File already reduced to be at most 90 percent identical! Clear reduce_by_similarity!")
            input()
        elif ".txt" in inputfile:
            name = inputfile.replace('.txt', '')
            with open(inputfile) as file_to_reduce:
                lines = file_to_reduce.readlines()
            if num_pos_sample != 0:
                lines = lines[:round(sc_1 * num_pos_sample)]
            line_number = len(lines)
        elif ".fasta" in inputfile:
            name = inputfile.replace('.fasta', '')
            lines = [str(line) for line in IO.read_fasta_file(inputfile)]
            if num_pos_sample != 0:
                lines = lines[:round(sc_1 * num_pos_sample)]
            line_number = len(lines)
        else:
            print("Unknown file format! Use .fasta or .txt! Press CTRL-C to exit")
            input()
        out = name + "_reduced.txt"
        deleted = []
        # Pairwise similarity matrix: entry (i, j) is 1 when sequences i and j
        # are at least 90 percent similar.
        sim_array = np.zeros((line_number, line_number))
        for i in range(line_number):
            print("Doing line %d out of %d" % (i, line_number))
            string1 = lines[i].strip()
            for j in range(i + 1, line_number):
                string2 = lines[j].strip()
                if similar(string1, string2) >= 0.9:
                    sim_array[i, j] = 1
                    sim_array[j, i] = 1
        # Greedily delete the sequence with the most similar partners until no
        # similar pair remains.
        while np.sum(sim_array) != 0:
            sum_arr = np.sum(sim_array, 0)
            idx_to_be_deleted = np.argmax(sum_arr)
            sim_array = np.delete(sim_array, idx_to_be_deleted, 0)
            sim_array = np.delete(sim_array, idx_to_be_deleted, 1)
            deleted.append(lines[idx_to_be_deleted])
            del lines[idx_to_be_deleted]
        print("Deleted items:")
        for item in deleted:
            print(item)
        with open(out, "w+") as f:
            for line in lines:
                f.write(line)
                f.write("\n")
        inputfile = out
    # Read the (possibly reduced) sequences.
    if ".txt" in inputfile:
        seqs = []
        with open(inputfile) as f:
            for line in f:
                seqs.append(line.strip())  # strip is important, otherwise the trailing "\n" causes issues
        inputfile = inputfile.replace("_reduced.txt", "")
    elif ".fasta" in inputfile:
        seqs = IO.read_fasta_file(inputfile)
        inputfile = inputfile.replace("_reduced.fasta", "")
    else:
        print("Unknown file format! Use .fasta or .txt! Press CTRL-C to exit")
        input()
    e_read_seq = time.time()
    print("Total time to read sequences: " + str(e_read_seq - s_read_seq))
    print(str(len(seqs)))
    chars = set('ARNDCQEGHILKMFPSTWYV')  # the 20 standard amino-acid letters
    # For the negative set, randomly downsample to match the positive count.
    if inputfile in negfile:
        if num_pos_sample == 0:
            print("Error, use CTRL-C to quit")
            input()
        print(num_pos_sample)
        if num_pos_sample > len(seqs):
            print("Warning: Class balance may not be achieved! Press any key to accept or CTRL-C to exit")
            input()
        # If total_samples is big, you may want to divide total_samples (by 18) and round it.
        a = random.sample(range(len(seqs)), round(sc_2 * num_pos_sample))
        newseqs = []
        i = 1
        for number in a:
            print(i)
            if len(seqs[number]) > minlength and all((c in chars) for c in seqs[number].upper()):
                newseqs.append(seqs[number])
                print(seqs[number])
                i = i + 1
            if i > num_pos_sample:
                break
        if i <= num_pos_sample:  # fewer than num_pos_sample valid sequences were accepted
            print("The negative set does not contain enough valid inputs to make the classifier balanced. Reduce downsampling! Use CTRL-C to quit!")
            input()
        seqs = newseqs
    # Shared state for the worker processes: a managed list for the results,
    # a shared integer counter for progress reporting and a lock to guard it.
    dvecs = Manager().list()
    current_seq = Value('i', 1)
    lock = Lock()
    # Validate and filter the sequences up front so the workers only ever see
    # clean input; count the invalid ones as dropped.
    seqs = [s.upper() for s in seqs]
    mask = [all((c in chars) for c in s) and len(s) > minlength for s in seqs]
    dropped = mask.count(False)
    seqs = list(compress(seqs, mask))
    total_samples = len(seqs)
    pool = Pool(numcores, initializer, (current_seq, dvecs, total_samples, lock))
    s_parallel = time.time()
    pool.map(thefunction, seqs)
    e_parallel = time.time()
    pool.close()
    pool.join()
    print("Total time to extract descriptors: " + str(e_parallel - s_parallel))
    if inputfile in posfile:
        num_pos_sample = len(dvecs)
        print("Number of positive samples: %d" % (num_pos_sample))
    print("Number of samples dropped due to meaningless characters: %d" % (dropped))
    # Materialise the managed ListProxy as a plain list; this is what makes
    # the result picklable for serialization.
    y = dvecs._callmethod('__getitem__', (slice(0, total_samples),))
    IO.serialize_descriptor_vector(y, o_file=outputfile)
    return num_pos_sample
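
# The Pool above is constructed with an initializer and maps a worker named
# thefunction, neither of which appears in this file. A minimal sketch of what
# they might look like, assuming the initializer stashes the shared objects in
# worker-process globals and the worker appends one descriptor vector per
# sequence; the argument names and the progress printout are assumptions.
def initializer(counter, shared_dvecs, n_total, shared_lock):
    # Runs once in every worker process: expose the shared objects as globals
    # so thefunction can reach them.
    global current_seq, dvecs, total_samples, lock
    current_seq = counter
    dvecs = shared_dvecs
    total_samples = n_total
    lock = shared_lock


def thefunction(s):
    # Extract the descriptors of a single sequence and append them to the
    # shared result list; the lock serialises the progress-counter update.
    with lock:
        print("Extracting descriptors for sequence: %d/%d" % (current_seq.value, total_samples))
        current_seq.value += 1
    dvecs.append(FX.extract_named_descriptors_of_seq(s))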