if (dataset == "error"): print( "***WARNING: No file was found in the repository with the provided name: " + file_name + "\nNow aborting.") sys.exit() # If the dataset to be reconstructed is a genome dataset. If the line size is not specified, default it to 60 if (dataset == "Genome"): # Create index variable index = 0 # Inform the user print("\nNow reconstructing Genome dataset contained in file: " + file_name + ".fa") # Use reconstruct_dataset function to recreate the file reconstruct_dataset(size=int(line_size), directory=directory, output_file="../" + file_name, mode="Genome", seqID="0", region="0") # Inform the user print("\nReconstruction completed.") # Otherwise the dataset to be reconstructed is a dependent file else: # Reconstruct the variants dataset if (dataset == "Variants"): # Inform the user print("\nNow reconstructing Variants dataset contained in file: " + file_name) reconstruct_dataset(size=1, directory=directory, output_file="../" + file_name + ".temporary", mode="Variants",
if (x == (len(commit_list) - 2)): break else: #print("X: ",x) #print(commit_list[x]) print("\n\t***Finding Alignment Pickle {}***".format( str(datetime.datetime.now()))) print( "\n\t Warning: If you have added any data into the repository but did not commit, these changes will be lost." ) ShellCommand = Popen("git checkout " + commit_list[x] + " 2> /dev/null", shell=True).wait() reconstruct_dataset( size=60, directory="./Genome", output_file="./.git/info/temporary_directory/assembly_1.fa", mode="Genome") next_commit = x + 2 for y in range(next_commit, len(commit_list)): #print("Y: ",y) #print(commit_list[y]) # Reconstruct the second version of the assembly ShellCommand = Popen("git checkout " + commit_list[y] + " 2> /dev/null", shell=True).wait() reconstruct_dataset( size=60, directory="./Genome", output_file="./.git/info/temporary_directory/assembly_2.fa", mode="Genome")
def _build_tabix_library(pseudo_file):
    """Sort, compress and tabix-index one reconstructed pseudo file.

    The shell pipeline emits the header lines (those starting with ``#``)
    first, followed by the remaining lines sorted by column 1 (version sort)
    and column 2 (numeric). The result is compressed with ``bgzip`` into
    ``<pseudo_file>.gz`` and indexed with ``tabix -b 2 -e 2``.

    Args:
        pseudo_file: Path of the plain-text pseudo file to index.

    Returns:
        The exit status of the shell pipeline, which is the status of the
        final ``tabix`` command.
    """
    # shlex.quote is Python 3 only; fall back to pipes.quote so the helper
    # keeps working if this module is still run under Python 2.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    # Quote every path: file names come from the repository contents and may
    # contain spaces or shell metacharacters. For plain names quote() returns
    # the string unchanged, so the command is the same as before.
    source = quote(pseudo_file)
    compressed = quote(pseudo_file + ".gz")
    command = (
        "(grep '^#' {src}; grep -v '^#' {src} | "
        "sort -V -k1,1 -k2,2n) | bgzip > {gz}; "
        "tabix -b 2 -e 2 {gz};".format(src=source, gz=compressed))

    exit_status = Popen(command, shell=True).wait()
    if exit_status != 0:
        # Best effort: report the failure instead of silently continuing,
        # but do not abort the whole update.
        print("***WARNING: Creating the tabix library for {} exited with "
              "status {}.".format(pseudo_file, exit_status))
    return exit_status


def reconstruct_annotation_variants(ToUpdate):
    """
    Reconstructs every non-genome dataset file in ./temporary_directory and
    builds a tabix library for each reconstructed pseudo file.

    Requires: ToUpdate ({dataset:[[filename.extension,directory,size],[],...]})
    which contains the information of what requires updating. Only the first
    two fields of each entry (file name and directory) are read here.

    For "Annotation" and "Alignment" datasets two pseudo files are indexed
    (<filename>_A and <filename>_B); for "Variants" only <filename>_A.
    These pseudo files are expected to be produced by reconstruct_dataset.
    Any other dataset name is reported as an internal error after its file
    has been reconstructed.
    """
    for dataset, subfiles in ToUpdate.items():
        # The genome is reconstructed elsewhere; empty datasets need no work.
        if dataset == "Genome" or not subfiles:
            continue

        # Reconstruct each file in the temporary directory under its
        # original name.
        for subfile in subfiles:
            output_file = "./temporary_directory/{}".format(subfile[0])
            reconstruct_dataset(
                size=1,
                directory=subfile[1],
                output_file=output_file,
                mode=dataset,
                update=True,
                seqID="0",
                region="0")

            # Create an appropriate tabix library for the file. Annotation
            # and alignment files have two pseudo files, variants only one.
            if dataset in ("Annotation", "Alignment"):
                _build_tabix_library(output_file + "_A")
                _build_tabix_library(output_file + "_B")
            elif dataset == "Variants":
                _build_tabix_library(output_file + "_A")
            # Otherwise there was an error
            else:
                print("***INTERNAL ERROR*** DATASET NOT RECOGNIZED: {}".
                      format(dataset))
print( "\nNow producing a report of the data currently contained in the repository\n" ) # Create an output file output_file = open("../GenomeGit_Report_{}.txt".format(message), "w") output_file.write( "\n\nNow producing a report of the data currently contained in the repository\n" ) # If busco option was selected if (busco != "0"): print('\nNow running a BUSCO analysis') # Reconstruct the genome. can change name of file to actual fasta file? reconstruct_dataset(size=60, directory="./Genome", output_file="../{}.fa".format(message), mode="Genome", seqID="0", region="0") input_file = os.path.abspath('../{}.fa'.format(message)) if (lineage != "0"): buscoReport(input_file, lineage, message) else: buscoReport(input_file=input_file, output_folder=message) # Create dictionary about the genomic sequences # {seqKey:[seqID,lenght,contig_count,N_count,[{file:vcf},{file:gff},{file:sam}]]} genome_report = report_repo() # Print the information of the dictionary in form of a table # Loop through the sequences for seqKey in genome_report.keys():
# If the there are no files to update, inform the user update_inform_user(ToUpdate) ############################################### # PART 1. RECONSTRUCTION OF REPOSITORY DATA # ############################################### # Inform the user print("\n\t*PART I. RECONSTRUCTION OF REPOSITORY DATA.*") print("\t {}".format(str(datetime.datetime.now()))) # Create temporary directory os.mkdir("./temporary_directory") # Reconstruct the necessary datasets. First the old genome. reconstruct_dataset(size=60, directory="./Genome", output_file="./temporary_directory/genome_old.fa", mode="Genome", seqID="0", region="0") # Reconstruct all the files related to the variants and annotation datasets reconstruct_annotation_variants(ToUpdate) ############################################## # PART 2. OBTAIN THE ALIGNMENT INFORMATION # ############################################## # Obtain the information of the alignment between both assemblies. # Determine the alignment pickle alignment_pickle = obtain_alignment_pickle( "./temporary_directory/genome_old.fa", new_assembly) variables = obtain_variables(alignment_pickle) store_variables(variables=variables, alignment_pickle=alignment_pickle)