Пример #1
0
 if (dataset == "error"):
     print(
         "***WARNING: No file was found in the repository with the provided name: "
         + file_name + "\nNow aborting.")
     sys.exit()
 # If the dataset to be reconstructed is a genome dataset. If the line size is not specified, default it to 60
 if (dataset == "Genome"):
     # Create index variable
     index = 0
     # Inform the user
     print("\nNow reconstructing Genome dataset contained in file: " +
           file_name + ".fa")
     # Use reconstruct_dataset function to recreate the file
     reconstruct_dataset(size=int(line_size),
                         directory=directory,
                         output_file="../" + file_name,
                         mode="Genome",
                         seqID="0",
                         region="0")
     # Inform the user
     print("\nReconstruction completed.")
 # Otherwise the dataset to be reconstructed is a dependent file
 else:
     # Reconstruct the variants dataset
     if (dataset == "Variants"):
         # Inform the user
         print("\nNow reconstructing Variants dataset contained in file: " +
               file_name)
         reconstruct_dataset(size=1,
                             directory=directory,
                             output_file="../" + file_name + ".temporary",
                             mode="Variants",
Пример #2
0
 if (x == (len(commit_list) - 2)):
     break
 else:
     #print("X: ",x)
     #print(commit_list[x])
     print("\n\t***Finding Alignment Pickle {}***".format(
         str(datetime.datetime.now())))
     print(
         "\n\t Warning: If you have added any data into the repository but did not commit, these changes will be lost."
     )
     ShellCommand = Popen("git checkout " + commit_list[x] +
                          " 2> /dev/null",
                          shell=True).wait()
     reconstruct_dataset(
         size=60,
         directory="./Genome",
         output_file="./.git/info/temporary_directory/assembly_1.fa",
         mode="Genome")
     next_commit = x + 2
     for y in range(next_commit, len(commit_list)):
         #print("Y: ",y)
         #print(commit_list[y])
         # Reconstruct the second version of the assembly
         ShellCommand = Popen("git checkout " + commit_list[y] +
                              " 2> /dev/null",
                              shell=True).wait()
         reconstruct_dataset(
             size=60,
             directory="./Genome",
             output_file="./.git/info/temporary_directory/assembly_2.fa",
             mode="Genome")
Пример #3
0
# Pseudo-file suffixes that must be sorted and indexed for each dataset type.
# Annotation and alignment files are indexed as two pseudo files (A and B);
# variant files only as one (A).
_PSEUDO_FILE_SUFFIXES = {
    "Annotation": ("_A", "_B"),
    "Alignment": ("_A", "_B"),
    "Variants": ("_A",),
}


def _index_pseudo_file(pseudo_file):
    """
    Sort, compress and index one pseudo file so it can be queried with tabix.

    Header lines (starting with '#') are emitted first and left unsorted; the
    remaining lines are sorted by column 1 (version sort) and then column 2
    (numerically). The result is piped through bgzip into
    "<pseudo_file>.gz", which is then indexed with tabix using column 2 as
    both the begin and end coordinate.

    Requires: pseudo_file, the path of the pseudo file to process.
    Returns: the exit status reported by the shell for the pipeline.
    """
    command = (
        "(grep ^# {path}; grep -v ^# {path} | sort -V -k1,1 -k2,2n) | "
        "bgzip > {path}.gz; tabix -b 2 -e 2 {path}.gz;".format(
            path=pseudo_file))
    status = Popen(command, shell=True).wait()
    # The shell reports the status of the last command (tabix). Warn rather
    # than abort so that the remaining files are still processed.
    if status != 0:
        print("***WARNING: Could not create the tabix library for: {} "
              "(exit status {})".format(pseudo_file, status))
    return status


def reconstruct_annotation_variants(ToUpdate):
    """
    Reconstructs the variant and annotation dataset in the temporary directory.
    Requires: ToUpdate ({dataset:[[filename.extension,directory,size],[],...]}) which contains INFORMATION
    of what requires updating.

    Every file of every non-"Genome" dataset is reconstructed into
    ./temporary_directory/<filename.extension> and the matching pseudo files
    (<name>_A, plus <name>_B for annotation/alignment data) are sorted,
    bgzip-compressed and tabix-indexed. Datasets with no files are skipped.
    Returns nothing.
    """
    for dataset, subfiles in ToUpdate.items():
        # The genome is reconstructed elsewhere; empty datasets need no work
        if dataset == "Genome" or not subfiles:
            continue
        suffixes = _PSEUDO_FILE_SUFFIXES.get(dataset)
        # Reconstruct each file in the temporary directory under its
        # original name
        for subfile in subfiles:
            reconstruct_dataset(
                size=1,
                directory=subfile[1],
                output_file="./temporary_directory/{}".format(subfile[0]),
                mode=dataset,
                update=True,
                seqID="0",
                region="0")
            # Unknown dataset type: report it and skip the indexing step
            if suffixes is None:
                print("***INTERNAL ERROR*** DATASET NOT RECOGNIZED: {}".
                      format(dataset))
                continue
            # Create the tabix library for each pseudo file of this file
            # (presumably written by reconstruct_dataset in update mode)
            for suffix in suffixes:
                _index_pseudo_file("./temporary_directory/{}{}".format(
                    subfile[0], suffix))
Пример #4
0
    print(
        "\nNow producing a report of the data currently contained in the repository\n"
    )
    # Create an output file
    output_file = open("../GenomeGit_Report_{}.txt".format(message), "w")
    output_file.write(
        "\n\nNow producing a report of the data currently contained in the repository\n"
    )

# Run a BUSCO analysis only when the busco option was selected
# (the string "0" is used as the "not selected" value)
if (busco != "0"):
    print('\nNow running a BUSCO analysis')
    # Reconstruct the genome into ../<message>.fa so BUSCO has a FASTA file to read.
    # TODO(review): consider naming the output after the original FASTA file instead.
    reconstruct_dataset(size=60,
                        directory="./Genome",
                        output_file="../{}.fa".format(message),
                        mode="Genome",
                        seqID="0",
                        region="0")
    # Hand BUSCO the absolute path of the file that was just reconstructed
    input_file = os.path.abspath('../{}.fa'.format(message))
    # Pass the lineage only when one was provided ("0" means none was given);
    # otherwise buscoReport is called without it -- presumably it then falls
    # back to its own default, confirm against buscoReport's definition.
    if (lineage != "0"):
        buscoReport(input_file, lineage, message)
    else:
        buscoReport(input_file=input_file, output_folder=message)

# Create a dictionary describing the genomic sequences of the repository:
# {seqKey:[seqID,length,contig_count,N_count,[{file:vcf},{file:gff},{file:sam}]]}
genome_report = report_repo()

# Print the information of the dictionary in form of a table
# Loop through the sequences
for seqKey in genome_report.keys():
Пример #5
0
# Inform the user if there are no files to update (the check itself lives in
# update_inform_user; whether it also aborts the run is not visible here)
update_inform_user(ToUpdate)

################################################
# PART 1. RECONSTRUCTION OF REPOSITORY DATA    #
################################################

# Inform the user, including a timestamp of when this part started
print("\n\t*PART I. RECONSTRUCTION OF REPOSITORY DATA.*")
print("\t {}".format(str(datetime.datetime.now())))
# Create the temporary directory (os.mkdir raises an error if it already exists)
os.mkdir("./temporary_directory")
# Reconstruct the necessary datasets. First the old genome, written to
# ./temporary_directory/genome_old.fa (size=60 is presumably the FASTA line width).
reconstruct_dataset(size=60,
                    directory="./Genome",
                    output_file="./temporary_directory/genome_old.fa",
                    mode="Genome",
                    seqID="0",
                    region="0")
# Reconstruct all the files related to the variants and annotation datasets
reconstruct_annotation_variants(ToUpdate)

################################################
# PART 2. OBTAIN THE ALIGNMENT INFORMATION     #
################################################

# Obtain the information of the alignment between both assemblies
# (the reconstructed old genome and new_assembly).
# Determine the alignment pickle
alignment_pickle = obtain_alignment_pickle(
    "./temporary_directory/genome_old.fa", new_assembly)
# Derive the variables from the alignment pickle and store them together with it
variables = obtain_variables(alignment_pickle)
store_variables(variables=variables, alignment_pickle=alignment_pickle)