def build_serialize_library(FASTQFILE_PATH):
    """Build the read library from *FASTQFILE_PATH* and serialize it to disk.

    The library (a dict with 'N' and 'C' read collections) is msgpack-packed,
    with objects serialized via their ``__dict__``, and written to a
    timestamped file under ``data/seq/``.

    :param FASTQFILE_PATH: path of the FASTQ file handed to build_read_library
    """
    logger.info("will rebuild library")
    read_library = build_read_library(FASTQFILE_PATH)
    logger.info("Will save %d items", len(read_library['N']) + len(read_library['C']))
    packed_docs = msgpack.packb(read_library, default=lambda x: x.__dict__)
    logger.info("Packed to %d chars", len(packed_docs))
    get_or_create_dir("data")
    get_or_create_dir("data/seq")
    # NOTE(review): len(read_library) counts the top-level keys ('N'/'C'), not
    # the item total logged above -- confirm this is the intended file suffix.
    tgt_file = "data/seq/" + experiment_name + "_%s_%d.packb" % (int(time.time()), len(read_library))
    # BUGFIX: msgpack output is a binary payload; open the target in "wb" so it
    # is not corrupted by newline translation or text encoding.
    with open(tgt_file, "wb") as f:
        f.write(packed_docs)
    logger.info("Serialized to file %s" % tgt_file)
def build_serialize_library():
    """Build the default read library and serialize it to disk.

    The library (a dict with 'N' and 'C' read collections) is msgpack-packed,
    with objects serialized via their ``__dict__``, and written to a
    timestamped ``all_pool_trimmed0.1_*.packb`` file under ``data/seq/``.
    """
    logger.info("will rebuild library")
    read_library = build_read_library()
    logger.info("Will save %d items", len(read_library['N']) + len(read_library['C']))
    packed_docs = msgpack.packb(read_library, default=lambda x: x.__dict__)
    logger.info("Packed to %d chars", len(packed_docs))
    get_or_create_dir("data")
    get_or_create_dir("data/seq")
    # NOTE(review): len(read_library) counts the top-level keys ('N'/'C'), not
    # the item total logged above -- confirm this is the intended file suffix.
    tgt_file = "data/seq/all_pool_trimmed0.1_%s_%d.packb" % (int(time.time()), len(read_library))
    # BUGFIX: msgpack output is a binary payload; open the target in "wb" so it
    # is not corrupted by newline translation or text encoding.
    with open(tgt_file, "wb") as f:
        f.write(packed_docs)
    logger.info("Serialized to file %s" % tgt_file)
def build_serialize_library(FASTQFILE_PATH):
    """Build the read library from *FASTQFILE_PATH* and serialize it to disk.

    The library is msgpack-packed, with objects serialized via their
    ``__dict__``, and written to a timestamped file under ``data/seq/``.

    :param FASTQFILE_PATH: path of the FASTQ file handed to build_read_library
    """
    logger.info("will rebuild library")
    read_library = build_read_library(FASTQFILE_PATH)
    logger.info("Will save %d items", len(read_library))
    packed_docs = msgpack.packb(read_library, default=lambda x: x.__dict__)
    logger.info("Packed to %d chars", len(packed_docs))
    get_or_create_dir("data")
    get_or_create_dir("data/seq")
    tgt_file = "data/seq/" + experiment_name + "_%s_%d.packb" % (int(time.time()), len(read_library))
    # BUGFIX: msgpack output is a binary payload; open the target in "wb" so it
    # is not corrupted by newline translation or text encoding.
    with open(tgt_file, "wb") as f:
        f.write(packed_docs)
    logger.info("Serialized to file %s" % tgt_file)
def process_sample(kmer_length, min_support_percentage, n_permutations, sample_key=None, c_fastq_file=None, n_fastq_file=None, destination_directory=".", export_gml=False):
    """Run the alteration-detection pipeline for one C/N sample pair.

    Builds the reference graph and the sample de-Bruijn graph, retries with
    k+1 while the cleaned sample graph contains cycles (hard limit k==50),
    dumps graph/k-mer statistics, optionally exports GML visualisations, runs
    the permutation test to attach p-values to candidate alterations, writes
    per-alteration statistics and finally annotates the results.

    :param kmer_length: k-mer size for both graphs (increased on cycles)
    :param min_support_percentage: minimum read support (in %) kept by cleaning
    :param n_permutations: number of random graphs for the permutation test
    :param sample_key: sample identifier embedded in output file names
    :param c_fastq_file: control FASTQ path (optional)
    :param n_fastq_file: other-condition FASTQ path (optional)
    :param destination_directory: target directory for GML exports
    :param export_gml: when True, also export visualisation graphs as GML
    """
    # g_ref construction
    logger.info("Will build reference graph with k==%d", kmer_length)
    g_ref = RG.ref_constructor(kmer_length)

    # g_ind construction: keep only the FASTQ files actually provided
    fastq = [c_fastq_file, n_fastq_file]
    fastq = [f for f in fastq if f]
    logger.info("Will build sample graph for %s with k==%d and minimum support (percentage) = %d", fastq, kmer_length, min_support_percentage)
    g_test = IG(fastq, kmer_length)
    g_test.graph_cleaned_init(min_support_percentage)  # .dbgclean creation

    # Is there cycles ?  A cyclic cleaned graph cannot be searched for
    # alternative paths, so retry with a larger k.
    if list(nx.simple_cycles(g_test.dbgclean)):
        if kmer_length > 50:
            logger.info("There are always cycle(s) with k==50...exiting")
            sys.exit(0)
        # Check non depassement valeur limite de k
        return process_sample(kmer_length=kmer_length + 1, sample_key=sample_key, c_fastq_file=c_fastq_file, n_fastq_file=n_fastq_file, min_support_percentage=min_support_percentage, n_permutations=n_permutations, destination_directory=destination_directory, export_gml=export_gml)

    # Some prints for stats
    dir_stat = get_or_create_dir("output/statistics")
    # graph stat.  BUGFIX: handle is now closed via a context manager (it was
    # leaked) and the row gets a trailing newline (it had none).
    with open(dir_stat + "/graph_stat_file" + sample_key + ".tsv", 'w') as graph_stat_file:
        graph_stat_file.write(
            "%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n" % (
                kmer_length,
                g_ref.size(),
                sample_key,
                g_test.coverage['C'],
                g_test.coverage['N'],
                g_test.dbg.size(),
                g_test.dbgclean.size(),
                g_test.dbg.in_degree().values().count(0),
                g_test.dbg.out_degree().values().count(0),
                g_test.dbgclean.in_degree().values().count(0),
                g_test.dbgclean.out_degree().values().count(0)))
    # kmer stat.  BUGFIX: handle closed via context manager.
    with open(dir_stat + "/kmer_stat_file" + sample_key + ".tsv", 'w') as kmer_stat_file:
        for node_print in g_test.dbg.nodes():
            fragment_print = "".join(g_test.dbg.node[node_print]['fragment'])
            reads_print = len(g_test.dbg.node[node_print]['read_list_n'])
            kmer_stat_file.write("%s\t%s\t%s\t%d\n" % (sample_key, node_print, fragment_print, reads_print))

    g_test.graph_rmRefEdges_init(g_test.dbgclean, g_ref)  # .dbg_refrm creation

    # For visualisation
    graph_name = "G_%s_" % sample_key
    if export_gml:
        logger.info("Will save viz graph for %s with k==%d", fastq, kmer_length)
        get_or_create_dir(destination_directory)
        G_ref_merge = VISU.merge_reference_graph(g_ref.copy())
        G_ref_visu = VISU.reference_graph_visualization_formatting(g_ref.copy())
        G_ref_merge_visu = VISU.reference_graph_merged_visualization_formatting(G_ref_merge.copy())
        nx.write_gml(G_ref_visu, destination_directory + "/G_ref_visu" + str(kmer_length) + ".gml")
        nx.write_gml(G_ref_merge_visu, destination_directory + "/G_ref_merge_visu" + str(kmer_length) + ".gml")
        g_test_visu = VISU.individu_graph_visualization_formating(g_test.dbg.copy(), g_ref.copy())
        g_test_clean_visu = VISU.individu_graph_visualization_formating(g_test.dbgclean.copy(), g_ref.copy())
        cleaned_graph_name = graph_name + "clean%d_" % min_support_percentage
        nx.write_gml(g_test_visu, destination_directory + "/" + graph_name + str(kmer_length) + ".gml")
        nx.write_gml(g_test_clean_visu, destination_directory + "/" + cleaned_graph_name + str(kmer_length) + ".gml")
        # Graph merged
        logger.info("Will merge graph for %s with k==%d", fastq, kmer_length)
        g_test_merged = VISU.merge_individu_graph(g_test.dbg.copy(), g_ref.copy())
        g_test_merged_visu = VISU.individu_graph_merged_visualization_formating(g_test_merged.copy(), g_ref.copy())
        merged_graph_name = "G_%s_merged_" % sample_key
        nx.write_gml(g_test_merged_visu, destination_directory + "/" + merged_graph_name + str(kmer_length) + ".gml")
        g_test_clean_merged = VISU.merge_individu_graph(g_test.dbgclean.copy(), g_ref.copy())
        g_test_clean_merged_visu = VISU.individu_graph_merged_visualization_formating(g_test_clean_merged.copy(), g_ref.copy())
        merged_cleaned_graph_name = graph_name + "clean%d_merged_" % min_support_percentage
        nx.write_gml(g_test_clean_merged_visu, destination_directory + "/" + merged_cleaned_graph_name + str(kmer_length) + ".gml")

    # .alteration_list creation
    g_test.alteration_list_init(g_ref, kmer_length, min_support_percentage)

    ### Permutation test ###
    logger.info("Will create random graphs")
    all_possible_kmers = set()
    for an_alt in g_test.alteration_list:
        all_possible_kmers.update(an_alt.reference_path)
        all_possible_kmers.update(an_alt.alternative_path)
    for i, j in time_iterator(range(0, n_permutations), logger, msg_prefix="permuting"):
        g_random = RRG(g_test.coverage, kmer_length, restrict_to=all_possible_kmers)
        for alteration in g_test.alteration_list:
            g_random_data = g_random.check_path(alteration.reference_path, alteration.alternative_path, alteration.min_coverage)
            alteration.random_ratio_list.append(g_random_data[0])
            alteration.random_reference_count_list.append(g_random_data[1])
            alteration.random_alternative_count_list.append(g_random_data[2])
    logger.info("Will generate p-values")
    for alteration in g_test.alteration_list:
        alteration.pvalue_init()
    g_test.significant_alteration_list_init()
    # If more than one significant alteration, check if they are not in "spike" (en épis)
    if len(g_test.significant_alteration_list) > 1:
        g_test.multiple_alternative_path_filter()

    ## Stat
    # alteration stat.  BUGFIX: handle closed via context manager, and each
    # row now ends with "\n" (rows previously ran into each other).
    with open(dir_stat + "/alt_stat_file" + sample_key + ".tsv", 'w') as alt_stat_file:
        for i_alteration, alteration in enumerate(g_test.significant_alteration_list):
            if alteration.pvalue_ratio <= 1:
                alt_stat_file.write("%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%f\t%f\t%s\n" % (
                    i_alteration + 1,
                    sample_key,
                    g_test.coverage['C'],
                    g_test.coverage['N'],
                    alteration.reference_sequence,
                    alteration.alternative_sequence,
                    alteration.reference_read_count,
                    alteration.alternative_read_count,
                    alteration.ratio_read_count,
                    alteration.pvalue_ratio,
                    "\t".join(map(str, alteration.random_ratio_list))))

    ### MICADo + ###
    ANNO.alteration_list_to_transcrit_mutation(g_test, g_ref)
def process_sample(kmer_length, min_support_percentage, n_permutations, p_value_threshold, sample_key=None, fastq_files=None, fasta_file=None, snp_file=None, experiment_name=None, destination_directory=".", export_gml=False, output_results=None):
    """Run the alteration-detection pipeline for one patient sample.

    Builds the reference graph (from FASTA + SNP files) and the patient
    de-Bruijn graph, retries with k+1 while either graph contains cycles
    (hard limit k==70), dumps graph/k-mer statistics, runs the permutation
    test to attach p-values to candidate alterations, writes per-alteration
    statistics, optionally exports GML visualisations, annotates TP53
    results, and reports read support for known SNPs.

    :param kmer_length: k-mer size for both graphs (increased on cycles)
    :param min_support_percentage: minimum read support (in %) kept by cleaning
    :param n_permutations: number of random graphs for the permutation test
    :param p_value_threshold: significance threshold for alterations
    :param sample_key: sample identifier embedded in output file names
    :param fastq_files: comma-separated FASTQ paths
    :param fasta_file: reference FASTA path
    :param snp_file: known-SNP description path
    :param experiment_name: "TP53" selects the specialised random-graph module
    :param destination_directory: target directory for GML exports
    :param export_gml: when True, also export visualisation graphs as GML
    :param output_results: destination handed to annotate_and_output_results
    """
    # TP53 uses a dedicated random-read-graph implementation
    if experiment_name == "TP53":
        from randomreadsgraph_TP53 import RandomReadsGraph as RRG
    else:
        from randomreadsgraph import RandomReadsGraph as RRG

    # g_reference construction
    logger.info("Will build reference graph with k==%d and fasta=%s & snp=%s", kmer_length, fasta_file, snp_file)
    g_reference = RG(kmer_length, fasta_file, snp_file)

    # Is there cycles in reference graph?
    if list(nx.simple_cycles(g_reference.dbg)):
        if kmer_length > 70:
            logger.info("There are always cycle(s) with k==70...exiting")
            sys.exit(0)  # Check non depassement valeur limite de k
        # BUGFIX: log the k that will actually be used (k+1), not the current one
        logger.info("[Reference graph] Increasing k to %d to remove cycles", kmer_length + 1)
        return process_sample(kmer_length=kmer_length + 1, sample_key=sample_key, fastq_files=fastq_files, fasta_file=fasta_file, snp_file=snp_file, experiment_name=experiment_name, min_support_percentage=min_support_percentage, n_permutations=n_permutations, destination_directory=destination_directory, export_gml=export_gml, p_value_threshold=p_value_threshold, output_results=output_results)

    # g_patient construction
    logger.info("Will build patient graph for %s with k==%d and minimum support = %dpct", fastq_files, kmer_length, min_support_percentage)
    fastq_files = fastq_files.split(",")
    g_patient = PG(fastq_files, kmer_length)
    logger.info("Before cleaning: %d nodes", len(g_patient.dbg))
    g_patient.graph_cleaned_init(min_support_percentage)
    logger.info("After cleaning: %d nodes", len(g_patient.dbgclean))

    # Is there cycles in patient graph?
    if list(nx.simple_cycles(g_patient.dbgclean)):
        if kmer_length > 70:
            logger.info("There are still cycle(s) with k==70...exiting")
            sys.exit(0)  # Check non depassement valeur limite de k
        # BUGFIX: log the k that will actually be used (k+1), not the current one
        logger.info("[Sample graph] Increasing k to %d to remove cycles", kmer_length + 1)
        return process_sample(kmer_length=kmer_length + 1, sample_key=sample_key, fastq_files=",".join(fastq_files), fasta_file=fasta_file, snp_file=snp_file, experiment_name=experiment_name, min_support_percentage=min_support_percentage, n_permutations=n_permutations, destination_directory=destination_directory, export_gml=export_gml, p_value_threshold=p_value_threshold, output_results=output_results)

    # Some prints for stats
    dir_stat = get_or_create_dir("output/statistics")
    # graph stat.  BUGFIX: handle closed via context manager (it was leaked).
    with open(dir_stat + "/graph_stat_file" + sample_key + ".tsv", 'w') as graph_stat_file:
        graph_stat_file.write(
            "%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n" % (
                kmer_length,
                g_reference.dbg.size(),
                sample_key,
                g_patient.coverage['total'],
                g_patient.dbg.size(),
                g_patient.dbgclean.size(),
                g_patient.dbg.in_degree().values().count(0),
                g_patient.dbg.out_degree().values().count(0),
                g_patient.dbgclean.in_degree().values().count(0),
                g_patient.dbgclean.out_degree().values().count(0)))
    # kmer stat.  BUGFIX: handle closed via context manager.
    with open(dir_stat + "/kmer_stat_file" + sample_key + ".tsv", 'w') as kmer_stat_file:
        for node_print in g_patient.dbg.nodes():
            fragment_print = ",".join(g_patient.dbg.node[node_print]['fastq_id'])
            reads_print = len(g_patient.dbg.node[node_print]['read_list_n'])
            kmer_stat_file.write("%s\t%s\t%s\t%d\n" % (sample_key, node_print, fragment_print, reads_print))

    # copy g_patient cleaned and remove reference edges on it (.dbg_refrm creation)
    g_patient.graph_rmRefEdges_init(g_patient.dbgclean, g_reference.dbg)
    # search for alternative paths in dbg_refrm (.alteration_list creation)
    g_patient.alteration_list_init(g_reference.dbg, kmer_length, min_support_percentage)

    ### Permutation test ###
    logger.info("Will create random graphs")
    all_possible_kmers = set()
    for an_alt in g_patient.alteration_list:
        all_possible_kmers.update(an_alt.reference_path)
        all_possible_kmers.update(an_alt.alternative_path)
    for i, j in time_iterator(range(0, n_permutations), logger, msg_prefix="permuting"):
        g_random = RRG(g_patient.coverage, kmer_length, restrict_to=all_possible_kmers)
        for alteration in g_patient.alteration_list:
            g_random_data = g_random.check_path(alteration.reference_path, alteration.alternative_path, alteration.min_coverage)
            alteration.random_ratio_list.append(g_random_data[0])
            alteration.random_reference_count_list.append(g_random_data[1])
            alteration.random_alternative_count_list.append(g_random_data[2])
    logger.info("Will generate p-values for %d possible alterations", len(g_patient.alteration_list))
    for alteration in g_patient.alteration_list:
        alteration.pvalue_init()
    g_patient.significant_alteration_list_init(p_value_threshold=p_value_threshold)
    # If more than one significant alteration, check if they are not in "spike" (en épis)
    if len(g_patient.significant_alteration_list) > 1:
        g_patient.multiple_alternative_path_filter()

    ## Stat
    # alteration stat.  BUGFIX: handle closed via context manager.
    with open(dir_stat + "/alt_stat_file" + sample_key + ".tsv", 'w') as alt_stat_file:
        for i_alteration, alteration in enumerate(g_patient.alteration_list):
            if alteration.pvalue_ratio <= 1:
                alt_stat_file.write("%d\t%s\t%d\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\n" % (
                    i_alteration + 1,
                    sample_key,
                    g_patient.coverage['total'],
                    alteration.reference_sequence,
                    alteration.alternative_sequence,
                    alteration.reference_read_count,
                    alteration.alternative_read_count,
                    alteration.ratio_read_count,
                    alteration.pvalue_ratio,
                    str(alteration.zscore),
                    "\t".join(map(str, alteration.random_ratio_list))))

    # For visualisation
    graph_name = "G_%s_" % sample_key
    merged_graph_name = "G_%s_merged_" % sample_key
    cleaned_graph_name = graph_name + "clean%d_" % min_support_percentage
    merged_cleaned_graph_name = graph_name + "clean%d_merged_" % min_support_percentage
    if export_gml:
        logger.info("Will save viz graph for %s with k==%d", sample_key, kmer_length)
        get_or_create_dir(destination_directory)
        # for the refrence graph
        g_reference_merge = VISU.merge_reference_graph(g_reference.dbg.copy())
        g_reference_visu = VISU.reference_graph_visualization_formatting(g_reference.dbg.copy())
        g_reference_merge_visu = VISU.reference_graph_merged_visualization_formatting(g_reference_merge.copy())
        nx.write_gml(g_reference_visu, destination_directory + "/g_reference_visu" + str(kmer_length) + ".gml")
        nx.write_gml(g_reference_merge_visu, destination_directory + "/g_reference_merge_visu" + str(kmer_length) + ".gml")
        # for the patient graph
        g_patient_visu = VISU.individu_graph_visualization_formating(g_patient.dbg.copy(), g_reference.dbg.copy())
        g_patient_clean_visu = VISU.individu_graph_visualization_formating(g_patient.dbgclean.copy(), g_reference.dbg.copy())
        g_patient_merged = VISU.merge_individu_graph(g_patient.dbg.copy(), g_reference.dbg.copy())
        g_patient_merged_visu = VISU.individu_graph_merged_visualization_formating(g_patient_merged.copy(), g_reference.dbg.copy())
        g_patient_clean_merged = VISU.merge_individu_graph(g_patient.dbgclean.copy(), g_reference.dbg.copy())
        g_patient_clean_merged_visu = VISU.individu_graph_merged_visualization_formating(g_patient_clean_merged.copy(), g_reference.dbg.copy())
        nx.write_gml(g_patient_visu, destination_directory + "/" + graph_name + str(kmer_length) + ".gml")
        nx.write_gml(g_patient_clean_visu, destination_directory + "/" + cleaned_graph_name + str(kmer_length) + ".gml")
        nx.write_gml(g_patient_merged_visu, destination_directory + "/" + merged_graph_name + str(kmer_length) + ".gml")
        nx.write_gml(g_patient_clean_merged_visu, destination_directory + "/" + merged_cleaned_graph_name + str(kmer_length) + ".gml")

    # Annotation
    if experiment_name == "TP53":
        annotate_and_output_results(g_patient, g_reference, output_results)

    # SNP: report read support for each known SNP's kmer pair.
    # BUGFIX: handle closed via context manager.
    dir_stat = get_or_create_dir("output/snp")
    with open(dir_stat + "/snp_" + sample_key + ".tsv", 'w') as graph_snp:
        for snp_id in g_reference.snp.keys():
            if g_reference.snp[snp_id][1] in g_patient.dbgclean:
                if g_reference.snp[snp_id][0] in g_patient.dbgclean:
                    graph_snp.write("%s\t%s\t%d\t%d\n" % (
                        sample_key, snp_id,
                        len(g_patient.dbg.node[g_reference.snp[snp_id][0]]['read_list_n']),
                        len(g_patient.dbg.node[g_reference.snp[snp_id][1]]['read_list_n'])))
                else:
                    graph_snp.write("%s\t%s\t0\t%d\n" % (
                        sample_key, snp_id,
                        len(g_patient.dbg.node[g_reference.snp[snp_id][1]]['read_list_n'])))
def process_sample(kmer_length, min_support_percentage, n_permutations, p_value_threshold, max_len, sample_key=None, fastq_files=None, fasta_file=None, snp_file=None, experiment_name=None, output_results=None, disable_cycle_breaking=False):
    """Run the alteration-detection pipeline for one patient sample.

    Builds the reference graph (from FASTA + SNP files) and the patient
    de-Bruijn graph, retries with k+1 while either graph contains cycles
    (hard limit k==70, patient-graph retry skipped when
    *disable_cycle_breaking* is set), runs the permutation test to attach
    p-values to candidate alterations, annotates the results and reports
    read support for known SNPs.

    :param kmer_length: k-mer size for both graphs (increased on cycles)
    :param min_support_percentage: minimum read support (in %) kept by cleaning
    :param n_permutations: number of random graphs for the permutation test
    :param p_value_threshold: significance threshold for alterations
    :param max_len: maximum alteration length passed to alteration_list_init
    :param sample_key: sample identifier embedded in output file names
    :param fastq_files: comma-separated FASTQ paths
    :param fasta_file: reference FASTA path
    :param snp_file: known-SNP description path
    :param experiment_name: experiment used to initialise the sequence library
    :param output_results: destination handed to annotate_and_output_results
    :param disable_cycle_breaking: skip the patient-graph k-increase retry
    """
    # the sequence library is experiment-specific and must be initialised first
    import seq_lib as seq_lib_module
    seq_lib_module.library_itit(experiment_name)

    # g_reference construction
    logger.info("Will build reference graph with k==%d and fasta=%s & snp=%s", kmer_length, fasta_file, snp_file)
    g_reference = RG(kmer_length, fasta_file, snp_file)

    # Is there cycles in reference graph?
    if list(nx.simple_cycles(g_reference.dbg)):
        if kmer_length >= 70:
            logger.info("There are always cycle(s) with k==70...exiting")
            sys.exit(0)  # Check non depassement valeur limite de k
        logger.info("[Reference graph] Increasing k to %d to remove cycles", kmer_length + 1)
        return process_sample(kmer_length=kmer_length + 1, min_support_percentage=min_support_percentage, n_permutations=n_permutations, p_value_threshold=p_value_threshold, max_len=max_len, sample_key=sample_key, fastq_files=fastq_files, fasta_file=fasta_file, snp_file=snp_file, experiment_name=experiment_name, output_results=output_results, disable_cycle_breaking=disable_cycle_breaking)

    # g_patient construction
    logger.info("Will build patient graph for %s with k==%d and minimum support = %dpct", fastq_files, kmer_length, min_support_percentage)
    fastq_files = fastq_files.split(",")
    g_patient = PG(fastq_files, kmer_length)
    logger.info("Before cleaning: %d nodes", len(g_patient.dbg))
    g_patient.graph_cleaned_init(min_support_percentage)
    logger.info("After cleaning: %d nodes", len(g_patient.dbgclean))

    # Is there cycles in patient graph?
    if not disable_cycle_breaking and list(nx.simple_cycles(g_patient.dbgclean)):
        if kmer_length >= 70:
            logger.info("There are still cycle(s) with k==70...exiting")
            sys.exit(0)  # Check non depassement valeur limite de k
        logger.info("[Sample graph] Increasing k to %d to remove cycles", kmer_length + 1)
        # CONSISTENCY FIX: forward disable_cycle_breaking explicitly (it is
        # necessarily False in this branch, so behaviour is unchanged).
        return process_sample(kmer_length=kmer_length + 1, min_support_percentage=min_support_percentage, n_permutations=n_permutations, p_value_threshold=p_value_threshold, max_len=max_len, sample_key=sample_key, fastq_files=",".join(fastq_files), fasta_file=fasta_file, snp_file=snp_file, experiment_name=experiment_name, output_results=output_results, disable_cycle_breaking=disable_cycle_breaking)

    # copy g_patient cleaned and remove reference edges on it (.dbg_refrm creation)
    g_patient.graph_rmRefEdges_init(g_patient.dbgclean, g_reference.dbg)
    # search for alternative paths in dbg_refrm (.alteration_list creation)
    g_patient.alteration_list_init(g_reference.dbg, kmer_length, min_support_percentage, max_len)

    ### Permutation test ###
    logger.info("Will create random graphs")
    all_possible_kmers = set()
    for an_alt in g_patient.alteration_list:
        all_possible_kmers.update(an_alt.reference_path)
        all_possible_kmers.update(an_alt.alternative_path)
    for _, _ in time_iterator(range(0, n_permutations), logger, msg_prefix="permuting"):
        g_random = RRG(g_patient.coverage, kmer_length, restrict_to=all_possible_kmers, seq_lib_module=seq_lib_module)
        for alteration in g_patient.alteration_list:
            g_random_data = g_random.check_path(alteration.reference_path, alteration.alternative_path, alteration.min_coverage)
            alteration.random_ratio_list.append(g_random_data[0])
            alteration.random_reference_count_list.append(g_random_data[1])
            alteration.random_alternative_count_list.append(g_random_data[2])
    logger.info("Will generate p-values for %d possible alterations", len(g_patient.alteration_list))
    for alteration in g_patient.alteration_list:
        alteration.pvalue_init()
    g_patient.significant_alteration_list_init(p_value_threshold=p_value_threshold)

    # Annotation
    annotate_and_output_results(g_patient, g_reference, output_results)

    # SNP: report read support for each known SNP's kmer pair.
    # BUGFIX: the output handle is now closed via a context manager.
    dir_stat = get_or_create_dir("output/snp")
    with open(dir_stat + "/snp_" + sample_key + ".tsv", 'w') as graph_snp:
        for snp_id in g_reference.snp.keys():
            if g_reference.snp[snp_id][1] in g_patient.dbgclean:
                if g_reference.snp[snp_id][0] in g_patient.dbgclean:
                    graph_snp.write("%s\t%s\t%d\t%d\n" % (
                        sample_key, snp_id,
                        len(g_patient.dbg.node[g_reference.snp[snp_id][0]]['read_list_n']),
                        len(g_patient.dbg.node[g_reference.snp[snp_id][1]]['read_list_n'])))
                else:
                    graph_snp.write("%s\t%s\t0\t%d\n" % (
                        sample_key, snp_id,
                        len(g_patient.dbg.node[g_reference.snp[snp_id][1]]['read_list_n'])))
def process_sample(kmer_length, min_support_percentage, n_permutations, p_value_threshold, max_len, sample_key=None, fastq_files=None, fasta_file=None, snp_file=None, experiment_name=None, output_results=None, disable_cycle_breaking=False):
    """Run the alteration-detection pipeline for one patient sample.

    Builds the reference graph (from FASTA + SNP files) and the patient
    de-Bruijn graph, retries with k+1 while either graph contains cycles
    (hard limit k==70, patient-graph retry skipped when
    *disable_cycle_breaking* is set), runs the permutation test to attach
    p-values to candidate alterations, annotates the results and reports
    read support for known SNPs.

    :param kmer_length: k-mer size for both graphs (increased on cycles)
    :param min_support_percentage: minimum read support (in %) kept by cleaning
    :param n_permutations: number of random graphs for the permutation test
    :param p_value_threshold: significance threshold for alterations
    :param max_len: maximum alteration length passed to alteration_list_init
    :param sample_key: sample identifier embedded in output file names
    :param fastq_files: comma-separated FASTQ paths
    :param fasta_file: reference FASTA path
    :param snp_file: known-SNP description path
    :param experiment_name: experiment used to initialise the sequence library
    :param output_results: destination handed to annotate_and_output_results
    :param disable_cycle_breaking: skip the patient-graph k-increase retry
    """
    # the sequence library is experiment-specific and must be initialised first
    import seq_lib as seq_lib_module
    seq_lib_module.library_itit(experiment_name)

    # g_reference construction
    logger.info("Will build reference graph with k==%d and fasta=%s & snp=%s", kmer_length, fasta_file, snp_file)
    g_reference = RG(kmer_length, fasta_file, snp_file)

    # Is there cycles in reference graph?
    if list(nx.simple_cycles(g_reference.dbg)):
        if kmer_length >= 70:
            logger.info("There are always cycle(s) with k==70...exiting")
            sys.exit(0)  # Check non depassement valeur limite de k
        logger.info("[Reference graph] Increasing k to %d to remove cycles", kmer_length + 1)
        return process_sample(kmer_length=kmer_length + 1, min_support_percentage=min_support_percentage, n_permutations=n_permutations, p_value_threshold=p_value_threshold, max_len=max_len, sample_key=sample_key, fastq_files=fastq_files, fasta_file=fasta_file, snp_file=snp_file, experiment_name=experiment_name, output_results=output_results, disable_cycle_breaking=disable_cycle_breaking)

    # g_patient construction
    logger.info("Will build patient graph for %s with k==%d and minimum support = %dpct", fastq_files, kmer_length, min_support_percentage)
    fastq_files = fastq_files.split(",")
    g_patient = PG(fastq_files, kmer_length)
    logger.info("Before cleaning: %d nodes", len(g_patient.dbg))
    g_patient.graph_cleaned_init(min_support_percentage)
    logger.info("After cleaning: %d nodes", len(g_patient.dbgclean))

    # Is there cycles in patient graph?
    if not disable_cycle_breaking and list(nx.simple_cycles(g_patient.dbgclean)):
        if kmer_length >= 70:
            logger.info("There are still cycle(s) with k==70...exiting")
            sys.exit(0)  # Check non depassement valeur limite de k
        logger.info("[Sample graph] Increasing k to %d to remove cycles", kmer_length + 1)
        # CONSISTENCY FIX: forward disable_cycle_breaking explicitly (it is
        # necessarily False in this branch, so behaviour is unchanged).
        return process_sample(kmer_length=kmer_length + 1, min_support_percentage=min_support_percentage, n_permutations=n_permutations, p_value_threshold=p_value_threshold, max_len=max_len, sample_key=sample_key, fastq_files=",".join(fastq_files), fasta_file=fasta_file, snp_file=snp_file, experiment_name=experiment_name, output_results=output_results, disable_cycle_breaking=disable_cycle_breaking)

    # copy g_patient cleaned and remove reference edges on it (.dbg_refrm creation)
    g_patient.graph_rmRefEdges_init(g_patient.dbgclean, g_reference.dbg)
    # search for alternative paths in dbg_refrm (.alteration_list creation)
    g_patient.alteration_list_init(g_reference.dbg, kmer_length, min_support_percentage, max_len)

    ### Permutation test ###
    logger.info("Will create random graphs")
    all_possible_kmers = set()
    for an_alt in g_patient.alteration_list:
        all_possible_kmers.update(an_alt.reference_path)
        all_possible_kmers.update(an_alt.alternative_path)
    for _, _ in time_iterator(range(0, n_permutations), logger, msg_prefix="permuting"):
        g_random = RRG(g_patient.coverage, kmer_length, restrict_to=all_possible_kmers, seq_lib_module=seq_lib_module)
        for alteration in g_patient.alteration_list:
            g_random_data = g_random.check_path(alteration.reference_path, alteration.alternative_path, alteration.min_coverage)
            alteration.random_ratio_list.append(g_random_data[0])
            alteration.random_reference_count_list.append(g_random_data[1])
            alteration.random_alternative_count_list.append(g_random_data[2])
    logger.info("Will generate p-values for %d possible alterations", len(g_patient.alteration_list))
    for alteration in g_patient.alteration_list:
        alteration.pvalue_init()
    g_patient.significant_alteration_list_init(p_value_threshold=p_value_threshold)

    # Annotation
    annotate_and_output_results(g_patient, g_reference, output_results)

    # SNP: report read support for each known SNP's kmer pair.
    # BUGFIX: the output handle is now closed via a context manager.
    dir_stat = get_or_create_dir("output/snp")
    with open(dir_stat + "/snp_" + sample_key + ".tsv", 'w') as graph_snp:
        for snp_id in g_reference.snp.keys():
            if g_reference.snp[snp_id][1] in g_patient.dbgclean:
                if g_reference.snp[snp_id][0] in g_patient.dbgclean:
                    graph_snp.write("%s\t%s\t%d\t%d\n" % (
                        sample_key, snp_id,
                        len(g_patient.dbg.node[g_reference.snp[snp_id][0]]['read_list_n']),
                        len(g_patient.dbg.node[g_reference.snp[snp_id][1]]['read_list_n'])))
                else:
                    graph_snp.write("%s\t%s\t0\t%d\n" % (
                        sample_key, snp_id,
                        len(g_patient.dbg.node[g_reference.snp[snp_id][1]]['read_list_n'])))
def process_sample(kmer_length, min_support_percentage, n_permutations, p_value_threshold, sample_key=None, fastq_files=None, fasta_file=None, snp_file=None, experiment_name=None, destination_directory=".", export_gml=False, output_results=None):
    """Run the alteration-detection pipeline for one patient sample.

    Builds the reference graph (from FASTA + SNP files) and the patient
    de-Bruijn graph, retries with k+1 while either graph contains cycles
    (hard limit k==70), dumps graph/k-mer statistics, runs the permutation
    test to attach p-values to candidate alterations, writes per-alteration
    statistics, optionally exports GML visualisations, annotates TP53
    results, and reports read support for known SNPs.

    :param kmer_length: k-mer size for both graphs (increased on cycles)
    :param min_support_percentage: minimum read support (in %) kept by cleaning
    :param n_permutations: number of random graphs for the permutation test
    :param p_value_threshold: significance threshold for alterations
    :param sample_key: sample identifier embedded in output file names
    :param fastq_files: comma-separated FASTQ paths
    :param fasta_file: reference FASTA path
    :param snp_file: known-SNP description path
    :param experiment_name: "TP53" selects the specialised random-graph module
    :param destination_directory: target directory for GML exports
    :param export_gml: when True, also export visualisation graphs as GML
    :param output_results: destination handed to annotate_and_output_results
    """
    # TP53 uses a dedicated random-read-graph implementation
    if experiment_name == "TP53":
        from randomreadsgraph_TP53 import RandomReadsGraph as RRG
    else:
        from randomreadsgraph import RandomReadsGraph as RRG

    # g_reference construction
    logger.info("Will build reference graph with k==%d and fasta=%s & snp=%s", kmer_length, fasta_file, snp_file)
    g_reference = RG(kmer_length, fasta_file, snp_file)

    # Is there cycles in reference graph?
    if list(nx.simple_cycles(g_reference.dbg)):
        if kmer_length > 70:
            logger.info("There are always cycle(s) with k==70...exiting")
            sys.exit(0)  # Check non depassement valeur limite de k
        # BUGFIX: log the k that will actually be used (k+1), not the current one
        logger.info("[Reference graph] Increasing k to %d to remove cycles", kmer_length + 1)
        return process_sample(kmer_length=kmer_length + 1, sample_key=sample_key, fastq_files=fastq_files, fasta_file=fasta_file, snp_file=snp_file, experiment_name=experiment_name, min_support_percentage=min_support_percentage, n_permutations=n_permutations, destination_directory=destination_directory, export_gml=export_gml, p_value_threshold=p_value_threshold, output_results=output_results)

    # g_patient construction
    logger.info("Will build patient graph for %s with k==%d and minimum support = %dpct", fastq_files, kmer_length, min_support_percentage)
    fastq_files = fastq_files.split(",")
    g_patient = PG(fastq_files, kmer_length)
    logger.info("Before cleaning: %d nodes", len(g_patient.dbg))
    g_patient.graph_cleaned_init(min_support_percentage)
    logger.info("After cleaning: %d nodes", len(g_patient.dbgclean))

    # Is there cycles in patient graph?
    if list(nx.simple_cycles(g_patient.dbgclean)):
        if kmer_length > 70:
            logger.info("There are still cycle(s) with k==70...exiting")
            sys.exit(0)  # Check non depassement valeur limite de k
        # BUGFIX: log the k that will actually be used (k+1), not the current one
        logger.info("[Sample graph] Increasing k to %d to remove cycles", kmer_length + 1)
        return process_sample(kmer_length=kmer_length + 1, sample_key=sample_key, fastq_files=",".join(fastq_files), fasta_file=fasta_file, snp_file=snp_file, experiment_name=experiment_name, min_support_percentage=min_support_percentage, n_permutations=n_permutations, destination_directory=destination_directory, export_gml=export_gml, p_value_threshold=p_value_threshold, output_results=output_results)

    # Some prints for stats
    dir_stat = get_or_create_dir("output/statistics")
    # graph stat.  BUGFIX: handle closed via context manager (it was leaked).
    with open(dir_stat + "/graph_stat_file" + sample_key + ".tsv", 'w') as graph_stat_file:
        graph_stat_file.write(
            "%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n" % (
                kmer_length,
                g_reference.dbg.size(),
                sample_key,
                g_patient.coverage['total'],
                g_patient.dbg.size(),
                g_patient.dbgclean.size(),
                g_patient.dbg.in_degree().values().count(0),
                g_patient.dbg.out_degree().values().count(0),
                g_patient.dbgclean.in_degree().values().count(0),
                g_patient.dbgclean.out_degree().values().count(0)))
    # kmer stat.  BUGFIX: handle closed via context manager.
    with open(dir_stat + "/kmer_stat_file" + sample_key + ".tsv", 'w') as kmer_stat_file:
        for node_print in g_patient.dbg.nodes():
            fragment_print = ",".join(g_patient.dbg.node[node_print]['fastq_id'])
            reads_print = len(g_patient.dbg.node[node_print]['read_list_n'])
            kmer_stat_file.write("%s\t%s\t%s\t%d\n" % (sample_key, node_print, fragment_print, reads_print))

    # copy g_patient cleaned and remove reference edges on it (.dbg_refrm creation)
    g_patient.graph_rmRefEdges_init(g_patient.dbgclean, g_reference.dbg)
    # search for alternative paths in dbg_refrm (.alteration_list creation)
    g_patient.alteration_list_init(g_reference.dbg, kmer_length, min_support_percentage)

    ### Permutation test ###
    logger.info("Will create random graphs")
    all_possible_kmers = set()
    for an_alt in g_patient.alteration_list:
        all_possible_kmers.update(an_alt.reference_path)
        all_possible_kmers.update(an_alt.alternative_path)
    for i, j in time_iterator(range(0, n_permutations), logger, msg_prefix="permuting"):
        g_random = RRG(g_patient.coverage, kmer_length, restrict_to=all_possible_kmers)
        for alteration in g_patient.alteration_list:
            g_random_data = g_random.check_path(alteration.reference_path, alteration.alternative_path, alteration.min_coverage)
            alteration.random_ratio_list.append(g_random_data[0])
            alteration.random_reference_count_list.append(g_random_data[1])
            alteration.random_alternative_count_list.append(g_random_data[2])
    logger.info("Will generate p-values for %d possible alterations", len(g_patient.alteration_list))
    for alteration in g_patient.alteration_list:
        alteration.pvalue_init()
    g_patient.significant_alteration_list_init(p_value_threshold=p_value_threshold)
    # If more than one significant alteration, check if they are not in "spike" (en épis)
    if len(g_patient.significant_alteration_list) > 1:
        g_patient.multiple_alternative_path_filter()

    ## Stat
    # alteration stat.  BUGFIX: handle closed via context manager.
    with open(dir_stat + "/alt_stat_file" + sample_key + ".tsv", 'w') as alt_stat_file:
        for i_alteration, alteration in enumerate(g_patient.alteration_list):
            if alteration.pvalue_ratio <= 1:
                alt_stat_file.write("%d\t%s\t%d\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\n" % (
                    i_alteration + 1,
                    sample_key,
                    g_patient.coverage['total'],
                    alteration.reference_sequence,
                    alteration.alternative_sequence,
                    alteration.reference_read_count,
                    alteration.alternative_read_count,
                    alteration.ratio_read_count,
                    alteration.pvalue_ratio,
                    str(alteration.zscore),
                    "\t".join(map(str, alteration.random_ratio_list))))

    # For visualisation
    graph_name = "G_%s_" % sample_key
    merged_graph_name = "G_%s_merged_" % sample_key
    cleaned_graph_name = graph_name + "clean%d_" % min_support_percentage
    merged_cleaned_graph_name = graph_name + "clean%d_merged_" % min_support_percentage
    if export_gml:
        logger.info("Will save viz graph for %s with k==%d", sample_key, kmer_length)
        get_or_create_dir(destination_directory)
        # for the refrence graph
        g_reference_merge = VISU.merge_reference_graph(g_reference.dbg.copy())
        g_reference_visu = VISU.reference_graph_visualization_formatting(g_reference.dbg.copy())
        g_reference_merge_visu = VISU.reference_graph_merged_visualization_formatting(g_reference_merge.copy())
        nx.write_gml(g_reference_visu, destination_directory + "/g_reference_visu" + str(kmer_length) + ".gml")
        nx.write_gml(g_reference_merge_visu, destination_directory + "/g_reference_merge_visu" + str(kmer_length) + ".gml")
        # for the patient graph
        g_patient_visu = VISU.individu_graph_visualization_formating(g_patient.dbg.copy(), g_reference.dbg.copy())
        g_patient_clean_visu = VISU.individu_graph_visualization_formating(g_patient.dbgclean.copy(), g_reference.dbg.copy())
        g_patient_merged = VISU.merge_individu_graph(g_patient.dbg.copy(), g_reference.dbg.copy())
        g_patient_merged_visu = VISU.individu_graph_merged_visualization_formating(g_patient_merged.copy(), g_reference.dbg.copy())
        g_patient_clean_merged = VISU.merge_individu_graph(g_patient.dbgclean.copy(), g_reference.dbg.copy())
        g_patient_clean_merged_visu = VISU.individu_graph_merged_visualization_formating(g_patient_clean_merged.copy(), g_reference.dbg.copy())
        nx.write_gml(g_patient_visu, destination_directory + "/" + graph_name + str(kmer_length) + ".gml")
        nx.write_gml(g_patient_clean_visu, destination_directory + "/" + cleaned_graph_name + str(kmer_length) + ".gml")
        nx.write_gml(g_patient_merged_visu, destination_directory + "/" + merged_graph_name + str(kmer_length) + ".gml")
        nx.write_gml(g_patient_clean_merged_visu, destination_directory + "/" + merged_cleaned_graph_name + str(kmer_length) + ".gml")

    # Annotation
    if experiment_name == "TP53":
        annotate_and_output_results(g_patient, g_reference, output_results)

    # SNP: report read support for each known SNP's kmer pair.
    # BUGFIX: handle closed via context manager.
    dir_stat = get_or_create_dir("output/snp")
    with open(dir_stat + "/snp_" + sample_key + ".tsv", 'w') as graph_snp:
        for snp_id in g_reference.snp.keys():
            if g_reference.snp[snp_id][1] in g_patient.dbgclean:
                if g_reference.snp[snp_id][0] in g_patient.dbgclean:
                    graph_snp.write("%s\t%s\t%d\t%d\n" % (
                        sample_key, snp_id,
                        len(g_patient.dbg.node[g_reference.snp[snp_id][0]]['read_list_n']),
                        len(g_patient.dbg.node[g_reference.snp[snp_id][1]]['read_list_n'])))
                else:
                    graph_snp.write("%s\t%s\t0\t%d\n" % (
                        sample_key, snp_id,
                        len(g_patient.dbg.node[g_reference.snp[snp_id][1]]['read_list_n'])))