# Script-level switches for this run.
rearrangement = False
# Disabled definition of the deletion interval:
# deletion = Interval("chr" + chr_num, start, end)

# Set to True to write a training file consisting of several chromosomes.
write_all_chrms_in_file = False
# Set to True to use all contacts in the region, without empty contacts.
# The value is forwarded to read_data() below.
fill_empty_contacts = False

logging.getLogger(__name__).debug(
    "Using input folder " + input_folder
)

# Read the genome, restricted to the single chromosome of interest.
genome = fastaReader(
    args['path_to_genome'],
    useOnlyChromosomes=[chromosome],
).read_data()
print(genome.data)
now = datetime.datetime.now()

# Read contacts data from the .hic file of the selected cell type.
cell_type_dir = input_folder + "/" + cell_type
params.contacts_reader = hicReader(
    fname=cell_type_dir + "/" + hic_name,
    genome=genome,
    binsize=params.binsize,
)
params.contacts_reader = params.contacts_reader.read_data(
    fill_empty_contacts=fill_empty_contacts,
    noDump=False,
)

# Optionally keep only contacts that have CTCF sites nearby.
if params.use_only_contacts_with_CTCF == "cont_with_CTCF":
    params.proportion = 1
    # The peaks file and its "-orient.bed" companion share one path prefix.
    ctcf_peaks_path = cell_type_dir + "/CTCF/" + CTCF_file_name
    params.contacts_reader.use_contacts_with_CTCF(
        CTCFfile=ctcf_peaks_path,
        maxdist=params.maxdist,
        proportion=params.proportion,
        keep_only_orient=params.keep_only_orient,
        CTCForientfile=ctcf_peaks_path + "-orient.bed",
    )
    # Append the reader's conts_with_ctcf value to the option string.
    params.use_only_contacts_with_CTCF += str(
        params.contacts_reader.conts_with_ctcf
    )

# make deletion
# Parameters of this training-data generation run.
params.sample_size = 100  # how many contacts to write to the file
params.conttype = conttype
params.max_cpus = 11
params.keep_only_orient = False  # set True to use only CTCF sites with orientation
# params.use_only_contacts_with_CTCF = "cont_with_CTCF"
# Use this option to change the proportion of contacts with nearest CTCF
# sites in the training datasets.
params.use_only_contacts_with_CTCF = "no"
# Set True when training on several chromosomes; needed for writing
# different chromosomes into the same file.
write_all_chrms_in_file = True
fill_empty_contacts = False
logging.getLogger(__name__).debug("Using input folder " + input_folder)

# Read contacts data
params.contacts_reader = ContactsReader()
# Paths to the contacts files, one per entry of chr_nums.
# Contacts file format: bin_start--bin_end--contact_count
# The list is built directly instead of calling append() inside a throwaway
# comprehension, and the loop variable no longer shadows the builtin chr().
# NOTE(review): "K562" is hard-coded in the contacts file name while the
# coefficient file name below uses cell_type -- confirm this is intended.
contacts_files = [
    input_folder + "chr" + chr_num + ".5MB.K562." + params.conttype
    for chr_num in chr_nums
]
params.contacts_reader.read_files(
    contacts_files,
    coeff_fname=input_folder + "coefficient." + cell_type + ".25000.txt",
    max_cpus=params.max_cpus,
    fill_empty_contacts=fill_empty_contacts,
    maxdist=params.maxdist,
    expected_binsize=25000,
)
write_all_chrms_in_file = False #set True if you want write training file consisting several chromosomes fill_empty_contacts = True #set True if you want use all contacts in region, without empty contacts logging.getLogger(__name__).debug("Using input folder " + input_folder) # Read contacts data genome = fastaReader(input_folder + "sequence/hg38/hg38.fa", name="hg38", useOnlyChromosomes=["chr3"]) genome = genome.read_data() # print(genome) # print(genome.data.keys()) now = datetime.datetime.now() params.contacts_reader = hicReader(fname=input_folder + "H1/4DNFI2TK7L2F.hic", genome=genome, binsize=1000) # params.contacts_reader = hicReader(fname=input_folder + "H1/control.chr4.50KBhic", genome=genome, binsize=1000) params.contacts_reader = params.contacts_reader.read_data() if params.use_only_contacts_with_CTCF == "cont_with_CTCF": params.proportion = 1 params.contacts_reader.use_contacts_with_CTCF( CTCFfile=input_folder + "H1/CTCF/CTCF_H1_conservative_peaks.bed.gz", maxdist=params.maxdist, proportion=params.proportion, keep_only_orient=params.keep_only_orient, CTCForientfile=input_folder + "H1/CTCF/CTCF_H1_conservative_peaks_orient.bed") params.use_only_contacts_with_CTCF += str(