conttype = sys.argv[2]  # contact data type: "contacts.gz" or "oe.gz"

# Example values for a manual run:
# chr_num="12,13,14"
# conttype = "contacts.gz"

# Configure the root logger exactly once, at import time.
# logging.basicConfig() does nothing when the root logger already has
# handlers, so it must be called a single time with the full format;
# a preceding level-only call would silently discard format/datefmt.
logging.basicConfig(format='%(asctime)s %(name)s: %(message)s',
                    datefmt='%I:%M:%S',
                    level=logging.DEBUG)

if __name__ == '__main__':  # Required for parallelization, at least on Windows
    # Single-element loop: the body runs once, for the conttype read above.
    for conttype in [conttype]:
        input_folder = "input/K562/"
        output_folder = "output/K562/"
        cell_type = "K562"
        params = Parameters()
        params.window_size = 25000  # region around contact to be binned for predictors
        params.mindist = 50001  # minimum distance between contacting regions
        params.maxdist = 1500000  # maximum distance between contacting regions
        params.sample_size = 100  # how many contacts to write to file
        params.conttype = conttype
        params.max_cpus = 11
        params.keep_only_orient = False  # set True to use only CTCF sites with orientation
        # Use this option to change the proportion of contacts with nearest
        # CTCF sites in training datasets; alternative value: "cont_with_CTCF".
        params.use_only_contacts_with_CTCF = "no"

        # Set True when training on a few chromosomes: needed for writing
        # different chromosomes into the same file.
        write_all_chrms_in_file = True

        fill_empty_contacts = False
Пример #2
0
conttype = sys.argv[2]  # contact data type, taken from the second command-line argument

# Example values for a manual run:
# chr_num="12"
# conttype = "contacts.gz"

if __name__ == '__main__': # Required for parallelization, at least on Windows
    # Single-element loop: the body runs once, for the conttype read above.
    for conttype in [conttype]:
        print("hello")  # NOTE(review): looks like a leftover debug print — confirm before removing
        logging.basicConfig(format='%(asctime)s %(name)s: %(message)s', datefmt='%I:%M:%S', level=logging.DEBUG)

        # Absolute, machine-specific input/output locations for the K562 data.
        input_folder ="/mnt/scratch/ws/psbelokopytova/202001051010polina_data/3DPredictor/input/K562/"
        output_folder = "/mnt/scratch/ws/psbelokopytova/202001051010polina_data/3DPredictor/out/K562/5KB/all_predictors/"
        cell_type="K562"
        # NOTE(review): the meaning of these per-chromosome numbers is not visible in this fragment — confirm
        lengths_dict = {'chr1': 1494930, 'chr3': 609806, 'chr5': 518646, 'chr7': 682860, 'chr11': 726290, 'chr13': 115324}
        params = Parameters()
        params.binsize = 5000 # sequence resolution of the contacts data; used to find the normalized coefficient file
        params.window_size = params.binsize # region around contact to be binned for predictors; usually equal to binsize
        params.mindist = params.binsize*2+1 # minimum distance between contacting regions
        params.maxdist = 1500000 # maximum distance between contacting regions
        params.sample_size = 250000 # how many contacts to write to file
        params.conttype = conttype
        params.max_cpus = 11
        params.keep_only_orient=False
        params.use_only_contacts_with_CTCF = "all_cont" # alternative value: "cont_with_CTCF"

        write_all_chrms_in_file=False # set True to write a training file consisting of several chromosomes
        fill_empty_contacts = False # set True if you want to use all contacts in the region, without empty contacts

        logging.getLogger(__name__).debug("Using input folder "+input_folder)
Пример #3
0
import pandas as pd
import os

if __name__ == '__main__':  # Required for parallelization, at least on Windows
    # Run the body once per contact data type.
    for conttype in ["contacts.gz", "oe.gz"]:
        # NOTE: basicConfig() only configures the root logger when it has no
        # handlers yet, so the call on the second iteration has no effect.
        logging.basicConfig(format='%(asctime)s %(name)s: %(message)s',
                            datefmt='%I:%M:%S',
                            level=logging.DEBUG)

        input_folder = "input/Hepat/"
        # Alternative output location: "D:/Users/Polina/3Dpredictor/"
        output_folder = "out/Hepat/validating_chrms/"
        # Alternative input location: "input"

        params = Parameters()
        params.window_size = 25000  # region around contact to be binned for predictors
        # params.small_window_size = 12500  # region around contact anchors to be considered as cis
        params.mindist = 50001  # minimum distance between contacting regions
        # params.maxdist = params.window_size  # max distance between contacting regions
        params.maxdist = 1500000
        # params.binsize = 20000  # when binning regions with predictors, use this binsize
        params.sample_size = 25000  # how many contacts to write to file
        # params.conttype = "oe.gz"
        params.conttype = conttype
        params.max_cpus = 12

        logging.getLogger(__name__).debug("Using input folder " + input_folder)

        # Read contacts data
        params.contacts_reader = ContactsReader()
Пример #4
0
# Script entry point: parse the command line, copy the options into locals,
# build the Parameters object and attach the CTCF ChIP-seq reader.
if __name__ == '__main__':
    parser = createParser()
    namespace = parser.parse_args(sys.argv[1:])

    # Unpack the parsed command-line options.
    RNA_seq_file = namespace.RNA_seq_file
    CTCF_file = namespace.CTCF_file
    CTCF_orient_file = namespace.CTCF_orient_file
    # NOTE(review): this assignment shadows the builtin chr() at module level.
    chr = namespace.chr
    interval_start = namespace.interval_start
    interval_end = namespace.interval_end
    resolution = int(namespace.resolution)
    model_path = namespace.model_path
    out_file = namespace.out_file

    params = Parameters()
    # resolution is already an int at this point; the extra int() calls below are no-ops.
    params.window_size = int(
        resolution)  # region around contact to be binned for predictors
    params.mindist = int(
        resolution) * 2 + 1  # minimum distance between contacting regions
    params.maxdist = 1500000  # maximum distance between contacting regions
    params.max_cpus = 1
    # params.keep_only_orient=False  # set True to use only CTCF sites with orientation
    params.multiprocessing = False
    # params.write_to_file = False
    # Read CTCF data
    # CTCF_file format: ENCODE narrow peak
    # CTCF_orient_file format: chr--start--end--name--score--strand
    logging.info('create CTCF_PG')
    # Set the path to the CTCF ChIP-seq file:
    params.ctcf_reader = ChiPSeqReader(CTCF_file, name="CTCF")
conttype = sys.argv[2]  # contact data type, taken from the second command-line argument

# Example values for a manual run:
# chr_num="12"
# conttype = "contacts.gz"

if __name__ == '__main__': # Required for parallelization, at least on Windows
    # Single-element loop: the body runs once, for the conttype read above.
    for conttype in [conttype]:
        print("hello")  # NOTE(review): looks like a leftover debug print — confirm before removing
        logging.basicConfig(format='%(asctime)s %(name)s: %(message)s', datefmt='%I:%M:%S', level=logging.DEBUG)

        # Absolute, machine-specific input/output locations for the NPC data.
        input_folder ="/mnt/scratch/ws/psbelokopytova/201907031108polinaB/3DPredictor/input/NPC/"
        # Alternative output location: "D:/Users/Polina/3Dpredictor/"
        output_folder = "/mnt/scratch/ws/psbelokopytova/201907031108polinaB/3DPredictor/out/NPC/5KB/"
        cell_type="NPC"
        params = Parameters()
        params.window_size = 5000 # region around contact to be binned for predictors
        params.mindist = 10001 # minimum distance between contacting regions
        params.maxdist = 1500000 # maximum distance between contacting regions
        params.sample_size = 1 # how many contacts to write to file
        params.conttype = conttype
        params.max_cpus = 11
        params.keep_only_orient=False
        params.use_only_contacts_with_CTCF = "cont_with_CTCF" # alternative value: "all_cont"

        write_all_chrms_in_file=True
        fill_empty_contacts = False


        logging.getLogger(__name__).debug("Using input folder "+input_folder)
Пример #6
0
from DataGenerator import generate_data
from PredictorGenerators import E1PredictorGenerator,ChipSeqPredictorGenerator, \
                            SmallChipSeqPredictorGenerator,SmallE1PredictorGenerator, SitesOrientPredictorGenerator, OrientBlocksPredictorGenerator, \
                            SitesOnlyOrientPredictorGenerator
# Flat driver script: records the start time, configures logging, sets the
# input/output folders and contact-sampling parameters, and derives the
# training/validation file names from the parameters.
start_time = time.time()

logging.basicConfig(format='%(asctime)s %(name)s: %(message)s',
                    datefmt='%I:%M:%S',
                    level=logging.DEBUG)

# Absolute, machine-specific locations.
input_folder = "/home/evgeniy/asp/3Dpredictor/input/"
# Alternative output location: "D:/Users/Polina/3Dpredictor/"
output_folder = "/home/evgeniy/asp/3Dpredictor/out/"
# Alternative input location: "input"

params = Parameters()
params.window_size = 25000  # region around contact to be binned for predictors
# params.small_window_size = 12500  # region around contact anchors to be considered as cis
params.mindist = 50001  # minimum distance between contacting regions
# params.maxdist = params.window_size  # max distance between contacting regions
params.maxdist = 1000000
# params.binsize = 20000  # when binning regions with predictors, use this binsize
params.sample_size = 500  # how many contacts to write to file
params.conttype = "oe.gz"

# File names embed str(params), so they depend on how Parameters renders itself.
training_file_name = "2018-09-23-trainingOrient.RandOnChr1." + str(
    params) + ".txt"
validation_file_name = "validatingOrient." + str(params) + ".txt"
logging.getLogger(__name__).debug("Using input folder " + input_folder)

#Read contacts data
    # Unpack the run configuration from the `args` mapping (defined outside
    # this fragment) and build the Parameters object from it.
    input_folder = args['input_folder']
    output_folder = args['output_folder']
    cell_type = args['cell_type']
    start = int(args['start'])
    end = int(args['end'])
    chromosome = 'chr' + args['chr_num']  # args['chr_num'] must be a string for this concatenation
    hic_name = args['hic_name']
    CTCF_file_name = args['CTCF_file_name']
    # RNA_file_name = args['RNA_file_name']

    # validate_chrs = args['validate_chrs'].split(",")
    # for chr in validate_chrs:
    #     chr = int(chr)

    params = Parameters()
    params.binsize = int(
        args['binsize']
    )  # sequence resolution of the contacts data; used to find the normalized coefficient file
    params.window_size = params.binsize  # region around contact to be binned for predictors; usually equal to binsize
    params.mindist = params.binsize * 2 + 1  # minimum distance between contacting regions
    params.maxdist = 1500000
    # params.sample_size = end - start
    params.sample_size = 2  # how many contacts to write to file
    # params.conttype = conttype
    params.max_cpus = int(args['max_cpus'])
    params.keep_only_orient = False
    params.use_only_contacts_with_CTCF = "all_cont"  # "all_cont" or "cont_with_CTCF"
    rearrangement = False

    # deletion = Interval("chr" + chr_num, start, end)
        # Configure logging, then set folders and parameters for the
        # mast_cells data; `conttype` comes from outside this fragment.
        logging.basicConfig(format='%(asctime)s %(name)s: %(message)s',
                            datefmt='%I:%M:%S',
                            level=logging.DEBUG)

        # Absolute, machine-specific locations.
        input_folder = "/mnt/scratch/ws/psbelokopytova/202002281332polina_data_2019/3DPredictor/input/"
        output_folder = "/mnt/scratch/ws/psbelokopytova/202002281332polina_data_2019/3DPredictor/out/mast_cells/"
        cell_type = "mast_cells"
        # NOTE(review): the meaning of these per-chromosome numbers is not visible in this fragment — confirm
        lengths_dict = {
            'chr1': 1494930,
            'chr3': 609806,
            'chr5': 518646,
            'chr7': 682860,
            'chr11': 726290,
            'chr13': 115324
        }
        params = Parameters()
        params.binsize = 1000  # sequence resolution of the contacts data; used to find the normalized coefficient file
        params.window_size = params.binsize  # region around contact to be binned for predictors; usually equal to binsize
        params.mindist = params.binsize * 2 + 1  # minimum distance between contacting regions
        params.maxdist = 1500000
        params.sample_size = 500000  # how many contacts to write to file
        params.conttype = conttype
        params.max_cpus = 11
        params.keep_only_orient = False
        params.use_only_contacts_with_CTCF = "all_cont"  # alternative value: "cont_with_CTCF"

        write_all_chrms_in_file = False  # set True to write a training file consisting of several chromosomes
        fill_empty_contacts = True  # set True if you want to use all contacts in the region, without empty contacts

        logging.getLogger(__name__).debug("Using input folder " + input_folder)
Пример #9
0
        # Configure logging, then set folders and parameters for this run;
        # `conttype` comes from outside this fragment.
        logging.basicConfig(format='%(asctime)s %(name)s: %(message)s',
                            datefmt='%I:%M:%S',
                            level=logging.DEBUG)

        # Absolute, machine-specific locations.
        input_folder = "/mnt/scratch/ws/psbelokopytova/202001051010polina_data/3DPredictor/input/"
        output_folder = "/mnt/scratch/ws/psbelokopytova/202001051010polina_data/3DPredictor/out/H1/"
        # NOTE(review): cell_type is "K562" while the output folder is ".../out/H1/" — confirm this is intended
        cell_type = "K562"
        # NOTE(review): the meaning of these per-chromosome numbers is not visible in this fragment — confirm
        lengths_dict = {
            'chr1': 1494930,
            'chr3': 609806,
            'chr5': 518646,
            'chr7': 682860,
            'chr11': 726290,
            'chr13': 115324
        }
        params = Parameters()
        params.binsize = 1000  # sequence resolution of the contacts data; used to find the normalized coefficient file
        params.window_size = params.binsize  # region around contact to be binned for predictors; usually equal to binsize
        params.mindist = params.binsize * 2 + 1  # minimum distance between contacting regions
        params.maxdist = 1500000
        params.sample_size = 500000  # how many contacts to write to file
        params.conttype = conttype
        params.max_cpus = 11
        params.keep_only_orient = False
        params.use_only_contacts_with_CTCF = "all_cont"  # alternative value: "cont_with_CTCF"

        write_all_chrms_in_file = False  # set True to write a training file consisting of several chromosomes
        fill_empty_contacts = False  # set True if you want to use all contacts in the region, without empty contacts

        logging.getLogger(__name__).debug("Using input folder " + input_folder)
Пример #10
0
    1]  #comma separated number of chromosomes for predictor generation
chr_nums = chr_num.split(",")  # split the comma-separated chromosome list into individual entries
conttype = sys.argv[2]  # contact data type: "contacts.gz" or "oe.gz"

# Example values for a manual run:
# chr_num="12,13,14"
# conttype = "contacts.gz"

if __name__ == '__main__':  # Required for parallelization, at least on Windows
    # Single-element loop: the body runs once, for the conttype read above.
    for conttype in [conttype]:
        logging.basicConfig(format='%(asctime)s %(name)s: %(message)s',
                            datefmt='%I:%M:%S',
                            level=logging.DEBUG)
        input_folder = "/input/K562/"
        output_folder = "/output/K562/"
        cell_type = "K562"
        params = Parameters()
        params.window_size = 5000  # region around contact to be binned for predictors
        params.mindist = 10001  # minimum distance between contacting regions
        params.maxdist = 1500000  # maximum distance between contacting regions
        params.sample_size = 30000  # how many contacts to write to file
        params.conttype = conttype
        params.max_cpus = 11
        params.keep_only_orient = False  # set True to use only CTCF sites with orientation
        params.use_only_contacts_with_CTCF = "all_cont"  # alternative: "cont_with_CTCF"; used for training to change the proportion
        # of contacts with nearest CTCF sites
        write_all_chrms_in_file = True  # set True when training on a few chromosomes; needed for writing different chromosomes into the same file

        fill_empty_contacts = False
        logging.getLogger(__name__).debug("Using input folder " + input_folder)

        #Read contacts data
Пример #11
0
conttype = "contacts.gz"  # contact data type: "contacts.gz" or "oe.gz"

# Example values for a manual run:
# chr_num="12,13,14"
# conttype = "contacts.gz"

# Configure the root logger once, at import time. logging.basicConfig() is a
# no-op when the root logger already has handlers, so repeating the same call
# inside the loop below would be dead code.
logging.basicConfig(format='%(asctime)s %(name)s: %(message)s',
                    datefmt='%I:%M:%S',
                    level=logging.DEBUG)
if __name__ == '__main__':  # Required for parallelization, at least on Windows
    # Single-element loop: the body runs once, for the conttype set above.
    for conttype in [conttype]:
        # path1 is defined outside this fragment.
        input_folder = path1 + "/input/chr19_mm10/"
        output_folder = path1 + "/output/chr19_mm10/"
        cell_type = "NPC"
        params = Parameters()
        params.window_size = 25000  # region around contact to be binned for predictors
        params.mindist = 50001  # minimum distance between contacting regions
        params.maxdist = 1500000  # maximum distance between contacting regions
        params.sample_size = 1  # how many contacts to write to file
        params.conttype = conttype
        params.max_cpus = 11
        params.keep_only_orient = False  # set True to use only CTCF sites with orientation
        # Used for training to change the proportion of contacts with nearest
        # CTCF sites; alternative value: "cont_with_CTCF".
        params.use_only_contacts_with_CTCF = "all_cont"
        # Set True when training on a few chromosomes: needed for writing
        # different chromosomes into the same file.
        write_all_chrms_in_file = False

        fill_empty_contacts = False
        # Lazy %-formatting produces the same message text as concatenation.
        logging.getLogger(__name__).debug("Using input folder %s", input_folder)

        # Read contacts data
Пример #12
0
import pandas as pd
import os

if __name__ == '__main__':  # Required for parallelization, at least on Windows
    # Run the body once per contact data type.
    for conttype in ["contacts.gz", "oe.gz"]:
        # NOTE: basicConfig() only configures the root logger when it has no
        # handlers yet, so the call on the second iteration has no effect.
        logging.basicConfig(format='%(asctime)s %(name)s: %(message)s',
                            datefmt='%I:%M:%S',
                            level=logging.DEBUG)

        input_folder = "input/GM12878/"
        # Alternative output location: "D:/Users/Polina/3Dpredictor/"
        output_folder = "out/GM12878/"
        # Alternative input location: "input"

        params = Parameters()
        params.window_size = 25000  # region around contact to be binned for predictors
        # params.small_window_size = 12500  # region around contact anchors to be considered as cis
        params.mindist = 50001  # minimum distance between contacting regions
        # params.maxdist = params.window_size  # max distance between contacting regions
        params.maxdist = 1500000
        # params.binsize = 20000  # when binning regions with predictors, use this binsize
        params.sample_size = 250000  # how many contacts to write to file
        # params.conttype = "oe.gz"
        params.conttype = conttype
        params.max_cpus = 12

        logging.getLogger(__name__).debug("Using input folder " + input_folder)

        # Read contacts data
        params.contacts_reader = ContactsReader()