def main():
    """Command-line wrapper that runs HiC-Spector on two contact maps.

    Reads the node list from ``--node_file`` (blacklist argument fixed to
    ``'NA'``), loads ``--m1`` and ``--m2`` as CSR matrices, adds each matrix
    to its transpose, and passes the two sums plus ``--num_evec`` to
    ``get_reproducibility``.

    ``sys.stdout`` points at the file named by ``--out`` only while
    ``get_reproducibility`` runs; afterwards the previous stream is restored
    and the file is closed, even if the call raises.
    """
    parser = argparse.ArgumentParser(description='Wrapper for HiC-Spector')
    parser.add_argument('--m1')
    parser.add_argument('--m2')
    parser.add_argument('--node_file')
    parser.add_argument('--num_evec', type=int, default=20)
    parser.add_argument('--out')
    args = parser.parse_args()

    def symmetrize(upper):
        # Sum of the matrix and its transpose; the transpose's diagonal is
        # cleared with setdiag(0) before the addition.
        # NOTE(review): depending on the scipy version, transpose() may share
        # its data buffer with `upper` -- confirm setdiag(0) cannot also
        # clear the diagonal of `upper`.
        lower = upper.transpose()
        lower.setdiag(0)
        return upper + lower

    nodes, nodes_idx, blacklist_nodes = processing.read_nodes_from_bed(
        args.node_file, 'NA')
    m1 = symmetrize(processing.construct_csr_matrix_from_data_and_nodes(
        args.m1, nodes, blacklist_nodes, False))
    m2 = symmetrize(processing.construct_csr_matrix_from_data_and_nodes(
        args.m2, nodes, blacklist_nodes, False))

    # Redirect stdout for the duration of the call only: the handle is
    # closed by the `with`, and the previous stream is put back first so
    # nothing ever writes to a closed file.
    previous_stdout = sys.stdout
    with open(args.out, 'w') as out_handle:
        sys.stdout = out_handle
        try:
            get_reproducibility(m1, m2, args.num_evec)
        finally:
            sys.stdout = previous_stdout
Exemplo n.º 2
0
def main():
    """Write random-walk powers of a normalized contact map to disk.

    Loads the node list and the contact map ``--m``, normalizes it with
    ``data_operations.process_matrix`` (``--norm``), adds it to its
    transpose, and optionally converts the sum with ``to_transition``
    (``--transition``). For every step ``t`` from 1 to ``--tmax`` the matrix
    is multiplied by itself one more time; the result for each
    ``t >= --tmin`` is written to ``<outdir>/<outpref>.rw_t<t>.gz``.

    ``--datatype``, ``--matrix_format``, ``--mname`` and ``--method`` are
    parsed but not used by this function.

    Raises:
        OSError: if ``--outdir`` does not exist and cannot be created.
    """
    parser = argparse.ArgumentParser(
        description='Compute RW transformation of 3D data')
    parser.add_argument('--datatype', default='hic')
    parser.add_argument('--m', type=str)
    parser.add_argument('--matrix_format',
                        type=str,
                        default='n1n2val',
                        help='c1n1c2n2val')
    parser.add_argument('--node_file', type=str)
    parser.add_argument('--remove_diagonal', action='store_true')
    parser.add_argument('--mname', type=str)
    parser.add_argument('--outdir', type=str, default='OUT')
    parser.add_argument('--outpref', type=str, default='outpref')
    parser.add_argument('--norm', type=str, default='uniform')
    parser.add_argument('--method', type=str, default='RandomWalks')
    parser.add_argument('--tmin', type=int, default=1)
    parser.add_argument('--tmax', type=int, default=3)
    parser.add_argument('--transition', action='store_true')
    parser.add_argument('--blacklist', default='NA')
    args = parser.parse_args()

    # Create the output directory without going through a shell, so paths
    # containing spaces or shell metacharacters are handled literally and a
    # failure is reported instead of ignored.
    if not os.path.isdir(args.outdir):
        os.makedirs(args.outdir)

    nodes, nodes_idx, blacklist_nodes = processing.read_nodes_from_bed(
        args.node_file, args.blacklist)

    m = processing.construct_csr_matrix_from_data_and_nodes(
        args.m, nodes, blacklist_nodes, args.remove_diagonal)

    m_norm = data_operations.process_matrix(m, args.norm)

    # Matrix plus its transpose; the transpose's diagonal is cleared first.
    # NOTE(review): depending on the scipy version, transpose() may share its
    # data buffer with m_norm -- confirm setdiag(0) cannot also clear the
    # diagonal of m_norm.
    mup = m_norm
    mdown = mup.transpose()
    mdown.setdiag(0)
    m_full = mup + mdown

    if args.transition:
        m_full = to_transition(m_full)

    outname = args.outdir + '/' + args.outpref
    # Always start the walk at step 1 so `rw` is defined when --tmin > 1;
    # steps below --tmin are computed (they are needed for later powers) but
    # not written.
    for t in range(1, args.tmax + 1):
        if t == 1:
            rw = copy.deepcopy(m_full)
        else:
            rw = rw.dot(m_full)
        if t < args.tmin:
            continue
        processing.write_matrix_from_csr_and_nodes(
            rw, nodes_idx, outname + '.rw_t' + str(t) + '.gz')
def main():
    """Run the GenomeDISCO reproducibility analysis for two contact maps.

    Steps, in order: parse the command line, load the node list and both
    contact maps, optionally subsample the maps to a common depth
    (``--m_subsample``: ``lowest``, ``NA`` to skip, or a path to a third
    map), normalize them, optionally run the distance-dependence analysis
    and plot it, score the pair with ``DiscoRandomWalks`` and write three
    tab-separated files named
    ``<outdir>/<outpref>.<m1name>.vs.<m2name><suffix>``:

    * ``.scores.txt`` -- the overall score
    * ``.scoresByStep.txt`` -- one score per step ``t`` in ``1..tmax``
      (``NA`` for steps below ``--tmin``)
    * ``.datastats.txt`` -- raw and subsampled depths plus the
      distance-dependence difference (``NA`` with ``--concise_analysis``)

    Exits through ``parser.error`` when ``--method`` is not ``RandomWalks``
    or, unless ``--concise_analysis`` is given, when ``--datatype`` is
    neither ``hic`` nor ``capturec``.

    Raises:
        OSError: if ``--outdir`` does not exist and cannot be created.
    """
    parser = argparse.ArgumentParser(
        description='Compute reproducibility of 3D genome data')
    parser.add_argument('--datatype', default='hic')
    parser.add_argument(
        '--m1',
        type=str,
        default=
        '/srv/gsfs0/projects/kundaje/users/oursu/3d/LA/merged_nodups/processed_data/HIC014.res40000.byChr.chr21.gz'
    )
    parser.add_argument(
        '--m2',
        type=str,
        default=
        '/srv/gsfs0/projects/kundaje/users/oursu/3d/LA/merged_nodups/processed_data/HIC001.res40000.byChr.chr21.gz'
    )
    parser.add_argument('--matrix_format',
                        type=str,
                        default='n1n2val',
                        help='c1n1c2n2val')
    parser.add_argument(
        '--node_file',
        type=str,
        default=
        '/srv/gsfs0/projects/kundaje/users/oursu/3d/LA/merged_nodups/nodes/Nodes.w40000.chr21.gz'
    )
    parser.add_argument('--remove_diagonal', action='store_true')
    parser.add_argument('--m1name', type=str, default='HIC014')
    parser.add_argument('--m2name', type=str, default='HIC001')
    parser.add_argument('--outdir', type=str, default='OUT')
    parser.add_argument('--outpref', type=str, default='outpref')
    parser.add_argument('--m_subsample', type=str, default='lowest')
    parser.add_argument(
        '--concise_analysis',
        action='store_true',
        help=
        'Add this flag to only output the reproducibility score, and not perform the distance dependence analyses.'
    )
    parser.add_argument('--norm', type=str, default='uniform')
    parser.add_argument('--method', type=str, default='RandomWalks')
    parser.add_argument('--tmin', type=int, default=1)
    parser.add_argument('--tmax', type=int, default=3)
    parser.add_argument('--approximation', type=int, default=40000)
    parser.add_argument('--transition', action='store_true')
    parser.add_argument('--blacklist', default='NA')
    args = parser.parse_args()

    # Reject unsupported choices before any data is loaded; previously these
    # only surfaced as a NameError after the expensive steps had run.
    if args.method != 'RandomWalks':
        parser.error("unsupported --method '%s' (only 'RandomWalks' is "
                     "implemented)" % args.method)
    if not args.concise_analysis and args.datatype not in ('hic', 'capturec'):
        parser.error("unsupported --datatype '%s' (expected 'hic' or "
                     "'capturec')" % args.datatype)

    def log(text):
        # Progress line: "GenomeDISCO | <local timestamp> | <text>".
        print("GenomeDISCO | " + strftime("%c") + " | " + text)

    # Create the output directory without a shell so unusual characters in
    # the path are taken literally and failures are not silently ignored.
    if not os.path.isdir(args.outdir):
        os.makedirs(args.outdir)
    out_prefix = (args.outdir + '/' + args.outpref + '.' + args.m1name +
                  '.vs.' + args.m2name)

    log("Starting reproducibility analysis")
    nodes, nodes_idx, blacklist_nodes = processing.read_nodes_from_bed(
        args.node_file, args.blacklist)

    log("Loading contact maps")
    m1 = processing.construct_csr_matrix_from_data_and_nodes(
        args.m1, nodes, blacklist_nodes, args.remove_diagonal)
    m2 = processing.construct_csr_matrix_from_data_and_nodes(
        args.m2, nodes, blacklist_nodes, args.remove_diagonal)

    # Depths are kept in plain locals rather than a dict keyed by sample
    # name, so --m1name == --m2name cannot make one entry overwrite the other.
    depth_m1 = m1.sum()
    depth_m2 = m2.sum()

    desired_depth = None
    if args.m_subsample != 'NA':
        if args.m_subsample == 'lowest':
            # Target the shallower of the two maps (m2 on a tie).
            desired_depth = depth_m2 if depth_m1 >= depth_m2 else depth_m1
        else:
            # Any other value is treated as the path of a reference map
            # whose total count is the target depth.
            desired_depth = (
                processing.construct_csr_matrix_from_data_and_nodes(
                    args.m_subsample, nodes, blacklist_nodes,
                    args.remove_diagonal).sum())
        log("Subsampling to the depth of " + args.m_subsample)
        log("Subsampling depth = " + str(desired_depth))

    # A map is only subsampled when it is deeper than the target; otherwise
    # an independent copy is used so later steps never touch m1/m2.
    if desired_depth is not None and depth_m1 > desired_depth:
        m1_subsample = data_operations.subsample_to_depth(m1, desired_depth)
    else:
        m1_subsample = copy.deepcopy(m1)
    if desired_depth is not None and depth_m2 > desired_depth:
        m2_subsample = data_operations.subsample_to_depth(m2, desired_depth)
    else:
        m2_subsample = copy.deepcopy(m2)

    # Recorded before normalization, matching the order of the analysis.
    subsampled_depth_m1 = m1_subsample.sum()
    subsampled_depth_m2 = m2_subsample.sum()

    log('Normalizing with ' + args.norm)
    m1_norm = data_operations.process_matrix(m1_subsample, args.norm)
    m2_norm = data_operations.process_matrix(m2_subsample, args.norm)

    dd_value = 'NA'
    if not args.concise_analysis:
        log("Distance dependence analysis")
        if args.datatype == 'hic':
            m1dd = data_operations.get_distance_dep(m1_subsample)
            m2dd = data_operations.get_distance_dep(m2_subsample)
        else:
            # 'capturec' -- the only other value accepted by the check above.
            m1dd = data_operations.get_distance_dep_using_nodes_capturec(
                m1_subsample, nodes, nodes_idx, args.approximation)
            m2dd = data_operations.get_distance_dep_using_nodes_capturec(
                m2_subsample, nodes, nodes_idx, args.approximation)
        dd_diff = get_dd_diff(m1dd, m2dd)
        visualization.plot_dds([m1dd, m2dd], [args.m1name, args.m2name],
                               out_prefix + '.distDep', args.approximation)
        dd_value = '{:.10f}'.format(dd_diff)

    log("Computing reproducibility score")
    comparer = DiscoRandomWalks(args)
    reproducibility_text, score, scores = comparer.compute_reproducibility(
        m1_norm, m2_norm, args)

    with open(out_prefix + '.scores.txt', 'w') as out:
        out.write(args.m1name + '\t' + args.m2name + '\t' +
                  '{:.3f}'.format(score) + '\n')

    # One column per step 1..tmax; `scores` holds values only for steps
    # >= tmin, so earlier steps are reported as NA.
    t_strings = []
    score_strings = []
    t_counter = 0
    for t in range(1, args.tmax + 1):
        if t >= args.tmin:
            score_strings.append('{:.3f}'.format(scores[t_counter]))
            t_counter += 1
        else:
            score_strings.append('NA')
        t_strings.append(str(t))
    with open(out_prefix + '.scoresByStep.txt', 'w') as out:
        out.write('#m1' + '\t' + 'm2' + '\t' + '\t'.join(t_strings) + '\n')
        out.write(args.m1name + '\t' + args.m2name + '\t' +
                  '\t'.join(score_strings) + '\n')

    with open(out_prefix + '.datastats.txt', 'w') as out:
        out.write('\t'.join([
            '#m1name', 'm2name', 'SeqDepth.m1', 'SeqDepth.m2',
            'SubsampledSeqDepth.m1', 'SubsampledSeqDepth.m2', 'DistDepDiff',
        ]) + '\n')
        out.write('\t'.join([
            args.m1name, args.m2name,
            str(depth_m1), str(depth_m2),
            str(subsampled_depth_m1), str(subsampled_depth_m2),
            dd_value,
        ]) + '\n')

    print("GenomeDISCO | Differences by random walk step: " +
          '\t'.join(score_strings))
    log("DONE")
Exemplo n.º 4
0
def read_in_data(mname_full, nodes):
    """Load a contact map and return it as a dense array added to its transpose.

    The map is built by
    ``processing.construct_csr_matrix_from_data_and_nodes(mname_full, nodes,
    True)``, converted to a dense array with ``toarray()``, and then summed
    with its own transpose.
    """
    # NOTE(review): what the positional True selects is defined by the
    # helper and not visible here -- confirm against its signature.
    sparse_contacts = processing.construct_csr_matrix_from_data_and_nodes(
        mname_full, nodes, True)
    dense = sparse_contacts.toarray()
    # Any non-zero diagonal entry is counted twice by this sum.
    return dense + dense.T