Пример #1
0
def main(argv):
    """Print the ``samples`` object of every site that is not exon number 1.

    The input file, named by ``sys.argv[1]``, holds one JSON object per line.
    For each object whose ``exon_number`` differs from 1 the function writes
    ``json.dumps(site['samples'])`` to stdout.  ``progress.update()`` is
    called once per input line.

    ``argv`` is accepted for interface compatibility only; the arguments are
    read from ``sys.argv``.  Exits with status 2 when the argument count is
    wrong.
    """
    # Parse args.  Compare by value: ``is not`` against an int literal only
    # works by accident of CPython's small-integer cache.
    if len(sys.argv) != 2:
        sys.stderr.write("invalid usage: python " + sys.argv[0] +
                         " <level1_tss_rna_chip.json>\n")
        sys.exit(2)

    # Set args
    data_fn = sys.argv[1]
    progress = FileProgress(data_fn, "Percent Complete: ")

    # Main loop
    with open(data_fn) as json_file:
        for line in json_file:
            site = json.loads(line)

            # Filter for criteria: keep every site that is not a first exon.
            if site['exon_number'] != 1:
                # This is the main printing section
                print(json.dumps(site['samples']))
            progress.update()

    sys.stderr.write("\nAll done!\n")
Пример #2
0
def main(argv):
    """Attach per-sample gene RPKM values to every site of a JSON file.

    ``sys.argv[2]`` names a tab-separated table: the first row is a header
    whose columns after the first are sample names, and every following row
    is ``gene_id`` followed by one value per sample.  ``sys.argv[1]`` names a
    file with one JSON site object per line.  For every site the value of its
    gene is stored under ``site['samples'][<sample>]['gene_rpkm']`` for each
    header sample (creating the sample entry when absent), and the site is
    printed to stdout as one JSON line.

    ``argv`` is accepted for interface compatibility only; the arguments are
    read from ``sys.argv``.  Exits with status 2 when the argument count is
    wrong.  A site whose ``gene_id`` is missing from the table raises
    ``KeyError``.
    """
    # parse args (compare by value; ``is not`` on an int literal is an
    # identity test that only works through CPython's small-int cache)
    if len(sys.argv) != 3:
        sys.stderr.write("invalid usage: python " + sys.argv[0] +
                         " <all_tss_rna.json> <57epigenomes.RPKM.all> \n")
        sys.exit(2)

    json_fn = sys.argv[1]
    rna_fn = sys.argv[2]
    progress1 = FileProgress(rna_fn, "Part 1/2: ")
    progress2 = FileProgress(json_fn, "Part 2/2: ")

    # Read gene RPKM into memory
    gene_dict = {}
    header = []
    with open(rna_fn) as csv_file:
        for line in csv_file:
            row = line.strip('\n').split("\t")
            # presumably ``count`` is 0 until the first update(), which makes
            # this branch the header row -- verify against FileProgress
            if progress1.count == 0:
                header = row[1:]
            else:
                gene_dict[row[0]] = row[1:]
            progress1.update()
    sys.stderr.write("\nFirst part done.\n")

    # Now read through json file and append
    with open(json_fn) as json_file:
        for line in json_file:
            site = json.loads(line, object_pairs_hook=collections.OrderedDict)
            # One lookup per site instead of one per sample column.
            gene_rpkm = gene_dict[site['gene_id']]
            site_samples = site['samples']
            for column, sample_name in enumerate(header):
                site_samples.setdefault(
                    sample_name, {})['gene_rpkm'] = gene_rpkm[column]
            print(json.dumps(site))
            progress2.update()

    sys.stderr.write("\nAll done!\n")
Пример #3
0
# NOTE(review): `results` and `file` are defined outside this excerpt.
# One empty dict is created per field name.  The loop below uses
# results[field] as a "value -> number of occurrences" tally; that usage is
# visible here only for 'transcript_status' and 'level' -- presumably the
# remaining fields are tallied the same way in code not shown; confirm.
results['transcript_status'] = {}
results['level'] = {}
results['transcript_type'] = {}
results['source'] = {}
results['exon_number'] = {}
results['exon_total'] = {}
results['splice_count'] = {}
results['splice_before'] = {}
results['coverage_count'] = {}
results['tss_mapped'] = {}
results['tss_total'] = {}
results['transcript_total'] = {}

gene_dict = {}

progress = FileProgress(file, "Percent: ")

counter = 0
# The input holds one JSON object per line; each object carries a
# 'transcripts' list whose items have an 'attribute' mapping.
with open(file, 'rb') as json_file:
    for line in json_file:
        site = json.loads(line)
        for transcript in site['transcripts']:
            # Count occurrences of each transcript_status value.
            if transcript['attribute']['transcript_status'] in results[
                    'transcript_status']:
                results['transcript_status'][transcript['attribute']
                                             ['transcript_status']] += 1
            else:
                results['transcript_status'][transcript['attribute']
                                             ['transcript_status']] = 1
            # NOTE(review): no else branch is visible for 'level', so a level
            # that is not already a key is never inserted -- the excerpt looks
            # truncated here; confirm against the full file.
            if transcript['attribute']['level'] in results['level']:
                results['level'][transcript['attribute']['level']] += 1
Пример #4
0
def main(argv):
    """Write one JSON record per exon that carries a transcript start site.

    Command line (read from ``sys.argv``; ``argv`` itself is not used):
    ``<all_tss.json> <57epigenomes.exon.RPKM.all> <chromosome_order.json>
    <granularity>``.

    Steps:
      1. Load the chromosome-order mapping and the gene records (one JSON
         object per line, indexed by ``gene_id``).
      2. Read the tab-separated exon RPKM table, keeping only exons whose
         gene is present in the gene file.
      3. For every gene: number its exons (ascending start on '+',
         descending on '-'), find the exon holding the highest single RPKM
         value, and for each exon count overlapping introns / exons and
         collect the transcripts whose ``tss`` lies within ``granularity``
         of the exon.
      4. Print, one JSON line each, the exons that received at least one
         transcript, with per-sample ``rpkm``, ``max_rpkm`` and
         ``delta_rpkm`` (difference to the neighbouring accepted exon that
         precedes it in transcription order; the first one keeps its own
         ``rpkm``).

    Exits with status 2 when the argument count is wrong.
    """
    # parse args (value comparison; ``is not 5`` is an identity test)
    if len(sys.argv) != 5:
        sys.stderr.write("invalid usage: python " + sys.argv[0] +
                " <all_tss.json> <57epigenomes.exon.RPKM.all> <chromosome_order.json> <granularity>\n")
        sys.exit(2)

    tss_fn = sys.argv[1]
    rna_fn = sys.argv[2]
    chromosomes_fn = sys.argv[3]
    granularity = int(sys.argv[4])
    progress1 = FileProgress(rna_fn, "Part 1/2: ")

    # load expected chromosome order from json into a dictionary
    with open(chromosomes_fn) as chromosomes_file:
        chromosomes = json.load(chromosomes_file)

    # Load JSON GTF file into memory
    gene_dict = {}
    with open(tss_fn, 'rb') as json_file:
        for line in json_file:
            gene = json.loads(line)
            gene_dict[gene['gene_id']] = gene
    sys.stderr.write("Loaded " + str(len(gene_dict)) + " genes into memory.\n")

    # Read RNA-seq data into memory
    # The purpose of this entire section is calculate leading and cassette exons
    gene_rna_dict = {}
    sample_names = {}   # sample name -> column index within row[2:]
    with open(rna_fn) as rna_f:
        for line in rna_f:
            # presumably ``count`` is 0 only before the first update(), i.e.
            # on the header row -- verify against FileProgress
            if progress1.count == 0:
                row = line.strip('\n').split('\t')
                for column, name in enumerate(row[2:]):
                    sample_names[name] = column
            else:
                # There's some weird formatting in the RPKM file: data rows
                # are stripped of trailing tabs as well as the newline.
                row = line.strip('\t\n').split('\t')
                gene = row[1]
                if gene in gene_dict:
                    # row[0] is parsed as "<seqname>:<start>-<end><<strand>"
                    locus = row[0]
                    start = int(locus.split(':')[1].split('-')[0])
                    end = int(locus.split('-')[1].split('<')[0])
                    strand = int(locus.split('<')[1])
                    gene_rna_dict.setdefault(gene, []).append({
                        'gene': gene,
                        'seqname': locus.split(':')[0],
                        'start': start,
                        'end': end,
                        'strand': ('+' if strand == 1 else '-'),
                        'samples': row[2:],
                        'tss': (start if strand == 1 else end),
                    })
                    assert len(sample_names) == len(row[2:])
            progress1.update()

    # Main loop of genes
    progress2 = FileProgress(None, "Part 2/2: ", len(gene_rna_dict))
    sys.stderr.write("\nLoaded " + str(len(gene_rna_dict)) + " mRNA exons into memory.\n")
    ordered_genes = sorted(
        gene_rna_dict.values(),
        key=lambda k: (chromosomes[k[0]['seqname']], k[0]['tss']))
    for genes in ordered_genes:
        gene = genes[0]['gene']
        if gene in gene_dict:

            # Iterate through the exons and calculate the exon number
            genes.sort(key=lambda x: x['start'])
            if genes[0]['strand'] == '+':
                numbering_order = genes
            else:
                numbering_order = reversed(genes)
            for number, numbered_exon in enumerate(numbering_order, 1):
                numbered_exon['exon_number'] = number

            # Find the exon that holds the largest single sample value; its
            # whole sample row is reported as ``max_rpkm`` below.  ``>=``
            # means that ties go to the later exon.
            max_exon = None
            max_rpkm = 0
            for exon in genes:
                for sample_rpkm in exon['samples']:
                    if float(sample_rpkm) >= max_rpkm:
                        max_exon = exon['samples']
                        max_rpkm = float(sample_rpkm)
            assert max_exon is not None

            # Iterate through exons within this gene
            printlist = []
            transcripts = gene_dict[gene]['transcripts']
            for exon in genes:
                samples = exon['samples']

                # Assign all transcripts that map to this exon
                exon_transcripts = []
                splice_count = 0
                splice_before = 0
                coverage_count = 0
                for transcript in transcripts.values():
                    if (transcript['tss'] > exon['start'] - granularity
                            and transcript['tss'] < exon['end'] + granularity):
                        exon_transcripts.append(transcript)
                    for intron in transcript['introns']:
                        # An intron that overlaps the exon counts as a splice.
                        if not (exon['start'] > intron[1]
                                or exon['end'] < intron[0]):
                            splice_count += 1
                        if exon['strand'] == '+':
                            if intron[1] < exon['start']:
                                splice_before += 1
                        else:
                            if intron[0] > exon['end']:
                                splice_before += 1
                    if exon['strand'] == '+':
                        if transcript['end'] < exon['start']:
                            splice_before += 1
                    else:
                        # NOTE(review): this needs the transcript to end
                        # before the exon AND start after it, which cannot
                        # hold when start <= end -- possibly only the second
                        # test was intended.  Logic left unchanged; confirm.
                        if transcript['end'] < exon['start']:
                            if transcript['start'] > exon['end']:
                                splice_before += 1
                    for transcript_exon in transcript['exons']:
                        # Overlapping annotated exons count as coverage.
                        if not (exon['start'] > transcript_exon[1]
                                or exon['end'] < transcript_exon[0]):
                            coverage_count += 1

                # If a transcript mapped to one of the exons
                if len(exon_transcripts) > 0:

                    # Save this transcript
                    d = collections.OrderedDict()
                    d['seqname'] = exon['seqname']
                    d['location'] = exon['tss']
                    d['strand'] = (1 if exon['strand'] == '+' else -1)
                    d['gene_id'] = gene
                    d['exon_number'] = exon['exon_number']
                    d['exon_total'] = len(genes)
                    d['splice_count'] = splice_count
                    d['splice_before'] = splice_before
                    d['coverage_count'] = coverage_count
                    d['tss_mapped'] = len(exon_transcripts)
                    # Placeholder that fixes the key position; the real value
                    # is filled in once the accepted list is complete.
                    d['tss_total'] = 0
                    d['transcript_total'] = len(transcripts)
                    d['transcripts'] = copy.deepcopy(exon_transcripts)
                    d['samples'] = {}
                    for sample_name, column in sample_names.items():
                        entry = d['samples'].setdefault(sample_name, {})
                        entry['rpkm'] = float(samples[column])
                        entry['max_rpkm'] = float(max_exon[column])
                    printlist.append(d)

            # Iterate through the exons again (this time of the accepted list)
            # We calculate the delta_rpkm to the previous transcript start site
            tss_total = len(printlist)
            for k, d in enumerate(printlist):
                d['tss_total'] = tss_total

                # ``printlist`` is in ascending genomic order, so the
                # preceding start site is k-1 on '+' and k+1 on '-'.
                # d['strand'] holds 1 / -1 (set above); the original compared
                # it with '+', so the '+' branch could never run.  The index
                # is range-checked explicitly because printlist[-1] would
                # silently wrap around instead of raising IndexError.
                neighbour = k - 1 if d['strand'] == 1 else k + 1
                if 0 <= neighbour < tss_total:
                    previous_samples = printlist[neighbour]['samples']
                else:
                    previous_samples = None

                for sample_name in sample_names:
                    entry = d['samples'][sample_name]
                    if previous_samples is None:
                        entry['delta_rpkm'] = entry['rpkm']
                    else:
                        entry['delta_rpkm'] = (
                            entry['rpkm']
                            - previous_samples[sample_name]['rpkm'])

                # Delete redundant information to reduce size of the file
                for this_transcript in d['transcripts']:
                    for key in ("exons", "introns", "length", "score",
                                "frame", "feature", "strand", "seqname",
                                "start", "end"):
                        this_transcript.pop(key)
                print(json.dumps(d))
        progress2.update()

    sys.stderr.write("\nAll done!\n")
Пример #5
0
__author__ = 'jeffrey'

import json, sys, collections
from src.utils import FileProgress

# Validate the command line.  Compare by value: ``is not`` against an int
# literal is an identity test that only works via CPython's small-int cache.
if len(sys.argv) != 3:
    sys.stderr.write("invalid usage: python " + sys.argv[0] + " <level1_tss_rna_chip.json> <level1_tss_rna.json>\n")
    sys.exit(2)

data = sys.argv[1]
meta = sys.argv[2]


progress1 = FileProgress(meta, "Percent: ")
progress2 = FileProgress(data, "Percent: ")

# Load metadata into memory. It's going to be big.
# chrom maps seqname -> {tss -> site}, where ``site`` is the record with the
# keys seqname, tss, strand, gene_id and transcripts removed.
chrom = {}
with open(meta, 'rb') as json_file:
    for line in json_file:
        site = json.loads(line)
        seqname = site.pop('seqname')
        tss = site.pop('tss')
        # These keys must exist (pop without default raises KeyError).
        for dropped_key in ('strand', 'gene_id', 'transcripts'):
            site.pop(dropped_key)
        chrom.setdefault(seqname, {})[tss] = site
        progress1.update()
Пример #6
0
def main(argv):
    """Train random-forest classifiers on histone-mark sums and report scores.

    Command line (read from ``sys.argv``; ``argv`` itself is not used):
    ``<all_level1.json> <left> <right> [cutoff]``.

    Every line of the input file is a JSON object whose values are sample
    dicts.  A sample is kept only if it contains all marks in ``keys``; each
    mark is replaced by ``listsum`` over the slice ``[left:right]`` of its
    values.  The label is ``delta_rpkm`` clamped at 0, binarised as 1 when it
    is below the cutoff (the given integer, or the median label when no
    cutoff is supplied).  For several ``n_estimators`` / ``max_depth``
    settings a ``RandomForestClassifier`` is fitted and its feature
    importances and mean cross-validation score are printed.

    Exits with status 2 when the argument count is wrong.
    """
    if len(sys.argv) not in (4, 5):
        sys.stderr.write("invalid usage: python " + sys.argv[0] +
                         " <all_level1.json> <left> <right> [cutoff]\n")
        sys.exit(2)

    keys = [
        #core marks
        "H3K4me1",
        "H3K4me3",
        "H3K27me3",
        "H3K36me3",
        "H3K9me3",
        "H2A.Z",
        "H3K4me2",
        "H3K27ac",
        "H4K20me1",
        "H3K9ac",
        "DNase",
        "H3K79me2"
    ]
    ranges = (int(sys.argv[2]), int(sys.argv[3]))

    # Load file
    datapoint_list = []
    data_fn = sys.argv[1]
    progress = FileProgress(data_fn, "Reading file: ")
    with open(data_fn) as json_file:
        for line in json_file:
            tss_dict = json.loads(line)
            for sample in tss_dict.values():
                # Samples lacking any of the marks are discarded.
                if any(mark not in sample for mark in keys):
                    continue

                # Compute the feature vector
                for mark in keys:
                    sample[mark] = listsum(sample[mark][ranges[0]:ranges[1]])

                # Compute the label
                sample['gene_rpkm'] = float(sample['gene_rpkm'])
                if sample['delta_rpkm'] < 0:
                    sample['delta_rpkm'] = 0
                sample['label'] = sample['delta_rpkm']

                datapoint_list.append(sample)
            progress.update()
    sys.stderr.write("\nFinished reading file\n")
    print("Label Method: rpkm")

    # For regression, create vector
    X = [[datapoint[mark] for mark in keys] for datapoint in datapoint_list]
    Y_R = [datapoint['label'] for datapoint in datapoint_list]

    # Classify feature labels into binary space
    if len(sys.argv) == 5:
        label_cutoff = int(sys.argv[4])
    else:
        label_cutoff = np.median(Y_R)
    print("marks: " + ", ".join(keys))
    print("window: " + str(ranges[0]) + ", " + str(ranges[1]))
    print("label cutoff: " + str(label_cutoff))
    Y_C = [int(float(datapoint['label']) < label_cutoff)
           for datapoint in datapoint_list]

    print("mean label: " + str(np.mean(Y_R)))
    print("median label: " + str(np.median(Y_R)))

    print("number of datapoints: " + str(len(Y_C)))

    # Permutate the datapoints (features and class labels with the same
    # random order; the regression labels are not used past this point)
    perms = np.random.permutation(len(X))
    X = [X[position] for position in perms]
    Y_C = [Y_C[position] for position in perms]

    # Regression!
    print("len Y_C: " + str(len(Y_C)))
    print("len X: " + str(len(X)))
    print("Starting Random Forests:")

    for n_estimators in [100, 150]:
        for depth in [4, 6]:
            clf = RandomForestClassifier(n_estimators=n_estimators,
                                         max_depth=depth,
                                         min_samples_split=10,
                                         random_state=0)

            clf.fit(X, Y_C)
            print("n_estimators, depth: " + str(n_estimators) + ", " + str(
                depth))
            feature_importances = clf.feature_importances_

            print("feature_importances: ")
            for mark, importance in zip(keys, feature_importances):
                print("\t" + mark + ":\t" + str(importance))

            scores = cross_val_score(clf, X, Y_C)
            print("RandomForest mean cross validation score: " + str(
                scores.mean()))

    print("#" * 75 + "\n")
def main(argv):
    """Print a JSON record for every exon that hosts a transcript start site.

    Command line (read from ``sys.argv``; ``argv`` itself is not used):
    ``<all_tss.json> <57epigenomes.exon.RPKM.all> <chromosome_order.json>
    <granularity>``.

    The exon RPKM table is sorted by gene through ``unix_sort``; each exon of
    a gene present in the gene file is tagged ``leading`` (first exon of a
    '+' gene, last exon of a '-' gene) or ``cassette``.  The tagged rows are
    then sorted by chromosome order and start site, and every exon with at
    least one transcript whose ``tss`` lies within ``granularity`` of it is
    printed as one JSON line.

    Changes relative to the previous revision: the last exon of the final
    '-' gene is now tagged ``leading`` too, and a stray ``break`` that ended
    the output after the first printed record has been removed.

    Exits with status 2 when the argument count is wrong.
    """
    # parse args (value comparison; ``is not 5`` is an identity test)
    if len(sys.argv) != 5:
        sys.stderr.write(
            "invalid usage: python " + sys.argv[0] +
            " <all_tss.json> <57epigenomes.exon.RPKM.all> <chromosome_order.json> <granularity>\n"
        )
        sys.exit(2)

    tss_fn = sys.argv[1]
    rna_fn = sys.argv[2]
    chromosomes_fn = sys.argv[3]
    granularity = int(sys.argv[4])
    progress1 = FileProgress(rna_fn, "Part 1/2: ")
    progress2 = FileProgress(rna_fn, "Part 2/2: ")

    def parse_locus(locus):
        """Split ``<seqname>:<start>-<end><<strand>`` into its four parts."""
        seqname = locus.split(':')[0]
        start = int(locus.split(':')[1].split('-')[0])
        end = int(locus.split('-')[1].split('<')[0])
        strand = int(locus.split('<')[1])
        return seqname, start, end, strand

    def is_minus_gene(gene_id):
        """Return True when the gene record is on the '-' strand."""
        return gene_dict[gene_id]['strand'] == '-'

    # load expected chromosome order from json into a dictionary
    with open(chromosomes_fn) as chromosomes_file:
        chromosomes = json.load(chromosomes_file)

    # Sort RNA file by gene id, so they are confirmed to be in order
    rna_f = unix_sort(rna_fn, "-k2,2 -k1,1", header=True)

    # Load JSON GTF file into memory
    gene_dict = {}
    with open(tss_fn, 'rb') as json_file:
        for line in json_file:
            gene = json.loads(line)
            gene_dict[gene['gene_id']] = gene
    sys.stderr.write("Loaded " + str(len(gene_dict)) + " genes into memory.\n")

    # Read RNA-seq data into memory
    # The purpose of this entire section is calculate leading and cassette exons
    rna_data = []
    samples = []
    previous_gene = None
    rna_file = csv.reader(rna_f, delimiter='\t')
    for row in rna_file:
        # presumably ``count`` is 0 only on the header row -- verify against
        # FileProgress
        if progress1.count == 0:
            samples = row
        else:
            gene = row[1]
            if gene in gene_dict:
                tss_type = 'cassette'
                if previous_gene != gene:
                    # The previous exon was the last exon of the previous gene
                    if previous_gene is not None and is_minus_gene(previous_gene):
                        rna_data[-1][2] = 'leading'
                    # The current exon is the first exon of this gene
                    if gene_dict[gene]['strand'] == '+':
                        tss_type = 'leading'
                row.insert(2, tss_type)
                rna_data.append(row)
                previous_gene = gene
        progress1.update()

    # The loop above only re-tags a gene's last exon when the next gene
    # starts, so the final gene has to be handled once the input is exhausted.
    if previous_gene is not None and is_minus_gene(previous_gene):
        rna_data[-1][2] = 'leading'

    # Now sort the RNA-seq data that is in memory
    sys.stderr.write("\nBeginning to sort loaded RNA-seq file\n")

    def genomic_order(row):
        """Sort key: chromosome rank, then start ('+') or end ('-')."""
        seqname, start, end, strand = parse_locus(row[0])
        return (chromosomes[seqname], start if strand == 1 else end)

    rna_data.sort(key=genomic_order)
    sys.stderr.write("Finished sorting loaded RNA-seq file\n")

    # Now print the exons that have transcript start sites
    for row in rna_data:
        gene = row[1]
        if gene in gene_dict:
            seqname, start, end, strand = parse_locus(row[0])
            tss = (start if strand == 1 else end)

            # Assign all transcripts that map to this exon
            exon_transcripts = []
            splice_count = 0
            for transcript in gene_dict[gene]['transcripts'].values():
                if (transcript['tss'] > start - granularity
                        and transcript['tss'] < end + granularity):
                    exon_transcripts.append(transcript)
                for intron in transcript['introns']:
                    # An intron overlapping the exon counts as a splice.
                    if not (start > intron[1] or end < intron[0]):
                        splice_count += 1

            # If a transcript mapped to one of the exons
            if len(exon_transcripts) > 0:

                # Print this transcript
                d = collections.OrderedDict()
                d['seqname'] = seqname
                d['tss'] = tss
                d['strand'] = ('+' if strand == 1 else '-')
                d['gene_id'] = gene
                d['tss_type'] = row[2]
                d['splice_count'] = splice_count
                d['transcripts'] = exon_transcripts
                d['samples'] = {}
                # NOTE(review): ``samples`` is the unmodified header row while
                # every data row had the tss_type inserted at index 2, so
                # samples[i] and row[i] come from input columns one apart --
                # confirm this offset is intended.
                for i in range(3, len(samples)):
                    sample_name = samples[i]
                    d['samples'].setdefault(sample_name, {})['rpkm'] = row[i]
                print(json.dumps(d))

        progress2.update()
    sys.stderr.write("\nAll done!\n")
Пример #8
0
def main(argv):
    """Rewrite gene records with per-transcript TSS, length and intron lists.

    ``sys.argv[1]`` names a file with one JSON gene object per line.  For
    every transcript of a gene the exon objects are replaced by
    ``(start, end)`` tuples, ordered by ascending start on the '+' strand and
    by descending end otherwise; ``tss`` is set to the transcript start
    ('+') or end (otherwise); ``length`` is ``end - start``; and ``introns``
    lists the gaps between consecutive exons in that order.  On the '+'
    strand an intron is ``(previous exon end + 1, next exon start - 1)``; on
    the other strand it is ``(previous exon start - 1, next exon end + 1)``,
    i.e. the higher coordinate comes first.  The gene is then printed as one
    JSON line with a fixed key order.

    ``argv`` is accepted for interface compatibility only; the arguments are
    read from ``sys.argv``.  Exits with status 2 when the argument count is
    wrong.
    """
    # Compare by value; ``is not 2`` is an identity test on an int literal.
    if len(sys.argv) != 2:
        sys.stderr.write("invalid usage: python " + sys.argv[0] +
                         " <genes.json>\n")
        sys.exit(2)

    # Initialize Variables and report progress in file
    genes_fn = sys.argv[1]
    progress = FileProgress(genes_fn, "Percent Complete: ")

    # Load GTF file in json format
    with open(genes_fn) as json_file:
        for line in json_file:
            gene = json.loads(line)
            for transcript in gene['transcripts'].values():
                on_plus_strand = transcript['strand'] == "+"

                exon_list = [(exon['start'], exon['end'])
                             for exon in transcript['exons']]
                transcript.pop("exons", None)
                transcript['exons'] = exon_list

                if on_plus_strand:
                    transcript['tss'] = transcript['start']
                    exon_list.sort(key=lambda tup: tup[0])
                else:
                    transcript['tss'] = transcript['end']
                    exon_list.sort(key=lambda tup: tup[1], reverse=True)
                transcript['length'] = int(transcript['end']) - int(
                    transcript['start'])

                # Generate a list of introns from consecutive exon pairs
                exon_pairs = zip(exon_list, exon_list[1:])
                if on_plus_strand:
                    intron_list = [(before[1] + 1, after[0] - 1)
                                   for before, after in exon_pairs]
                else:
                    intron_list = [(before[0] - 1, after[1] + 1)
                                   for before, after in exon_pairs]
                transcript['introns'] = intron_list

            # Reorder the gene dictionary so it is easier to sort in the future
            d = collections.OrderedDict()
            d['gene_id'] = gene['attribute']['gene_id'].split('.')[0]
            d['seqname'] = gene['seqname']
            d['source'] = gene['source']
            d['start'] = gene['start']
            d['end'] = gene['end']
            d['strand'] = gene['strand']
            d['attribute'] = gene['attribute']
            d['transcripts'] = gene['transcripts']
            print(json.dumps(d))
            progress.update()

    # The ``with`` block has already closed the file; no explicit close().
    sys.stderr.write("\nAll Done\n")
Пример #9
0
def main(argv):
    """Merge normalised ChIP signal into the RNA-seq site records.

    Command line (read from ``sys.argv``; ``argv`` itself is not used):
    ``<nosplice_tss_rna.json> <nosplice_tss_chip.tsv>
    <experiment_read_counts.json>``.

    Both inputs are sorted through ``unix_sort``.  All site records are held
    in memory, indexed by ``seqname`` and ``str(location)``.  Each ChIP row
    has five tab-separated columns (seqname, tss, sample, mark, values); the
    values are divided by ``read_counts[sample][mark] / 1000000`` and stored
    under ``site['samples'][sample][mark]``.  A site is printed as one JSON
    line as soon as the ChIP rows move on to a different site, and the last
    site is printed at the end.  Rows with a different column count are
    skipped.

    Changes relative to the previous revision: the first ChIP row is no
    longer discarded, duplicate RNA sites are actually detected (and reported
    on stderr, keeping the first record), and an empty ChIP file no longer
    raises at the final print.

    Exits with status 2 when the argument count is wrong.
    """
    # parse args (value comparison; ``is not 4`` is an identity test)
    if len(sys.argv) != 4:
        sys.stderr.write(
            "invalid usage: python " + sys.argv[0] +
            " <nosplice_tss_rna.json> <nosplice_tss_chip.tsv> <experiment_read_counts.json>\n"
        )
        sys.exit(2)

    rna_fn = sys.argv[1]
    chip_fn = sys.argv[2]
    reads_fn = sys.argv[3]
    progress = FileProgress(chip_fn, "Percent Complete: ")

    # Load read count normalization file
    with open(reads_fn) as experiment_read_counts_file:
        read_counts = json.load(experiment_read_counts_file)

    # Sort both files lexically by chromosome, then by position
    sys.stderr.write('Beginning to sort both files (may take a while).\n')
    rna_f = unix_sort(rna_fn, "-k2,2 -k4,4", header=False, save=True)
    chip_f = unix_sort(chip_fn, "-t $'\t' -k1,2", header=False, save=True)
    sys.stderr.write('Finished sorting.\n')

    # Read RNA-seq data into memory
    sys.stderr.write('Reading the RNA-seq data into memory.\n')
    sites = {}   # seqname -> {str(location) -> site record}
    for line in rna_f:
        site = json.loads(line, object_pairs_hook=collections.OrderedDict)
        seqname = site['seqname']
        # Keys are strings because the ChIP file yields the position as text.
        tss_key = str(site['location'])

        sites_on_seq = sites.setdefault(seqname, {})
        if tss_key in sites_on_seq:
            # Reported on stderr so the JSON stream on stdout stays clean.
            sys.stderr.write("Error! Duplicate site " + seqname + ":" +
                             tss_key + "\n")
        else:
            sites_on_seq[tss_key] = site
    rna_f.close()

    # Begin looping through chip file
    sys.stderr.write('Beginning to read the ChIP data.\n')
    previous_seqname = None
    previous_tss = None
    for line in chip_f:
        chip_row = line.strip("\n").split("\t")
        if len(chip_row) != 5:
            continue
        seqname, tss, sample, mark = chip_row[:4]
        # NOTE(review): eval() executes whatever expression the file holds.
        # If the column is always a plain list literal, ast.literal_eval or
        # json.loads would be a safer parser -- confirm the file format
        # before switching.
        rpm = eval(chip_row[4])

        # Flush the previous site once the rows move on to another one.
        moved_on = previous_seqname != seqname or previous_tss != tss
        if previous_tss is not None and moved_on:
            print(json.dumps(sites[previous_seqname][previous_tss]))
            sites[previous_seqname].pop(previous_tss, None)

        tss_site = sites[seqname][tss]
        correction = float(read_counts[sample][mark]) / 1000000
        normalised = [x / correction for x in rpm]
        tss_site['samples'].setdefault(sample, {})[mark] = normalised

        previous_seqname = seqname
        previous_tss = tss
        progress.update()

    # Flush the last site (nothing to flush when no ChIP row was read).
    if previous_tss is not None:
        print(json.dumps(sites[previous_seqname][previous_tss]))

    chip_f.close()
    sys.stderr.write("\nAll done!\n")