Example #1
def main(job_no, coord_name, start, end, species, release, dir):
    start_time = time()
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(
        os.path.basename(__file__)) + '_' + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__),
                       label="Hex digest of script.".ljust(17))
    LOGGER.log_message('Name = ' + ensembldb3.__name__ + ', version = ' +
                       ensembldb3.__version__,
                       label="Imported module".ljust(30))
    account = HostAccount(*os.environ['ENSEMBL_ACCOUNT'].split())

    ig_count, sequence_length = 0, 0
    genome = Genome(species,
                    release=release,
                    account=account,
                    pool_recycle=3600)
    gene_count = 0
    gene_intervals = list()
    genes = genome.get_features(coord_name=coord_name,
                                start=start,
                                end=end,
                                feature_types='gene')
    for gene in genes:
        if gene.location.coord_name != coord_name:
            break
        gene_count += 1
        gene_intervals.append((gene.location.start, gene.location.end))
    gene_intervals = sorted(gene_intervals, key=lambda x: x[1])
    intergenic = interval_complement(gene_intervals)
    intergenic_sequence = ""
    for ig_interval in intergenic:
        ig_count += 1
        sequence_length += ig_interval[1] - ig_interval[0]
        region = genome.get_region(coord_name=coord_name,
                                   start=ig_interval[0],
                                   end=ig_interval[1])
        intergenic_sequence = intergenic_sequence + 'XXXXXXXXXX' + str(
            region.seq)

    LOGGER.log_message(
        str(ig_count),
        label='Number of intergenic intervals processed'.ljust(30))
    LOGGER.log_message(str(sequence_length), label='Sequence length'.ljust(30))

    outfile_name = dir + '/intergenic_sequence_' + species + job_no + '.pklz'
    with gzip.open(outfile_name, 'wb') as outfile:
        pickle.dump(intergenic_sequence, outfile)
    outfile = open(outfile_name, 'r')
    LOGGER.output_file(outfile.name)
    outfile.close()

    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)".ljust(30))
def main(job_no, coord_name, start, end, species, release, folder):
    start_time = time()
    if not os.path.exists(folder):
        os.makedirs(folder)
    LOGGER.log_file_path = folder + "/" + str(
        os.path.basename(__file__)) + '_' + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__),
                       label="Hex digest of script.".ljust(17))
    LOGGER.log_message('Name = ' + ensembldb3.__name__ + ', version = ' +
                       ensembldb3.__version__,
                       label="Imported module".ljust(30))
    account = HostAccount(*os.environ['ENSEMBL_ACCOUNT'].split())

    dupl_introns, intron_count, sequence_length = 0, 0, 0
    intron_list = list()
    genome = Genome(species,
                    release=release,
                    account=account,
                    pool_recycle=3600)
    genes = genome.get_features(coord_name=coord_name,
                                start=start,
                                end=end,
                                feature_types='gene')
    intron_sequence = 'X'
    for gene in genes:
        if gene.canonical_transcript.introns is None:
            continue
        for intron in gene.canonical_transcript.introns:
            if intron in intron_list:
                dupl_introns += 1
                continue
            intron_list.append(intron)
            intron_count += 1
            sequence_length += len(intron)
            intron_sequence = intron_sequence + 'XXXXXXXXXX' + str(intron.seq)
    outfile_name = folder + '/intronic_sequence' + species + job_no + '.pklz'
    with gzip.open(outfile_name, 'wb') as outfile:
        pickle.dump(intron_sequence, outfile)
    outfile = open(outfile_name, 'r')
    LOGGER.output_file(outfile.name)
    outfile.close()
    LOGGER.log_message(str(dupl_introns),
                       label='Number of duplicate introns rejected'.ljust(30))
    LOGGER.log_message(str(intron_count),
                       label='Number of introns processed'.ljust(30))
    LOGGER.log_message(str(sequence_length),
                       label='Total intron length'.ljust(30))

    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)".ljust(30))
Example #3
def main(job_no, infile_root, suffixes, dir):
    start_time = time()
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(
        os.path.basename(__file__)) + '_' + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__),
                       label="Hex digest of script.".ljust(17))
    try:
        LOGGER.log_message(str(os.environ['CONDA_DEFAULT_ENV']),
                           label="Conda environment.".ljust(17))
    except KeyError:
        pass
    LOGGER.log_message('Name = ' + np.__name__ + ', version = ' +
                       np.__version__,
                       label="Imported module".ljust(30))
    LOGGER.log_message('Name = ' + pd.__name__ + ', version = ' +
                       pd.__version__,
                       label="Imported module".ljust(30))

    file_suffixes = suffixes.split(',')
    total_dict = defaultdict(int)
    counts = list()
    for c in file_suffixes:
        filename = dir + '/' + infile_root + c + '.pklz'
        infile = open(filename, 'r')
        LOGGER.input_file(infile.name)
        infile.close()
        with gzip.open(filename, 'rb') as chrdict:
            chrdict = pickle.load(chrdict)
        counts.append([c, sum(chrdict.values())])
        for k in chrdict.keys():
            total_dict[k] += chrdict[k]
    counts = pd.DataFrame.from_records(counts)
    outfile_name = dir + '/' + 'merged_context_data_' + job_no + '.pklz'
    with gzip.open(outfile_name, 'wb') as outfile:
        pickle.dump(total_dict, outfile)
    outfile = open(outfile_name, 'r')
    LOGGER.output_file(outfile.name)
    outfile.close()
    fname = dir + '/' + 'merge_counts_' + job_no + '.csv'
    counts.to_csv(fname)
    outfile = open(fname, 'r')
    LOGGER.output_file(outfile.name)
    outfile.close()

    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="Run duration (minutes)".ljust(50))
Example #4
def test_read_from_written():
    """create files with different line endings dynamically"""
    text = "abcdeENDedfguENDyhbnd"
    with TemporaryDirectory(dir=TEST_ROOTDIR) as dirname:
        for ex, lf in (
            ("f06597f8a983dfc93744192b505a8af9", "\n"),
            ("39db5cc2f7749f02e0c712a3ece12ffc", "\r\n"),
        ):
            p = Path(dirname) / "test.txt"
            data = text.replace("END", lf)
            p.write_bytes(data.encode("utf-8"))
            expect = get_text_hexdigest(data)
            assert expect == ex, (expect, ex)
            got = get_file_hexdigest(p)
            assert got == expect, f"FAILED: {repr(lf)}, {(ex, got)}"
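# The expected digests above are 32 hex characters, i.e. MD5-sized. For orientation, a
# minimal sketch of equivalent helpers is given below; the real get_text_hexdigest and
# get_file_hexdigest may differ in detail (e.g. chunked reads or line-ending handling).
import hashlib

def text_hexdigest_sketch(text):
    # digest of the UTF-8 encoded text
    return hashlib.md5(text.encode("utf-8")).hexdigest()

def file_hexdigest_sketch(path):
    # digest of the raw file bytes, so "\n" and "\r\n" files differ, as the test expects
    with open(path, "rb") as infile:
        return hashlib.md5(infile.read()).hexdigest()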
Example #5
def main(job_no, folder, fname, dir):
    start_time = time()
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(os.path.basename(__file__)) + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__), label="Hex digest of script.".ljust(17))
    LOGGER.log_message('Name = ' + np.__name__ + ', version = ' + np.__version__, label="Imported module ")
    LOGGER.log_message('Name = ' + pd.__name__ + ', version = ' + pd.__version__, label="Imported module ")

    csv_filename = '/male_noncarrier.rmap'
    male = pd.read_csv(folder + csv_filename, sep='\t')
    male = male.sort_values(['chr', 'pos'])

    csv_filename = '/female_noncarrier.rmap'
    female = pd.read_csv(folder + csv_filename, sep='\t')
    female = female.sort_values(['chr', 'pos'])

    for chrom in range(1, 23):
        chrom = str(chrom)
        m = male[male['chr'] == 'chr' + chrom]
        f = female[female['chr'] == 'chr' + chrom]
        csv_name = dir + '/' + fname + chrom + '.csv'
        infile = open(csv_name, 'r')
        LOGGER.input_file(infile.name)
        infile.close()
        table = pd.read_csv(csv_name, sep=',', index_col=0)
        #Note we are actually matching on hg37 coordinates used by deCODE
        xtable = table.merge(m, 'left', ['chr', 'pos'], sort=True, suffixes=['_sexav', '_male'], \
                             indicator='indicm', validate='1:1')
        xtable.rename(columns={"stdrate_sexav": "stdrate", "seqbin_sexav": "seqbin"}, inplace=True)
        xtable = xtable.merge(f, 'left', ['chr', 'pos'], sort=True, suffixes=['_sexav', '_female'], \
                              indicator='indicf', validate='1:1')
        assert np.all(xtable['indicm'] == 'both'), 'Merge error with male.'
        assert np.all(xtable['indicf'] == 'both'), 'Merge error with female.'
        csv_filename = 'Recombination_data/recomb_table_all_sexes_ch' + chrom + '.csv'
        xtable.to_csv(csv_filename, sep=',')
        outfile = open(csv_filename, 'r')
        LOGGER.output_file(outfile.name)
        outfile.close()
        print(xtable.head())

    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.), label="run duration (minutes)".ljust(30))
def main_core(job_no, filename=None, n_jobs=5, context_size=9, dir='data'):
    #global sequence
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(
        os.path.basename(__file__)) + '_' + job_no + ".log"
    start_time = time()
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__),
                       label="Hex digest of script.".ljust(30))
    LOGGER.log_message('Name = ' + re.__name__ + ', version = ' +
                       re.__version__,
                       label="Imported module".ljust(30))
    infile = open(filename, 'r')
    LOGGER.input_file(infile.name)
    infile.close()
    with gzip.open(filename, 'rb') as sequence:  # i.e. intron sequence
        sequence = pickle.load(sequence)
    context_size = int(context_size)
    contexts_generator = itertools.product('ACGT', repeat=context_size)
    contexts = tuple(''.join(context) for context in contexts_generator)
    concounts = list()
    for context in contexts:
        concount = count_single_context(context, sequence)
        concounts.append(concount)
    #concounts = Parallel(n_jobs=n_jobs)(delayed(count_single_context)(context) for context in contexts)
    context_dict = dict(zip(contexts, concounts))

    outfile_name = dir + '/context_dict_' + job_no + '.pklz'
    with gzip.open(outfile_name, 'wb') as outfile:
        pickle.dump(context_dict, outfile)
    outfile = open(outfile_name, 'r')
    LOGGER.output_file(outfile.name)
    outfile.close()
    LOGGER.log_message(str(len(context_dict.keys())),
                       label="Number of dictionary keys".ljust(30))
    LOGGER.log_message(str(sum(context_dict.values())),
                       label="Count of contexts".ljust(30))
    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)".ljust(30))
def main(job_no, coord_name, start, end, species, release, var_set_id, dir):
    start_time = time()
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(
        os.path.basename(__file__)) + '_' + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__),
                       label="Hex digest of script.".ljust(17))
    LOGGER.log_message('Name = ' + ensembldb3.__name__ + ', version = ' +
                       ensembldb3.__version__,
                       label="Imported module".ljust(30))
    LOGGER.log_message('Name = ' + sqlalchemy.__name__ + ', version = ' +
                       sqlalchemy.__version__,
                       label="Imported module".ljust(30))
    account = HostAccount(*os.environ['ENSEMBL_ACCOUNT'].split())

    genome = Genome(species,
                    release=release,
                    account=account,
                    pool_recycle=3600)
    confirm_variation_set(genome, var_set_id)
    var_locations = get_variant_details(genome, coord_name, start, end)
    LOGGER.log_message(str(len(var_locations)),
                       label='Length of var_locations list'.ljust(30))

    outfile_name = dir + '/intergenic_variants_' + species + '_' + job_no + '.pklz'
    with gzip.open(outfile_name, 'wb') as outfile:
        pickle.dump(var_locations, outfile)
    outfile = open(outfile_name, 'r')
    LOGGER.output_file(outfile.name)
    outfile.close()

    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)".ljust(30))
Example #8
def main(job_no, sex, chroms, rank, dir):
    start_time = time()
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(
        os.path.basename(__file__)) + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__),
                       label="Hex digest of script.".ljust(17))
    try:
        LOGGER.log_message(str(os.environ['CONDA_DEFAULT_ENV']),
                           label="Conda environment.".ljust(17))
    except KeyError:
        pass
    LOGGER.log_message('Name = ' + np.__name__ + ', version = ' +
                       np.__version__,
                       label="Imported module ")
    LOGGER.log_message('Name = ' + pd.__name__ + ', version = ' +
                       pd.__version__,
                       label="Imported module ")
    LOGGER.log_message('Name = ' + scipy.__name__ + ', version = ' +
                       scipy.__version__,
                       label="Imported module ")
    LOGGER.log_message('Name = ' + statsmodels.__name__ + ', version = ' +
                       statsmodels.__version__,
                       label="Imported module ")
    LOGGER.log_message('Name = ' + sklearn.__name__ + ', version = ' +
                       sklearn.__version__,
                       label="Imported module ")
    pd.set_option('display.max_columns', None)
    if chroms is None:
        chroms = np.arange(1, 23).astype(str).tolist()
    else:
        chroms = chroms.split(',')
    if rank:
        LOGGER.log_message("%1d" % rank,
                           label="Rank of model to select (best=0).".ljust(30))
    for chrom in chroms:
        csv_name = dir + '/recomb_table_all_sexes_ch' + chrom + '.csv'
        infile = open(csv_name, 'r')
        LOGGER.input_file(infile.name)
        infile.close()
        data_table = pd.read_csv(csv_name, sep=',', index_col=0)
        data_table = recombination.correct_missing_data(
            data_table, 'LOCF', sex)
        std_col = 'stdrate_' + sex
        std_rates = data_table[std_col].values
        variants_profiled = data_table.iloc[:, np.arange(5, 17)]
        variant_counts = variants_profiled.sum(axis=1)
        var_rates = variant_counts / 10000
        print('\n\nChromosome number   = ' + chrom)
        print('Avge. mutation rate = ', np.mean(var_rates))
        xvals = std_rates.reshape(-1, 1)
        lmodel = LinearRegression()
        lmodel.fit(xvals, var_rates)
        residuals = var_rates - lmodel.predict(xvals)
        sys.stdout.flush()
        print('Slope, intercept, mean of residuals = ',
              '%.8f' % lmodel.coef_[0], '%.8f' % lmodel.intercept_,
              '%.12f' % np.mean(residuals))
        orders = recombination.evaluate_ARMA_models(residuals, 10, 4)
        best_order = orders[rank]
        best_mdl = smt.ARMA(residuals, order=best_order).fit(method='mle',
                                                             trend='nc',
                                                             disp=0)
        print(best_mdl.summary())
        outfile_name = dir + '/ARMA_model_ch' + chrom + '_' + job_no + '.pklz'
        recombination.save_model_details(best_mdl, outfile_name)
        outfile = open(outfile_name, 'r')
        LOGGER.output_file(outfile.name)
        outfile.close()

    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)".ljust(30))
Example #9
def main_core(job_no, species, varfile_name=None, intronfile_name=None, release=89, n_jobs=5, dir='data'):
    global genome
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(os.path.basename(__file__)) + '_' + job_no + ".log"
    start_time = time()
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__), label="Hex digest of script.".ljust(25))
    LOGGER.log_message('Name = ' + numpy.__name__ + ', version = ' + numpy.__version__,
                       label="Imported module".ljust(25))
    LOGGER.log_message('Name = ' + cogent3.__name__ + ', version = ' + cogent3.__version__,
                       label="Imported module".ljust(25))
    LOGGER.log_message('Name = ' + ensembldb3.__name__ + ', version = ' + ensembldb3.__version__,
                       label="Imported module".ljust(25))
    account = HostAccount(*os.environ['ENSEMBL_ACCOUNT'].split())
    genome = Genome(species, release=release, account=account, pool_recycle=3600)
    human_seq_region_dict = dict(
        {'1': 131550, '2': 131545, '3': 131551, '4': 131552, '5': 131542, '6': 131555, '7': 131559,
         '8': 131560, '9': 131540, '10': 131544, '11': 131556, '12': 131546, '13': 131541,
         '14': 131547, '15': 131558,
         '16': 131549, '17': 131554, '18': 131548, '19': 131537, '20': 131538, '21': 131543,
         '22': 131557,
         'X': 131539, 'Y': 131553})
    chimp_seq_region_dict = dict({"21": 212405, "7": 212407, "15": 212409, "16": 212395, "1": 212403, "17": 212411,
                                  "18": 212410, "19": 212394, "20": 212404, "22": 212390, "3": 212392, "4": 212393,
                                  "5": 212391, "6": 212388, "8": 212397, "9": 212396, "10": 212387, "11": 212389,
                                  "12": 212402, "13": 212408, "14": 212401, "Y": 212406, "X": 212399})
    if species == 'human':
        coord_dict = dict([(v, k) for k, v in human_seq_region_dict.items()])
        tag = 'human'
    elif species == 'chimp':
        coord_dict = dict([(v, k) for k, v in chimp_seq_region_dict.items()])
        tag = 'spec_'
    else:
        assert False, 'Unknown species: ' + species
    if varfile_name is None:
        varfile_name = dir + '/var_locations_' + tag + job_no + '.pklz'
    infile = open(varfile_name, 'r')
    LOGGER.input_file(infile.name)
    infile.close()
    with gzip.open(varfile_name, 'rb') as var_details:
        var_details = pickle.load(var_details)
    LOGGER.log_message(str(len(var_details)), label="Number of variants read".ljust(25))

    if intronfile_name is None:
        intronfile_name = dir + '/all_locations_' + tag + job_no + '.pklz'
    infile = open(intronfile_name, 'r')
    LOGGER.input_file(infile.name)
    infile.close()
    with gzip.open(intronfile_name, 'rb') as intron_locs:
        intron_locs = pickle.load(intron_locs)
    LOGGER.log_message(str(len(intron_locs)), label="Number of introns read".ljust(25))
    var_details, var_locs_reversed = check_variant_strand(var_details, intron_locs)

#   var_details fields are: (variant name, seq region id, location, ancestral_allele, derived_allele)
    item_list = Parallel(n_jobs=n_jobs)(delayed(get_contexts) (var, coord_dict) for var in var_details)
    var_count_dict = Counter(item_list)
    del var_count_dict[None]
    outfile_name = dir + '/var_dict_' + tag + job_no + '.pklz'
    with gzip.open(outfile_name, 'wb') as outfile:
        pickle.dump(var_count_dict, outfile)
    outfile = open(outfile_name, 'r')
    LOGGER.output_file(outfile.name)
    outfile.close()
    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.), label="run duration (minutes)".ljust(25))
Example #10
def main(job_no, coord_name, start, end, species, release, var_set_id, filter,
         dir):
    start_time = time()
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(
        os.path.basename(__file__)) + '_' + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__),
                       label="Hex digest of script.".ljust(17))
    LOGGER.log_message('Name = ' + ensembldb3.__name__ + ', version = ' +
                       ensembldb3.__version__,
                       label="Imported module".ljust(30))
    LOGGER.log_message('Name = ' + sqlalchemy.__name__ + ', version = ' +
                       sqlalchemy.__version__,
                       label="Imported module".ljust(30))
    account = HostAccount(*os.environ['ENSEMBL_ACCOUNT'].split())

    var_locations_list, location_list = list(), list()
    dupl_introns, intron_count, bad_var_count, sequence_length = 0, 0, 0, 0
    intron_list = list()
    genome = Genome(species,
                    release=release,
                    account=account,
                    pool_recycle=3600)
    confirm_variation_set(genome, var_set_id)
    genes = genome.get_features(coord_name=coord_name,
                                start=start,
                                end=end,
                                feature_types='gene')
    for gene in genes:
        if gene.canonical_transcript.introns is None:
            continue
        for intron in gene.canonical_transcript.introns:
            if intron in intron_list:
                dupl_introns += 1
                continue
            intron_list.append(intron)
            intron_length = len(intron)
            intron_count += 1
            sequence_length += intron_length
            loc = intron.location
            location_list.append(
                (str(loc.coord_name), loc.start, loc.end,
                 loc.strand))  # location.coord_name is db3util object
            var_locations, bad_var_num = get_variant_details(
                genome, species, intron, filter)
            var_locations_list = var_locations_list + var_locations
            bad_var_count += bad_var_num
    LOGGER.log_message(str(dupl_introns),
                       label='Number of duplicate introns rejected'.ljust(30))
    LOGGER.log_message(str(intron_count),
                       label='Number of introns processed'.ljust(30))
    if species == 'human':
        LOGGER.log_message(str(bad_var_count),
                           label='Number of rejected variants'.ljust(30))
    LOGGER.log_message(str(sequence_length), label='Sequence length'.ljust(30))
    LOGGER.log_message(str(len(var_locations_list)),
                       label='Length of var_locations list'.ljust(30))
    LOGGER.log_message(str(len(var_locations_list) / sequence_length),
                       label='Average SNV rate'.ljust(30))

    outfile_name = dir + '/var_locations_' + species + job_no + '.pklz'
    with gzip.open(outfile_name, 'wb') as outfile:
        pickle.dump(var_locations_list, outfile)
    outfile = open(outfile_name, 'r')
    LOGGER.output_file(outfile.name)
    outfile.close()

    outfile_name = dir + '/all_locations_' + species + job_no + '.pklz'
    with gzip.open(outfile_name, 'wb') as outfile:
        pickle.dump(location_list, outfile)
    outfile = open(outfile_name, 'r')
    LOGGER.output_file(outfile.name)
    outfile.close()

    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)".ljust(30))
def main(job_no, chroms, draws, sex, suffixes, folder):
    start_time = time()
    if not os.path.exists(folder):
        os.makedirs(folder)
    LOGGER.log_file_path = folder + "/" + str(os.path.basename(__file__)) + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__), label="Hex digest of script.".ljust(17))
    try:
        LOGGER.log_message(str(os.environ['CONDA_DEFAULT_ENV']), label="Conda environment.".ljust(17))
    except KeyError:
        pass
    LOGGER.log_message('Name = ' + np.__name__ + ', version = ' + np.__version__, label="Imported module ")
    LOGGER.log_message('Name = ' + pd.__name__ + ', version = ' + pd.__version__, label="Imported module ")
    LOGGER.log_message('Name = ' + theano.__name__ + ', version = ' + theano.__version__, label="Imported module ")
    LOGGER.log_message('Name = ' + pymc3.__name__ + ', version = ' + pymc3.__version__, label="Imported module ")

    # Mutation rates per chromosome are calculated from data in Jonsson et al., 'Parental influence on human
    # germline de novo mutations in 1,548 trios from Iceland' (see 'Estimate mutation rates from Jonsson data.ipynb').
    mrates = [1.1045541764661985e-08, 1.2481509352581898e-08, 1.254443516411994e-08, 1.2609734521720365e-08,
              1.216379148788216e-08, 1.2228991967962778e-08, 1.2298304077726808e-08, 1.3325693328599174e-08,
              1.0711369887343474e-08, 1.238059175011868e-08, 1.2241940318060874e-08, 1.2117457093135447e-08,
              1.0174746106096945e-08, 1.0146311894484388e-08, 1.0516600482736078e-08, 1.2597261162425896e-08,
              1.1681529656302903e-08, 1.1855256275211491e-08, 1.214570124735936e-08, 1.1756514975959873e-08,
              8.965863348091259e-09, 9.024242643357694e-09]
    result_rows = list()
    columns = ['snvdens', 'p', 'q', 'alpha', 'alpha25', 'alpha975', 'beta', 'beta25', 'beta975', 'slopem',
               'slopem25', 'slopem75', 'pval', 'r2', 'variance', 'variance25', 'variance975', 'mutprop', 'mutprop25',
               'mutprop975', 'mutperco', 'mutperco25', 'mutperco975']
    if chroms is None:
        chroms = np.arange(1, 23).astype(str).tolist()
    else:
        chroms = chroms.split(',')
    for chromplace, chrom in enumerate(chroms):
        print(chromplace, 'Chromosome ', chrom)
    if suffixes is None:
        suffixes = len(chroms) * [""]
    else:
        suffixes = suffixes.split(',')
    results = list()
    for i, chrom in enumerate(chroms):
        csv_filename = folder + '/recomb_table_all_sexes_ch' + chrom + '.csv'
        infile = open(csv_filename, 'r')
        LOGGER.input_file(infile.name)
        infile.close()
        data_table = pd.read_csv(csv_filename, sep=',', index_col=0)
        data_table = recombination.correct_missing_data(data_table, 'LOCF', sex)
        variants_profiled = data_table.iloc[:, np.arange(5, 17)]
        variant_counts = variants_profiled.sum(axis=1)
        var_rates = variant_counts / 10000
        std_col = 'stdrate_' + sex
        std_rates = data_table[std_col].values
        print('Avge. & var. of mutation rate ', np.mean(var_rates), np.var(var_rates))
        suffix = suffixes[i]
        file_name = folder + '/ARMA_model_ch' + str(chrom) + suffix + '.pklz'
        infile = open(file_name, 'r')
        LOGGER.input_file(infile.name)
        infile.close()
        with gzip.open(file_name, 'rb') as model_details:
            model_details = pickle.load(model_details)
        trace = recombination.run_MCMC_ARMApq(std_rates, var_rates, draws, model_details)
        p = model_details['order'][0]
        q = model_details['order'][1]
        neg_slope = sum(t['beta'] <= 0 for t in trace)
        print('Chrom', chrom, 'variates with slope <=0 ', neg_slope)
        print('Chrom', chrom, 'probability slope <=0 = ', neg_slope / draws)
        ss_tot = np.var(var_rates)
        ss_res_vars = np.array(
            [np.var(var_rates - t['alpha'] - t['beta'] * std_rates) for t in trace])
        r2_vars = 1 - (ss_res_vars / ss_tot)
        variance_variates = ss_tot - ss_res_vars
        vmean = np.mean(variance_variates)
        vlow = np.percentile(variance_variates, 2.5)
        vhigh = np.percentile(variance_variates, 97.5)
        chr_results = pd.DataFrame(variance_variates, columns=['vars'])
        chr_results['chr'] = chrom
        results.append(chr_results)
        snv_dens = np.mean(var_rates)
        intercept_mean = np.mean([t['alpha'] for t in trace])
        intercept_CI_low = np.percentile([t['alpha'] for t in trace], 2.5)
        intercept_CI_high = np.percentile([t['alpha'] for t in trace], 97.5)
        rfunc = lambda x: (snv_dens - x) / snv_dens
        mfunc = lambda x: x * mutation_rate / (snv_dens * 0.0116)
        print('Proportion muts due to recomb = ', rfunc(intercept_mean),
              'CIs = ', rfunc(intercept_CI_high), rfunc(intercept_CI_low))
        recomb_rate = np.mean(std_rates) * 0.0116 / (100 * 1e4)
        mutation_rate = mrates[i]
        mutsper = (rfunc(intercept_mean) * mutation_rate) / recomb_rate
        print('Mutations per CO = ', mutsper)
        sys.stdout.flush()
        s = summary(trace,  varnames=['alpha', 'beta'])
        result_row = [np.mean(var_rates), p, q,
                      s.loc['alpha', 'mean'],  s.loc['alpha', 'hpd_2.5'], s.loc['alpha', 'hpd_97.5'],
                      s.loc['beta', 'mean'],   s.loc['beta', 'hpd_2.5'],  s.loc['beta', 'hpd_97.5'],
                      mfunc(s.loc['beta', 'mean']), mfunc(s.loc['beta', 'hpd_2.5']), mfunc(s.loc['beta', 'hpd_97.5']),
                      neg_slope / draws, np.mean(r2_vars),
                      vmean, vlow, vhigh, rfunc(intercept_mean), rfunc(intercept_CI_low), rfunc(intercept_CI_high),
                      mutsper, (rfunc(intercept_CI_high) * mutation_rate) / recomb_rate,
                      (rfunc(intercept_CI_low) * mutation_rate) / recomb_rate]
        result_rows.append(result_row)
    results_table = pd.DataFrame(result_rows, columns=columns)
    outfile_name = folder + '/ARMApq_results_' + job_no + '.csv'
    results_table.to_csv(outfile_name)
    outfile = open(outfile_name, 'r')
    LOGGER.output_file(outfile.name)
    outfile.close()
    results = pd.concat(results)
    outfile_name = folder + '/ARMApq_variates_' + sex + '_'+ job_no + '.pklz'
    with gzip.open(outfile_name, 'wb') as outfile:
        pickle.dump(results, outfile)
    outfile = open(outfile_name, 'r')
    LOGGER.output_file(outfile.name)
    outfile.close()
    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.), label="run duration (minutes)".ljust(30))
Example #12
def main(job_no, suffix, dir):
    start_time = time()
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(
        os.path.basename(__file__)) + '_' + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__),
                       label="Hex digest of script.".ljust(17))
    try:
        LOGGER.log_message(str(os.environ['CONDA_DEFAULT_ENV']),
                           label="Conda environment.".ljust(17))
    except KeyError:
        pass
    LOGGER.log_message('Name = ' + np.__name__ + ', version = ' +
                       np.__version__,
                       label="Imported module".ljust(30))
    LOGGER.log_message('Name = ' + pd.__name__ + ', version = ' +
                       pd.__version__,
                       label="Imported module".ljust(30))
    LOGGER.log_message('Name = ' + statsmodels.__name__ + ', version = ' +
                       statsmodels.__version__,
                       label="Imported module".ljust(30))

    result = pd.DataFrame(
        columns=['kmer', 'variance', 'Marginalise over central base?'])

    #Find variance due to CpG
    filename = dir + '/var_counts_1' + suffix + '.pklz'
    infile = open(filename, 'r')
    LOGGER.input_file(infile.name)
    infile.close()
    with gzip.open(filename, 'rb') as var_counts:
        var_counts = pickle.load(var_counts)
    filename = dir + '/context_counts_1' + suffix + '.pklz'
    infile = open(filename, 'r')
    LOGGER.input_file(infile.name)
    infile.close()
    with gzip.open(filename, 'rb') as context_counts:
        context_counts = pickle.load(context_counts)

    cpg_contexts = context_counts.loc['CG', 'C'] + context_counts.loc['TG', 'C'] + context_counts.loc['AG', 'C'] + \
                   context_counts.loc['GG', 'C'] + \
                   context_counts.loc['CC', 'G'] + context_counts.loc['CT', 'G'] + context_counts.loc['CA', 'G'] + \
                   context_counts.loc['CG', 'G']
    CpG_ratio = cpg_contexts / context_counts.values.sum()
    non_cpg_contexts = context_counts.values.sum() - cpg_contexts
    print('Total CpG sites           : ', cpg_contexts)
    print('Total intronic sites      : ', context_counts.values.sum())
    print('Proportion CpG sites      : ', CpG_ratio)
    var_counts['C'] = var_counts['C->T'] + var_counts['C->A'] + var_counts['C->G']
    var_counts['G'] = var_counts['G->T'] + var_counts['G->A'] + var_counts['G->C']
    CpG_vars = var_counts.loc['CG', 'C'] + var_counts.loc['TG', 'C'] + var_counts.loc['AG', 'C'] + \
               var_counts.loc['GG', 'C'] + \
               var_counts.loc['CC', 'G'] + var_counts.loc['CT', 'G'] + var_counts.loc['CA', 'G'] + \
               var_counts.loc['CG', 'G']
    print('Total CpG variants        : ', CpG_vars)
    non_CpG_vars = var_counts.values.sum() - CpG_vars
    m1 = CpG_vars / cpg_contexts
    m0 = non_CpG_vars / non_cpg_contexts
    m_ave = var_counts.values.sum() / context_counts.values.sum()
    print('SNV density at CpG sites  : ', m1)
    print('SNV density at other sites: ', m0)
    print('Average SNV density       : ', m_ave)
    t1 = CpG_ratio * (m1 - m_ave)**2
    t2 = (1 - CpG_ratio) * (m0 - m_ave)**2
    print('Variance due to CpG sites : ', t1 + t2)
    LOGGER.log_message("%.2e" % (t1 + t2),
                       label="Variance due to CpG".ljust(50))

    #Deal with the 1-mer case.
    var_counts['C'] = var_counts['C->T'] + var_counts['C->A'] + var_counts['C->G']
    var_counts['T'] = var_counts['T->C'] + var_counts['T->A'] + var_counts['T->G']
    var_counts['A'] = var_counts['A->T'] + var_counts['A->C'] + var_counts['A->G']
    var_counts['G'] = var_counts['G->T'] + var_counts['G->A'] + var_counts['G->C']
    variant_counts = var_counts.sum(axis=0)
    variant_counts = variant_counts[variant_counts.index.isin(
        ['C', 'T', 'A', 'G'])]
    con_counts = context_counts.sum(axis=0)
    mut_rates = variant_counts / con_counts
    w = DescrStatsW(mut_rates, weights=con_counts, ddof=0)
    row = np.array([1, w.var, 'no'])
    row = pd.Series(row, index=result.columns, name=0)
    result = result.append(row)
    row = np.array([1, 0.0, 'yes'])
    row = pd.Series(row, index=result.columns, name=1)
    result = result.append(row)

    i = 2
    for kmer_variable in [1, 2, 3]:
        filename = dir + '/var_counts_' + str(kmer_variable) + suffix + '.pklz'
        infile = open(filename, 'r')
        LOGGER.input_file(infile.name)
        infile.close()
        with gzip.open(filename, 'rb') as var_counts:
            var_counts = pickle.load(var_counts)
        filename = dir + '/context_counts_' + str(
            kmer_variable) + suffix + '.pklz'
        infile = open(filename, 'r')
        LOGGER.input_file(infile.name)
        infile.close()
        with gzip.open(filename, 'rb') as context_counts:
            context_counts = pickle.load(context_counts)

        #Reformat context counts by repeating columns to match snv_densities dataframe.
        extended_context_counts, cols = probpoly_bayes.reformat_context_counts(
            context_counts, var_counts)
        extended_context_counts.set_index(context_counts.index, inplace=True)
        snv_densities = var_counts / extended_context_counts

        #Calculate variance across mutation types, marginalising over the central base.
        context_ratios = context_counts.div(context_counts.sum(axis=1), axis=0)
        extended_context_ratios, cols = probpoly_bayes.reformat_context_counts(
            context_ratios, snv_densities)
        extended_context_ratios.set_index(context_ratios.index, inplace=True)
        con_weighted = (snv_densities * extended_context_ratios).sum(axis=1)
        u = DescrStatsW(con_weighted,
                        weights=context_counts.sum(axis=1),
                        ddof=0)
        print('Marginalised variance due to ' + str(2 * kmer_variable + 1) +
              '-mers = ', u.var)
        row = np.array([2 * kmer_variable + 1, u.var, 'yes'])
        row = pd.Series(row, index=result.columns, name=i)
        result = result.append(row)
        i += 1

        #Calculate variance conditioned on kmer, not marginalising over the central base.
        #Firstly we reorganise the SNV densities table so that rows correspond to kmers
        # (including central base) and columns correspond to the derived base.
        contexts_generator = product('ACGT', repeat=2 * kmer_variable + 1)
        contexts = tuple(''.join(context) for context in contexts_generator)
        kmer_densities = np.zeros((len(contexts), 4))
        kmer_densities = pd.DataFrame(kmer_densities,
                                      index=contexts,
                                      columns=['C', 'T', 'A', 'G'])
        for context in snv_densities.index:
            for mut in snv_densities.columns:
                ref = mut[0]
                derived = mut[3]
                kmer = context[0:kmer_variable] + ref + context[
                    kmer_variable:2 * kmer_variable]
                kmer_densities.loc[kmer, derived] = snv_densities.loc[context,
                                                                      mut]

        #We also reorganise context counts into counts of kmers.
        kmer_counts = np.zeros((len(contexts)))
        kmer_counts = pd.Series(kmer_counts, index=contexts)
        for kmer in kmer_counts.index:
            context = kmer[0:kmer_variable] + kmer[kmer_variable +
                                                   1:2 * kmer_variable + 1]
            ref = kmer[kmer_variable]
            kmer_counts[kmer] = context_counts.loc[context, ref]

        #Calculate the weighted variance over the full kmer.
        v = DescrStatsW(kmer_densities.sum(axis=1),
                        weights=kmer_counts,
                        ddof=0)
        print('Unmarginalised variance due to ' + str(2 * kmer_variable + 1) +
              '-mers = ', v.var)
        row = np.array([2 * kmer_variable + 1, v.var, 'no'])
        row = pd.Series(row, index=result.columns, name=i)
        result = result.append(row)
        i += 1

    print(result)
    filename = dir + "/aggregated_results" + job_no + ".csv"
    result.to_csv(filename, sep=',')
    outfile = open(filename, 'r')
    LOGGER.output_file(outfile.name)
    outfile.close()

    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="Run duration (minutes)".ljust(50))
Example #13
def main(job_no, var_filename, context_filename, draws, prior, nocpg, dir):
    start_time = time()
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(
        os.path.basename(__file__)) + '_' + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__),
                       label="Hex digest of script.".ljust(17))
    try:
        LOGGER.log_message(str(os.environ['CONDA_DEFAULT_ENV']),
                           label="Conda environment.".ljust(17))
    except KeyError:
        pass
    LOGGER.log_message('Name = ' + np.__name__ + ', version = ' +
                       np.__version__,
                       label="Imported module".ljust(30))
    LOGGER.log_message('Name = ' + pd.__name__ + ', version = ' +
                       pd.__version__,
                       label="Imported module".ljust(30))
    draws = int(draws)

    infile_name = var_filename
    infile = open(infile_name, 'r')
    LOGGER.input_file(infile.name)
    infile.close()
    with gzip.open(infile_name, 'rb') as var_data:
        var_data = pickle.load(var_data)

    infile_name = context_filename
    infile = open(infile_name, 'r')
    LOGGER.input_file(infile.name)
    infile.close()
    with gzip.open(infile_name, 'rb') as context_data:
        context_data = pickle.load(context_data)

    for kmer_variable, split in zip([1, 2, 3], [1, 4, 64]):
        #for kmer_variable, split in zip([4], [1024]):
        contexts = probpoly_bayes.unpack_all_contexts_to_dataframe(
            kmer_variable, context_data)
        duration = time() - start_time
        print('Unpacked all_contexts for ', kmer_variable, '-mers at',
              "%.2f" % (duration / 60.), 'minutes.')
        sys.stdout.flush()
        variants = probpoly_bayes.unpack_var_contexts_to_dataframe(
            kmer_variable, var_data)
        duration = time() - start_time
        print('Unpacked var_contexts for ', kmer_variable, '-mers at',
              "%.2f" % (duration / 60.), 'minutes.')
        sys.stdout.flush()

        outfile_name = dir + '/var_counts_' + str(kmer_variable) + job_no + '.pklz'
        with gzip.open(outfile_name, 'wb') as outfile:
            pickle.dump(variants, outfile)
        outfile = open(outfile_name, 'r')
        LOGGER.output_file(outfile.name)
        outfile.close()

        outfile_name = dir + '/context_counts_' + str(kmer_variable) + job_no + '.pklz'
        with gzip.open(outfile_name, 'wb') as outfile:
            pickle.dump(contexts, outfile)
        outfile = open(outfile_name, 'r')
        LOGGER.output_file(outfile.name)
        outfile.close()

        ncols = 12
        contexts, columns = probpoly_bayes.reformat_context_counts(
            contexts, variants)
        w_var_samples = probpoly_bayes.calculate_variances(
            variants, contexts, split, prior, draws, ncols, columns)

        outfile_name = 'data/bayes_var_samples_' + job_no + '_k=' + str(
            kmer_variable) + '.pklz'
        with gzip.open(outfile_name, 'wb') as outfile:
            pickle.dump(w_var_samples, outfile)
        outfile = open(outfile_name, 'r')
        LOGGER.output_file(outfile.name)
        outfile.close()
        del w_var_samples

    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="Run duration (minutes)".ljust(50))
def main(job_no, chrom, sex, species, release, dir):
    start_time = time()
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(
        os.path.basename(__file__)) + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__),
                       label="Hex digest of script.".ljust(17))
    LOGGER.log_message('Name = ' + ensembldb3.__name__ + ', version = ' +
                       ensembldb3.__version__,
                       label="Imported module".ljust(30))
    LOGGER.log_message('Name = ' + sqlalchemy.__name__ + ', version = ' +
                       sqlalchemy.__version__,
                       label="Imported module".ljust(30))
    human_seq_region_dict = dict({
        '1': 131550,
        '2': 131545,
        '3': 131551,
        '4': 131552,
        '5': 131542,
        '6': 131555,
        '7': 131559,
        '8': 131560,
        '9': 131540,
        '10': 131544,
        '11': 131556,
        '12': 131546,
        '13': 131541,
        '14': 131547,
        '15': 131558,
        '16': 131549,
        '17': 131554,
        '18': 131548,
        '19': 131537,
        '20': 131538,
        '21': 131543,
        '22': 131557,
        'X': 131539,
        'Y': 131553
    })
    account = HostAccount(*os.environ['ENSEMBL_ACCOUNT'].split())
    genome = Genome(species,
                    release=release,
                    account=account,
                    pool_recycle=3600)
    variation_table = genome.VarDb.get_table('variation')
    variation_feature_table = genome.VarDb.get_table('variation_feature')
    var_table = variation_table.join(
        variation_feature_table, variation_feature_table.c.variation_id ==
        variation_table.c.variation_id)

    seq_region_id = human_seq_region_dict[chrom]

    file_name = sex + '_noncarrier-hg38.csv'
    infile = open(file_name, 'r')
    LOGGER.input_file(infile.name)
    infile.close()
    recombination_df = pd.read_csv(file_name, usecols=[0, 1, 2, 3, 4])
    recomb_df = recombination_df.loc[lambda df: df.chr == 'chr' + chrom, :]
    recomb_df = recomb_df.reset_index(drop=True)

    mut_profiles = [
        i[0] + '->' + i[1] for i in permutations(['C', 'T', 'A', 'G'], 2)
    ]
    counts = np.zeros((recomb_df.shape[0], 21))
    counts = pd.DataFrame(counts,
                          columns=mut_profiles +
                          ['C', 'T', 'A', 'G', 'SW', 'WS', 'SS', 'WW', 'CpG'])
    for index, row in recomb_df.iterrows():
        midpoint = row.loc['pos38']
        region = genome.get_region(coord_name=chrom,
                                   start=midpoint - 5000,
                                   end=midpoint + 5000,
                                   ensembl_coord=True)
        region = str(region.seq)
        whereclause1 = and_(
            var_table.c.variation_feature_seq_region_id == seq_region_id,
            var_table.c.variation_feature_class_attrib_id == 2,
            var_table.c.variation_feature_evidence_attribs.contains('370'),
            var_table.c.variation_feature_variation_name.contains('rs'),
            var_table.c.variation_feature_somatic == 0,
            var_table.c.variation_feature_alignment_quality ==
            decimal.Decimal(1),
            var_table.c.variation_feature_minor_allele_freq.isnot(None),
            var_table.c.variation_feature_seq_region_start > midpoint - 5000,
            var_table.c.variation_feature_seq_region_start < midpoint + 5000)
        var_table_ed = var_table.select(whereclause1, use_labels=True)

        for snp in var_table_ed.execute():
            if snp['variation_ancestral_allele'] is None:
                continue
            else:
                ancestral_allele = snp['variation_ancestral_allele']
            alleles = snp['variation_feature_allele_string']
            if fnmatch(alleles, ancestral_allele + '/?'):
                derived_allele = alleles[2]
            elif fnmatch(alleles, '?/' + ancestral_allele):
                derived_allele = alleles[0]
            else:
                continue
            mtype = ancestral_allele + '->' + derived_allele
            counts.loc[index, mtype] += 1

            rel_loc = snp[
                'variation_feature_seq_region_start'] - midpoint + 5000
            if (region[rel_loc + 1] == 'G' and ancestral_allele == 'C' and derived_allele == 'T') or \
                    (region[rel_loc - 1] == 'C' and ancestral_allele == 'G' and derived_allele == 'A'):
                counts.loc[index, 'CpG'] += 1
            if ancestral_allele + derived_allele in ['CT', 'CA', 'GT', 'GA']:
                counts.loc[index, 'SW'] += 1
            if ancestral_allele + derived_allele in ['TC', 'AC', 'TG', 'AG']:
                counts.loc[index, 'WS'] += 1
            if ancestral_allele + derived_allele in ['CG', 'GC']:
                counts.loc[index, 'SS'] += 1
            if ancestral_allele + derived_allele in ['TA', 'AT']:
                counts.loc[index, 'WW'] += 1
        base_counts = Counter(region)
        for base in ['C', 'T', 'A', 'G']:
            counts.loc[index, base] = base_counts[base]

    results = pd.concat([recomb_df, counts], axis=1)
    csv_filename = 'recomb_table_SW_' + sex + '_ch' + chrom + '.csv'
    results.to_csv(csv_filename)
    outfile = open(csv_filename, 'r')
    LOGGER.output_file(outfile.name)
    outfile.close()

    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)".ljust(30))
Example #15
def main(job_no, infile_name, release, dir):
    start_time = time()
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(
        os.path.basename(__file__)) + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__),
                       label="Hex digest of script.".ljust(17))
    try:
        LOGGER.log_message(str(os.environ['CONDA_DEFAULT_ENV']),
                           label="Conda environment.".ljust(17))
    except KeyError:
        pass
    LOGGER.log_message('Name = ' + pd.__name__ + ', version = ' +
                       pd.__version__,
                       label="Imported module".ljust(30))
    LOGGER.log_message('Name = ' + ensembldb3.__name__ + ', version = ' +
                       ensembldb3.__version__,
                       label="Imported module".ljust(30))
    LOGGER.log_message('Name = ' + sqlalchemy.__name__ + ', version = ' +
                       sqlalchemy.__version__,
                       label="Imported module".ljust(30))
    human_seq_region_dict = dict({
        '1': 131550,
        '2': 131545,
        '3': 131551,
        '4': 131552,
        '5': 131542,
        '6': 131555,
        '7': 131559,
        '8': 131560,
        '9': 131540,
        '10': 131544,
        '11': 131556,
        '12': 131546,
        '13': 131541,
        '14': 131547,
        '15': 131558,
        '16': 131549,
        '17': 131554,
        '18': 131548,
        '19': 131537,
        '20': 131538,
        '21': 131543,
        '22': 131557,
        'X': 131539,
        'Y': 131553
    })
    account = HostAccount(*os.environ['ENSEMBL_ACCOUNT'].split())
    genome = Genome('human',
                    release=release,
                    account=account,
                    pool_recycle=3600)

    variation_feature_table = genome.VarDb.get_table('variation_feature')
    id_1KG = set([str(x) for x in range(42, 55)])
    var_details = pd.read_csv(infile_name, sep=',', index_col=0)
    infile = open(infile_name, 'r')
    LOGGER.input_file(infile.name)
    infile.close()
    loc_count, match_count, count1KG, derived_mismatch_count = 0, 0, 0, 0
    col_alleles, col_name, col_val_id = list(), list(), list()
    for row in var_details.iterrows():
        chrom = row[1].loc['chr']
        chrom = chrom[3:]
        seq_region_id = human_seq_region_dict[chrom]
        loc38 = row[1].loc['pos38']
        loc_count += 1
        whereclause1 = and_(
            variation_feature_table.c.seq_region_id == seq_region_id,
            variation_feature_table.c.seq_region_start == loc38,
            variation_feature_table.c.class_attrib_id == 2,
            variation_feature_table.c.variation_name.contains("rs"),
            variation_feature_table.c.somatic == 0,
            variation_feature_table.c.alignment_quality == decimal.Decimal(1),
            variation_feature_table.c.minor_allele_freq.isnot(None))
        query = select([
            variation_feature_table.c.variation_name,
            variation_feature_table.c.allele_string,
            variation_feature_table.c.variation_set_id
        ], whereclause1)
        snps = list(query.execute())

        if len(snps) > 0:
            if len(snps) > 1:
                print('More than one SNP at ', chrom, ':', loc38)
            alleles = snps[0][1]
            name = snps[0][0]
            match_count += 1
            if len(set(snps[0][2]).intersection(id_1KG)) > 0:
                val_id = '1KG'
                count1KG += 1
            else:
                val_id = 'Other'
        else:
            val_id = 'No match'
            name = None
            alleles = None
        col_alleles.append(alleles)
        col_name.append(name)
        col_val_id.append(val_id)
    assert var_details.shape[0] == len(col_val_id), 'Column mismatch.'
    var_details['alleles'] = pd.Series(col_alleles)
    var_details['name'] = pd.Series(col_name)
    var_details['val_id'] = pd.Series(col_val_id)
    LOGGER.log_message(str(loc_count), label='Variants read      = ')
    LOGGER.log_message(str(derived_mismatch_count),
                       label='Derived mismatches = ')
    LOGGER.log_message(str(match_count), label='Variants matched   = ')
    LOGGER.log_message(str(count1KG), label='1KG Variants       = ')
    filename = 'data/dnms_from_PRJEB21300_matched_' + job_no + '.csv'
    var_details.to_csv(filename)
    outfile = open(filename, 'r')
    LOGGER.output_file(outfile.name)
    outfile.close()

    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)".ljust(30))