示例#1
0
def main():
    """Write every genome window to its own FASTA file.

    Command line: ``-w/--windowsize`` (int) and ``-o/--outputdir`` are both
    required; the positional arguments are the input file names.

    For each window produced by ``genome_window_iter`` a file named
    ``<name>-<start>-<end>.fa`` (coordinates zero-padded to nine digits) is
    created in the output directory. It holds one record per input file,
    with ``file_base_name(file_name)`` as the header and that file's
    sequence for the window as the body.

    Raises:
        SystemExit: with status 2 (via ``parser.error``) when a required
            option is missing.
        ValueError: when the entries of a window do not all share the same
            name, start and end.
    """
    usage = """
%prog [options] [inputFileGlob [outputFile]]

"""
    parser = OptionParser(usage=usage, version="%prog 1.0")

    parser.add_option("-w",
                      "--windowsize",
                      dest="windowsize",
                      type="int",
                      default=None,
                      help="window size to write to different files")
    parser.add_option("-o",
                      "--outputdir",
                      dest="outputdir",
                      type="str",
                      default=None,
                      help="output dir")

    (options, args) = parser.parse_args()

    file_name_list = args

    # Explicit checks instead of assert: asserts vanish under ``python -O``
    # and would not show the user the usage message.
    if options.windowsize is None:
        parser.error("option -w/--windowsize is required")
    if options.outputdir is None:
        parser.error("option -o/--outputdir is required")

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(options.outputdir, exist_ok=True)

    # The returned meta data is not used in this function; the call is kept
    # in case loading it has side effects this script relies on.
    # NOTE(review): confirm whether this call can be dropped.
    simons_meta_data.get_meta_data()

    window_iter = genome_window_iter(*file_name_list,
                                     window_size=options.windowsize)

    for window in window_iter:

        names, starts, ends, _ = zip(*window)

        # Comparing each tuple with itself shifted by one is true only when
        # all of its elements are equal, i.e. every input file reports the
        # same window.
        if (names[1:] != names[:-1] or starts[1:] != starts[:-1]
                or ends[1:] != ends[:-1]):
            raise ValueError(
                "input files disagree on window name/start/end: "
                "{} {} {}".format(names, starts, ends))

        outfile = os.path.join(
            options.outputdir,
            "{}-{:09d}-{:09d}.fa".format(names[0], starts[0], ends[0]))

        with open(outfile, 'w') as f:
            # Window entries are paired with the input files by position.
            for (_, _, _, seq), file_name in zip(window, file_name_list):
                print(">{}\n{}\n".format(file_base_name(file_name), seq),
                      file=f)
示例#2
0
# Put the notebooks directory next to this script's directory on the import
# path; presumably that is where analysis_globals lives.
sys.path.insert(0, script_dir + '/../notebooks')
import analysis_globals

# Command-line interface. argparse derives each destination from the long
# flag (dashes become underscores), e.g. --dist-dir -> args.dist_dir.
parser = argparse.ArgumentParser()
parser.add_argument("--dist-dir", type=Path)
parser.add_argument("--meta-data-dir", type=Path)
parser.add_argument("--out-file", type=Path)
parser.add_argument("--dist-twice-out-file", type=Path)
# A store_true flag is False unless given on the command line.
parser.add_argument("--include-ust-ishim", action='store_true')
# Disabled options, kept for reference:
# parser.add_argument("--result-dir", dest="result_dir", type=Path)
# parser.add_argument("--result-file-prefix", dest="result_file_prefix", type=str, default='dist_data')
args = parser.parse_args()

# Meta data is loaded through the shared helper so that every script reads
# it the same way.
individuals, populations, regions = simons_meta_data.get_meta_data(
    include_ust_ishim=args.include_ust_ishim,
    meta_data_dir=args.meta_data_dir,
)


def optimize_data_frame(df, down_int='integer'):
    # down_int can be 'unsigned'
    
    converted_df = pandas.DataFrame()

    floats_optim = (df
                    .select_dtypes(include=['float'])
                    .apply(pandas.to_numeric,downcast='float')
                   )
    converted_df[floats_optim.columns] = floats_optim

    ints_optim = (df
示例#3
0
from pandas import DataFrame, Series

import simons_meta_data

# Resolve this script's real location and put the notebooks directory next
# to it on the import path; presumably that is where analysis_globals lives.
script_dir = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, script_dir + '/../notebooks')
import analysis_globals

# Two positional paths: the distance table to read and a second file path
# (not used within this part of the script).
parser = argparse.ArgumentParser()
for positional in ("dist_file", "dist_twice_file"):
    parser.add_argument(positional, type=Path)
args = parser.parse_args()

dist_data = pandas.read_hdf(args.dist_file)

individuals, populations, regions = simons_meta_data.get_meta_data(
    meta_data_dir=analysis_globals.meta_data_dir)

# Map every column ending in '_1' to its '_2' counterpart and vice versa.
opposite_suffix = {'_1': '_2', '_2': '_1'}
swap_dict = {}
for colname in dist_data.columns.values:
    for suffix, other in opposite_suffix.items():
        if colname.endswith(suffix):
            swap_dict[colname] = colname[:-2] + other

cols = ['start', 'end', 'indiv_1', 'indiv_2', 'dist']

# Stack the selected columns on top of a copy whose '_1'/'_2' columns are
# exchanged, so each row appears once in each orientation, then order the
# rows by indiv_1 and start with a fresh index.
forward = dist_data[cols]
mirrored = dist_data[cols].rename(columns=swap_dict)
stacked = pandas.concat([forward, mirrored])
dist_data_twice = stacked.sort_values(['indiv_1', 'start']).reset_index(drop=True)
示例#4
0

import simons_meta_data

individuals, populations, regions = simons_meta_data.get_meta_data()

# Write samples.ind: a leading 'Chimp' line, then one line per individual
# holding its id, sex code and population id separated by spaces.
# The with-block guarantees the file is flushed and closed, which the bare
# open() used previously never did.
with open('samples.ind', 'w') as f:
    print('Chimp', file=f)

    for indiv in individuals:
        chromotype = individuals[indiv]['Genetic sex assignment']
        # Anything other than 'XY' is written as 'F'.
        sex = 'M' if chromotype == 'XY' else 'F'
        pop = individuals[indiv]['Population ID']
        print(indiv, sex, pop, file=f)

        
import gc

import simons_meta_data
from hg19_chrom_sizes import hg19_chrom_sizes as chromosome_lengths

# Command-line interface. argparse derives each destination from the long
# flag (dashes become underscores), e.g. --dist-dir -> args.dist_dir.
parser = argparse.ArgumentParser()
parser.add_argument("--dist-dir", type=Path)
parser.add_argument("--meta-data-dir", type=Path)
parser.add_argument("--out-file", type=Path)
# Disabled options, kept for reference:
# parser.add_argument("--result-dir", dest="result_dir", type=Path)
# parser.add_argument("--result-file-prefix", dest="result_file_prefix", type=str, default='dist_data')
args = parser.parse_args()

# Meta data is loaded through the shared helper so that every script reads
# it the same way.
individuals, populations, regions = simons_meta_data.get_meta_data(
    meta_data_dir=args.meta_data_dir,
)

def optimize_data_frame(df, down_int='integer'):
    # down_int can be 'unsigned'
    
    converted_df = pandas.DataFrame()

    floats_optim = (df
                    .select_dtypes(include=['float'])
                    .apply(pandas.to_numeric,downcast='float')
                   )
    converted_df[floats_optim.columns] = floats_optim

    ints_optim = (df
                    .select_dtypes(include=['int'])
                    .apply(pandas.to_numeric,downcast=down_int)