Example #1
def conservation_score(f_chromsizes, d_phastcons, in_gff, out_avcons):
    tmp = random_string(12)

    d_split = out_avcons + 'gff_by_chromosome_'+tmp
    d_cons =  out_avcons + 'conservation_'+tmp

    for d in (d_split, d_cons):
        if not os.path.exists(d):
            os.makedirs(d)

    f_cons      = out_avcons + 'conservation.txt'
    f_aver_cons = out_avcons

    ## get chromosomes
    chromosomes = []
    with open(f_chromsizes) as f:
        for l in f:
            c = l.split('\t')[0]
            if (('random' not in c) and ('chrM' not in c) and ('chrUn' not in c)):
                chromosomes.append(c[3:])

    ## separate infile by chromosome
    for c in chromosomes:
        f_out = os.path.join(d_split, 'tss_filtered_all_'+c+'.gff')
        with open(f_out, 'w') as out:
            with open(in_gff) as f:
                for line in f:
                    chrom = line.split('\t')[0]
                    if (chrom == c):
                        out.write(line)

    ## calculate conservation per chromosome
    _conservation(chromosomes, d_split, d_cons, d_phastcons)

    ## merge chromosomes
    os.system("cat "+d_cons+"/conservation_all_*txt > "+f_cons)

    ## get average conservation
    _average_conservation(f_cons, f_aver_cons)

    ## cleanup
    is_same = []
    for c in chromosomes:
        n_gff = line_count(os.path.join(d_split, 'tss_filtered_all_'+c+'.gff'))
        n_con = line_count(os.path.join(d_cons,  'conservation_all_'+c+'.txt'))
        is_same.append(n_gff == n_con)
    if all_same(is_same):
        os.system('rm -r %s %s %s' % (d_split, d_cons, f_cons))
    else:
        not_equal = [chromosomes[i] for i,v in enumerate(is_same) if not v]
        sys.exit('Error: Total number of positions does not match for chr: ' + ' '.join(not_equal))

    ## sort average conservation
    sorted_avcons = out_avcons + '.sorted.tmp'
    cmd = "sort -k1,2 -n "+out_avcons+" > "+sorted_avcons
    os.system(cmd)
    return sorted_avcons
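
Every snippet on this page calls a line_count helper imported from its project's own utils module, and Example #1 additionally relies on an all_same helper; none of the examples include those implementations. A minimal sketch of what such helpers are assumed to look like:

# Minimal sketches of the assumed helpers; the real implementations live in the
# projects' own utils modules and are not shown on this page.
def line_count(path):
    """Count the lines in a text file."""
    with open(path) as f:
        return sum(1 for _ in f)

def all_same(items):
    """Return True when every element of `items` is equal (True for an empty sequence)."""
    return len(set(items)) <= 1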
Example #2
    def load_main_file(self):
        """ Start asynchronously loading stored filename. If we land in this method we are assured that the store filename is valid. 
            When loading is done, method file_loaded_cb is called"""

        # Lock all inputs, clear the plot
        self.rnd_doc_btn.config(state="disabled")
        self.ready_lb.config(bg="red")
        self.dp_plt_wg.config(state="disabled")
        utils.Plotting.reset_plot(self.mpl_ax)
        self.canvas.draw()
        self.txt_entry_doc.delete(0, END)
        self.txt_entry_doc.insert(0, self.text_entry_default)
        self.dp_plt_var.set('None')
        self.status_var.set("Loading data file")

        lines = utils.line_count(self.model.filename)
        self.progressbar["maximum"] = ceil(
            lines /
            utils.CHUNK_SIZE)  # int(X) + 1 doesn't work for whole values
        self.file_info_var.set("{0} entries".format(lines))
        self.model.load_main_file_async(
            {
                'filename': self.model.filename,
                'linecount': lines
            },
            callback=self.file_loaded_cb,
            pg_val=self.pg_val)
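
The comment next to the progressbar maximum is the reason ceil is used: when the line count is an exact multiple of the chunk size, int(x) + 1 counts one chunk too many. A small self-contained check (CHUNK_SIZE here is a made-up value standing in for the project's utils.CHUNK_SIZE):

from math import ceil

CHUNK_SIZE = 1000   # hypothetical value standing in for utils.CHUNK_SIZE
lines = 3000        # an exact multiple of the chunk size

print(int(lines / CHUNK_SIZE) + 1)  # 4 -> one chunk too many
print(ceil(lines / CHUNK_SIZE))     # 3 -> the intended number of chunks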
Example #3
def build_features_matrix(sorted_gff, sorted_cpg, sorted_avcons, sorted_tata, f_out):
    ## check that all in files contain same number of data lines
    n_g = line_count(sorted_gff)
    n_c = line_count(sorted_cpg)
    n_a = line_count(sorted_avcons)
    n_t = line_count(sorted_tata)
    if not all_same([n_g, n_c, n_a, n_t]):
        sys.exit('Error: line counts of the feature files are not all equal: %s,%s,%s,%s' %
                 (n_g, n_c, n_a, n_t))

    ## create matrix
    lcount = 0
    with open(f_out, 'w') as out:
        with open(sorted_gff) as f:
            for l in f:
                lcount += 1

                l = l.strip().split('\t')
                c      = l[0]
                region_up   = l[3] #500bp   upstream of start; not used
                region_down = l[4] #500bp downstream of start; not used
                count  = l[5]
                strand = l[6]

                info = l[8].split(';')
                #dist_score = '?'

                peak_start = get_value_from_keycolonvalue_list('start', info)
                peak_stop  = get_value_from_keycolonvalue_list('stop', info)

                CpG_value    = linecache.getline(sorted_cpg,lcount).strip().split('\t')[3]
                try:
                    conservation = linecache.getline(sorted_avcons,lcount).strip().split('\t')[2]
                except IndexError:  # line missing or too short in the conservation file
                    conservation = '0'

                affinity     = linecache.getline(sorted_tata,lcount).strip().split('\t')[7]

                features = ';'.join(['cpg:'+CpG_value, 'cons:'+conservation, 'tata:'+affinity])
                new_info = ';'.join(['region_start:'+region_up, 'region_stop:'+region_down])
                line = '\t'.join([c, l[1], l[2],
                                  peak_start, peak_stop, count, strand,
                                  features, new_info])
                out.write(line + '\n')
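
Example #3 walks the sorted GFF with a running line counter and pulls the matching record from each companion file through linecache, while get_value_from_keycolonvalue_list extracts fields from the semicolon-separated key:value info column. That helper is not shown in any of these snippets either; a minimal sketch of what it is assumed to do:

# Sketch of the assumed key:value lookup (not the project's actual implementation).
def get_value_from_keycolonvalue_list(key, keycolonvalue_list):
    """Return the value paired with `key` in entries shaped like 'key:value'."""
    for entry in keycolonvalue_list:
        k, _, v = entry.partition(':')
        if k == key:
            return v
    return ''

info = 'start:1000;stop:1500'.split(';')
print(get_value_from_keycolonvalue_list('start', info))  # 1000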
Example #4
    def __init__(self, name, work_dir, data_dir, output_root):
        super(Configuration, self).__init__()
        # file related params
        self.work_dir = work_dir
        self.data_dir = data_dir
        self.vocab_path = os.path.join(self.data_dir, 'vocabulary')
        self.embed_path = os.path.join(self.data_dir, 'glove_embedding.npy')
        self.vocab_size = line_count(self.vocab_path, skip_empty=True)

        with setups(self):
            with immutables(self):
                self.start_time = current_datetime()
                self.name = name
                self.output_dir = os.path.join(output_root, name)
                self.train_path = os.path.join(self.data_dir, 'trainyiseg.csv')
                self.train_eval_path = os.path.join(self.data_dir, 'trainyiseg_eval.csv')
                self.valid_path = os.path.join(self.data_dir, 'validyiseg.csv')
                self.test_path = os.path.join(self.data_dir, 'testayiseg.csv')
                self.model_path = os.path.join(self.output_dir, 'model')
                self.elmo_path = os.path.join(self.data_dir, 'elmo', 'model')
                self.num_aspects = 20
                self.visible_gpus = '0'  # visible GPUs
                self.num_gpus = len(self.visible_gpus.split(','))
                os.environ['CUDA_VISIBLE_DEVICES'] = self.visible_gpus

        with structures(self):
            self.hidden_size = 512
            self.embed_size = 300
            self.atn_units = 300
            self.num_layers = 1
            self.use_elmo = False

        with learning(self):
            # training process params
            self.load_embed = True
            self.keep_prob = 0.65
            self.rnn_kernel_keep_prob = 0.8
            self.max_epoch = 50
            self.grad_clip_max_norm = 5.0
            self.early_stop_epoch = 10

            # input params
            self.batch_size = 64
            self.eval_batch_size = 64
Example #5
    def execute_task(self, args):
        """ Description : Stores sanitized argument variables and dispatches the task to the relevant method
            Parameters  : Parsed CLI arguments
        """
        self.args = args
        if self.model.check_file_validity(args.input_file):
            self.load_main_file({
                'filename': args.input_file,
                'linecount': utils.line_count(str(args.input_file))
            })
            self.doc = self.get_doc(args.doc_uuid, args.task_id)
            self.user = self.get_user(args.user_uuid)
            if args.task_id in ['4d', '5']:
                self.sort = self.get_sort(args.sort)
            task_func = self.dispatch.get(args.task_id)
            task_func()
        else:
            raise utils.InvalidArgumentError("Input file is not valid !")
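
execute_task resolves its handler from a self.dispatch mapping built elsewhere in the class, so an unknown task_id would leave task_func as None and fail on the call. A minimal, self-contained sketch of that dispatch-table pattern (the class and task names below are illustrative, not the project's):

class TaskRunner:
    """Illustrative dispatch-table pattern, not the actual controller class."""

    def __init__(self):
        self.dispatch = {'4d': self.task_4d, '5': self.task_5}

    def task_4d(self):
        print('running task 4d')

    def task_5(self):
        print('running task 5')

    def run(self, task_id):
        task_func = self.dispatch.get(task_id)
        if task_func is None:
            raise ValueError('unknown task id: %s' % task_id)
        task_func()

TaskRunner().run('5')  # prints "running task 5"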
Example #6
def load_data(base_name, plevel, ulevel, hlength, sv=False):
    """
    Load and pre-format the Foodmart data (products, customers and user sessions).

    Args:
     * ``base_name`` (*str*): path to the main data folder.
     * ``plevel`` (*int*): level parameter for the product clustering.
     * ``hlength`` (*int*): history length.
     * ``sv`` (*bool, optional*): if True, store the computed informations in .items, .profiles, .train and .test

    Returns:
     * ``product_to_cluster`` (*ndarray*): maps a productID to a clusterID. Note 0 -> -1 is the empty selection.
     * ``customer_to_cluster`` (*ndarray*): maps a customerID to a clusterID.
    """

    # Init output folder
    output_base = None  # only populated when results are saved (sv=True)
    if sv:
        output_base = init_output_dir(plevel, ulevel, hlength)


    ###### Load and Cluster items
    #########################################################################

    print("\n\033[92m-----> Load and Cluster products\033[0m")
    product_to_cluster = np.zeros(line_count(load_datafile(base_name, "product.csv")) + 1, dtype=int)      # Product ID -> Cluster ID
    tmp_index = {}                          # Cluster name -> Cluster ID
    tmp_clusters = defaultdict(lambda: [])  # Cluster name -> Product ID list

    # Load product list
    if plevel == 0:
        f = load_datafile(base_name, "product.csv")
        r = csv.reader(f)
        next(r)
        for product in r:
            tmp_clusters[product[3]].append(int(product[1]))
            try:
                product_to_cluster[int(product[1])] = tmp_index[product[3]]
            except KeyError:
                tmp_index[product[3]] = len(tmp_index) + 1
                product_to_cluster[int(product[1])] = tmp_index[product[3]]
        f.close()

    else:
        # Load product categories
        product_classes = {}
        f = load_datafile(base_name, "product_class.csv")
        r = csv.reader(f)
        next(r)
        for categories in r:
            product_classes[int(categories[0])] = categories[plevel]
        f.close()

        # Cluster products
        f = load_datafile(base_name, "product.csv")
        r = csv.reader(f)
        next(r)
        for product in r:
            try:
                product_to_cluster[int(product[1])] = tmp_index[product_classes[int(product[0])]]
            except KeyError:
                tmp_index[product_classes[int(product[0])]] = len(tmp_index) + 1
                product_to_cluster[int(product[1])] = tmp_index[product_classes[int(product[0])]]
            tmp_clusters[product_classes[int(product[0])]].append(int(product[1]))
        f.close()

    # Print summary
    print("   %d product profiles (%d products)" % (len(tmp_index), (len(product_to_cluster) - 1)))
    print('\n'.join("     > %s: %.2f%%" % (k, 100 * float(len(v)) / (len(product_to_cluster) - 1)) for k, v in iteritems(tmp_clusters)))
    actions = sorted(itervalues(tmp_index))
    product_to_cluster[0] = 0 # Empty selection

    # Init states
    print("\n\033[92m-----> [Optional] Export states description\033[0m")
    init_base_writing(len(actions), args.history)
    if sv:
        rv_tmp_indx = {v: k for k, v in tmp_index.items()}
        rv_tmp_indx[0] = str(chr(35))
        with open("%s.states" % output_base, 'w') as f:
            f.write('\n'.join("%f\t%s" % (x, '|'.join(rv_tmp_indx[y] for y in id_to_state(x))) for x in xrange(get_nstates(len(actions), args.history))))

    ###### Load and Store user sessions
    #########################################################################

    print("\n\033[92m-----> Load user sessions and shop profits \033[0m")
    user_sessions = defaultdict(lambda: [0] * hlength)

    # Load session
    f = load_datafile(base_name, "sales.csv")
    r = csv.reader(f)
    next(r)
    for sale in r:
        product_clusterID = product_to_cluster[int(sale[0])]
        user_sessions[int(sale[2])].append(product_clusterID)
    f.close()

    # Save product clusters information
    if sv:
        with open("%s.items" % output_base, 'w') as f:
            f.write('\n'.join("%d\t%s\t%d" %(tmp_index[k], k, len(tmp_clusters[k])) for k in sorted(tmp_index.keys(), key=lambda x: tmp_index[x])))

    # Return values
    return product_to_cluster, user_sessions, actions, output_base
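
None of the snippets show how load_data is called; a hypothetical invocation might look like the following (the data path and the level/history values are assumptions for illustration only):

# Hypothetical call; 'Foodmart/data' and the parameter values are made up.
product_to_cluster, user_sessions, actions, output_base = load_data(
    'Foodmart/data', plevel=4, ulevel=0, hlength=2, sv=False)
print(len(actions), 'product clusters')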
Example #7
from pathlib import Path

data_dir_path = Path('data')
file_names = [x.name for x in data_dir_path.glob('*') if x.is_file()]

# prints the size (in bytes) of each dataset
import os
print('---Size of each dataset---\n')
print('Super Bowl History:')
print(os.path.getsize('data/datasets_superbowl.csv'))
print('\nSuper Bowl Ads:')
print(os.path.getsize('data/datasets_superbowl-ads.csv'))

# prints the number of rows in each dataset
from utils import line_count
print("---Number of rows from each dataset---\n")
print("Super Bowl History:")
line_num = line_count('data/datasets_superbowl.csv')
print(line_num)
print("\nSuper Bowl Ads:")
line_num = line_count('data/datasets_superbowl-ads.csv')
print(line_num)

#prints all columns and one row from each dataset
from utils import head
print("All columns and one row from each dataset\n")
print("Super Bowl History:")
print(head('data/datasets_superbowl.csv', 2))
print("\nSuper Bowl Ads:")
print(head('data/datasets_superbowl-ads.csv', 2))

#-------------------------------------------------------#
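
Besides line_count, this snippet imports a head helper from the same local utils module; its implementation is not shown anywhere on this page. A minimal sketch of what it is assumed to do:

from itertools import islice

# Sketch of the assumed utils.head helper: return the first n lines of a file.
def head(path, n):
    with open(path) as f:
        return ''.join(islice(f, n))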
Example #8
import os
import pickle
from ast import literal_eval
from collections import Counter

import ipdb
import jsonlines
import numpy as np
import redis
from tqdm import tqdm

from utils import line_count

rd = redis.StrictRedis()

raw_dict = open('data/resource.txt', 'r').read()
raw_dict = literal_eval(raw_dict)
entity_lst = raw_dict['csk_entities']

if not os.path.isfile('freq_dict.pkl'):
    freq_dict = Counter()

    num_lines = line_count('data/trainset.jsonl')
    with jsonlines.open('data/trainset.jsonl', mode='r') as reader:
        for _, line in zip(tqdm(range(num_lines)), reader):
            freq_dict.update(line['post'])
            freq_dict.update(line['response'])

    with open('freq_dict.pkl', 'wb') as f:
        pickle.dump(freq_dict, f)

with open('freq_dict.pkl', 'rb') as f:
    freq_dict = pickle.load(f)


def is_in_khop(k_exp):
    reply = rd.execute_command(
        'GRAPH.QUERY', 'CCM',
Example #10
    def init_data(self, data_name, n_chunk=1024):
        print(f'Initializing {data_name} data...')

        def transform_triple_to_hrt(triple_idx):
            """ Transforms triple-idx (as a whole) to h/r/t format """
            if triple_idx == -1:  # for response_triple
                return NAF_TRIPLE
            triple = self.idx2triple[triple_idx]
            h, r, t = triple.split(', ')
            return [self.word2idx[h], self.rel2idx[r], self.word2idx[t]]

        def process_file(root, inp):
            start_i, filename = inp
            n_sample = line_count(filename)

            post = np.zeros((n_sample, self.args.max_sentence_len),
                            dtype=np.int32)
            post_length = np.zeros(
                (n_sample), dtype=np.int32)  # valid length (without pad)
            response = np.zeros((n_sample, self.args.max_sentence_len),
                                dtype=np.int32)
            response_length = np.zeros((n_sample), dtype=np.int32)
            # post_triple = np.zeros((n_sample, self.args.max_sentence_len), dtype=np.int32)
            triple = np.zeros((n_sample, self.args.max_sentence_len,
                               self.args.max_triple_len, 3),
                              dtype=np.int32)
            entity = np.zeros((n_sample, self.args.max_sentence_len,
                               self.args.max_triple_len),
                              dtype=np.int32)
            response_triple = np.zeros(
                (n_sample, self.args.max_sentence_len, 3), dtype=np.int32)

            max_post_len, max_response_len, max_triple_len = 0, 0, 0

            with jsonlines.open(filename) as df:
                for i, line in enumerate(df):

                    pl, rl = len(line['post']) + 2, len(line['response']) + 2
                    post_length[i] = pl
                    response_length[i] = rl

                    max_post_len = max(pl, max_post_len)
                    max_response_len = max(rl, max_response_len)
                    max_triple_len = max([len(l)
                                          for l in line['all_triples']] +
                                         [max_triple_len])

                    all_triples = [
                        line['all_triples'][i - 1] if i > 0 else [-1]
                        for i in line['post_triples']
                    ]

                    post[i, :pl] = [SOS_IDX] + [
                        self.get_word_idx(p) for p in line['post']
                    ] + [EOS_IDX]
                    response[i, :rl] = [SOS_IDX] + [
                        self.get_word_idx(r) for r in line['response']
                    ] + [EOS_IDX]
                    # post_triple[i, 1:pl-1] = np.array(line['post_triples']) # [0, 0, 1, 0, 2...]
                    response_triple[i, :rl] = [NAF_TRIPLE] + [
                        transform_triple_to_hrt(rt)
                        for rt in line['response_triples']
                    ] + [NAF_TRIPLE]

                    # put NAF_TRIPLE/entity at index 0
                    triple[i] = pad_2d(
                        [[NAF_TRIPLE]] +
                        [[transform_triple_to_hrt(t) for t in triples]
                         for triples in all_triples] + [[NAF_TRIPLE]],
                        length=(self.args.max_sentence_len,
                                self.args.max_triple_len, 3))
                    entity[i] = pad_2d(
                        [[NAF_IDX]] +
                        [[self.entidx2wordidx[e] for e in entities]
                         for entities in line['all_entities']] + [[NAF_IDX]],
                        length=(self.args.max_sentence_len,
                                self.args.max_triple_len))

                # dump to zarr
                root['post'][start_i:start_i + n_sample] = post
                root['post_length'][start_i:start_i + n_sample] = post_length
                root['response'][start_i:start_i + n_sample] = response
                root['response_length'][start_i:start_i +
                                        n_sample] = response_length
                # root['post_triple'][start_i : start_i+n_sample] = post_triple
                root['triple'][start_i:start_i + n_sample] = triple
                root['entity'][start_i:start_i + n_sample] = entity
                root['response_triple'][start_i:start_i +
                                        n_sample] = response_triple

            return max_post_len, max_response_len, max_triple_len

        toread = [
            f'{self.data_path}/{data_name}set_pieces/{piece}'
            for piece in os.listdir(f'{self.data_path}/{data_name}set_pieces')
        ]
        n_lines = sum([line_count(piece) for piece in toread])
        init_n_lines = math.ceil(
            n_lines /
            n_chunk) * n_chunk  # avoid the error raised when the last piece ends up smaller than the specified chunk size

        root = zarr.open(f'{self.data_path}/{data_name}set_new.zarr', mode='w')
        post = root.zeros('post',
                          shape=(init_n_lines, self.args.max_sentence_len),
                          chunks=(n_chunk, None),
                          dtype='i4')
        post_length = root.zeros('post_length',
                                 shape=(init_n_lines, ),
                                 chunks=(n_chunk, ),
                                 dtype='i4')  # valid length (without pad)
        response = root.zeros('response',
                              shape=(init_n_lines, self.args.max_sentence_len),
                              chunks=(n_chunk, None),
                              dtype='i4')
        response_length = root.zeros('response_length',
                                     shape=(init_n_lines, ),
                                     chunks=(n_chunk, ),
                                     dtype='i4')
        post_triple = root.zeros('post_triple',
                                 shape=(init_n_lines,
                                        self.args.max_sentence_len),
                                 chunks=(n_chunk, None),
                                 dtype='i4')
        triple = root.zeros('triple',
                            shape=(init_n_lines, self.args.max_sentence_len,
                                   self.args.max_triple_len, 3),
                            chunks=(n_chunk, None, None, None),
                            dtype='i4')
        entity = root.zeros('entity',
                            shape=(init_n_lines, self.args.max_sentence_len,
                                   self.args.max_triple_len),
                            chunks=(n_chunk, None, None),
                            dtype='i4')
        response_triple = root.zeros('response_triple',
                                     shape=(init_n_lines,
                                            self.args.max_sentence_len, 3),
                                     chunks=(n_chunk, None, None),
                                     dtype='i4')

        pool = Pool(min(len(toread), mp.cpu_count()))
        func = functools.partial(process_file, root)
        iterinp = [(i * self.args.data_piece_size, filename)
                   for i, filename in enumerate(toread)]
        max_post_lens, max_response_lens, max_triple_lens = zip(
            *tqdm(pool.imap(func, iterinp), total=len(iterinp)))

        max_post_len, max_response_len, max_triple_len = max(
            max_post_lens), max(max_response_lens), max(max_triple_lens)

        # trim remaining space
        post.resize(n_lines, max_post_len)
        post_length.resize(n_lines)
        response.resize(n_lines, max_response_len)
        response_length.resize(n_lines)
        post_triple.resize(n_lines, max_post_len)
        triple.resize(n_lines, max_post_len, max_triple_len, 3)
        entity.resize(n_lines, max_post_len, max_triple_len)
        response_triple.resize(n_lines, max_response_len, 3)

        print(
            f'Dumped {data_name} at: {self.data_path}/{data_name}set_new.zarr')
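
The per-file work in init_data is parallelised by binding the shared zarr root with functools.partial and streaming the inputs through Pool.imap. A minimal, self-contained sketch of that pattern with a toy worker (not the real process_file):

import functools
import multiprocessing as mp

def worker(scale, inp):
    # stands in for process_file(root, inp); inp mirrors the (start_i, filename) pairs
    start_i, value = inp
    return scale * value + start_i

if __name__ == '__main__':
    iterinp = [(i * 10, v) for i, v in enumerate([1, 2, 3])]
    func = functools.partial(worker, 100)
    with mp.Pool(2) as pool:
        print(list(pool.imap(func, iterinp)))  # [100, 210, 320]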
Example #11
def main(files, outdir, N, percent_lib, is_get_id, f_config, verbose=False):
    if os.path.isdir(outdir):
        sys.exit('## ERROR: "%s" already exists' % outdir)

    cparser = SafeConfigParser()
    cparser.read(f_config)
    verbose = True

    f_mirbasegff = cparser.get('mirbase', 'gff2')
    f_chromsizes = cparser.get('genome', 'chromsizes')
    f_repeats = cparser.get('genome', 'repeats')
    f_ensembl = cparser.get('genome', 'ensemblgtf')
    f_fasta = cparser.get('genome', 'fasta')
    d_phastcons = cparser.get('cons', 'phastcons')
    TRAP = cparser.get('tata', 'trap')
    f_psemmatrix = cparser.get('tata', 'psem')
    f_traincfg = cparser.get('configs', 'tcconfig')
    m_mirna = cparser.get('correlation', 'srnaseqmatrix')
    m_tss = cparser.get('correlation', 'cageseqmatrix')
    corrmethod = cparser.get('correlation', 'corrmethod')

    f_trainingset = os.path.join(outdir, 'TrainingSet.gff')
    outdir1 = f_trainingset + '_intermediates'

    ensure_dir(outdir, False)
    ensure_dir(outdir1, False)

    _files = glob.glob(files)

    ## creating auxiliary file for negative set
    f_fiveprimegff = '../data/hsa.five_prime.gff'
    if not os.path.exists(f_fiveprimegff):
        if verbose:
            print 'STATUS: creating "%s" auxiliary file...' % f_fiveprimegff
        extract_tss_from_ensembl(f_ensembl, f_fiveprimegff)

    ## create training set
    gff_ts_pos = os.path.join(outdir1, 'trainingset_pos.gff')
    gff_ts_neg = os.path.join(outdir1, 'trainingset_neg.gff')
    if verbose: print 'STATUS: creating positive candidate set...'
    create_positiveset(percent_lib, _files, f_mirbasegff, N, gff_ts_pos,
                       is_get_id)
    if verbose: print 'STATUS: creating negative candidate set...'
    create_negativeset(f_chromsizes, f_repeats, f_fiveprimegff, f_traincfg, N,
                       gff_ts_neg)

    shutil.move(os.path.join(outdir1, 'tc-norm_negSet'),
                os.path.join(outdir, 'tc-norm_negSet'))

    ## feature extraction: cpg, cons, tata (features.py)
    if verbose: print 'STATUS: extracting features cpg/cons/tata...'
    gff_1kbfeatures_pos = os.path.join(outdir1, 'features1kb_ts_pos.gff')
    gff_1kbfeatures_neg = os.path.join(outdir1, 'features1kb_ts_neg.gff')

    features.main(gff_ts_pos, outdir1, f_fasta, f_chromsizes, d_phastcons,
                  TRAP, f_psemmatrix, gff_1kbfeatures_pos)

    features.main(gff_ts_neg, outdir1, f_fasta, f_chromsizes, d_phastcons,
                  TRAP, f_psemmatrix, gff_1kbfeatures_neg)

    ## feature extraction: mirna_proximity
    if verbose: print 'STATUS: extracting features mirna_proximity...'
    gff_mirnaprox_pos = os.path.join(outdir1, 'featureMprox_ts_pos.gff')
    gff_mirnaprox_neg = os.path.join(outdir1, 'featureMprox_ts_neg.gff')
    mirna_proximity.main(gff_ts_pos, f_mirbasegff, gff_mirnaprox_pos)
    mirna_proximity.main(gff_ts_neg, f_mirbasegff, gff_mirnaprox_neg)

    gff_features_pos = os.path.join(outdir1, 'Features_ts_pos.gff')
    gff_features_neg = os.path.join(outdir1, 'Features_ts_neg.gff')
    gff_unify_features.main(gff_1kbfeatures_pos, gff_mirnaprox_pos,
                            'mirna_prox', '0', gff_features_pos, True)
    gff_unify_features.main(gff_1kbfeatures_neg, gff_mirnaprox_neg,
                            'mirna_prox', '0', gff_features_neg, True)

    ## create final training set ...
    ## where background must pass criteria: cpg <= 0.5 and cons <= 0.2 and tata <= 0.1 and mirna_prox == 0
    if verbose: print 'STATUS: creating final training set...'
    good_background = gff_features_neg + '_cpglt0.5-conslt0.2-tatalt0.1-mproxeq0.gff'
    with open(good_background, 'w') as out:
        with open(gff_features_neg) as f:
            for line in f:
                info = line.strip().split('\t')[7].split(';')
                cpg = float(get_value_from_keycolonvalue_list('cpg', info))
                cons = float(get_value_from_keycolonvalue_list('cons', info))
                tata = float(get_value_from_keycolonvalue_list('tata', info))
                mprx = float(
                    get_value_from_keycolonvalue_list('mirna_prox', info))

                if cpg <= 0.5 and cons <= 0.2 and tata <= 0.1 and mprx == 0:
                    out.write(line)

    wc = line_count(good_background)
    selectedlines = random.sample(range(1, wc + 1), N)

    with open(f_trainingset, 'w') as out:
        ## writing negative set
        for l in selectedlines:
            out.write(linecache.getline(good_background, l))

        ## writing positive set
        with open(gff_features_pos) as f:
            ## when mirna_prox extraction feature was used,
            ## extracted all pairs within 50kb upstream mirna
            ## -> single tss could have many mirna
            ## take pair with min distance
            ## -> essential first entry
            pos_list = []
            for line in f:
                l = line.split('\t')
                pos = ','.join([l[0], l[3], l[4], l[6]])
                if not (pos in pos_list):
                    pos_list.append(pos)
                    out.write(line)

    if not (os.path.isfile(m_mirna) and os.path.isfile(m_tss)):
        return f_trainingset

    ## create final training set with feature:correlation of closest tss->miRNA ...
    if verbose:
        print 'STATUS: creating final training set with correlation of closest tss->miRNA...'
    f_trainingset2 = os.path.join(outdir, 'TrainingSet-corr.gff')
    m_back = glob.glob('%s/tc-norm_negSet/*tpm_rle.matrix' % outdir)[0]
    f_tcfilesinput = os.path.join(outdir, 'tc-norm_negSet', 'files.txt')

    feature_closest_corr(f_trainingset, f_mirbasegff, m_mirna, m_tss, m_back,
                         f_tcfilesinput, corrmethod, f_trainingset2)

    return f_trainingset2