Example #1
def cast_data(header, tablename, data):
    typedict = get_typedict(tablename)
    type_casters = []
    for i in range(len(header)):
        sql_type = typedict[header[i]]
        if sql_type == text_type:
            # Encode text columns to UTF-8 bytes (avoid shadowing built-in str).
            type_casters.append(lambda s: s.encode('UTF-8'))
        elif sql_type == int_type:
            type_casters.append(int)
        elif sql_type == date_type:
            type_casters.append(timestamp_parser.parse)

    log('casting data for ' + str(len(data)) + " rows")

    def cast_line(dataln):
        # Apply each column's caster to the matching field of one row.
        cast_cols = []
        for col_id in range(len(dataln)):
            cast_cols.append(type_casters[col_id](dataln[col_id]))
        return cast_cols

    # cast_line is a nested function, which a process pool cannot pickle;
    # this only runs if Pool is a thread pool (multiprocessing.dummy.Pool).
    tpool = Pool(processes=6)
    ret = tpool.map(cast_line, data)
    tpool.close()
    return ret
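Note: a process-based Pool has to pickle the mapped function, and nested functions like cast_line above are not picklable, so this pattern only works with a thread pool. A minimal sketch of the distinction, assuming the thread-backed drop-in from the standard library:

from multiprocessing.dummy import Pool  # thread pool, same API as multiprocessing.Pool

def demo():
    def double(x):  # nested function: fine for threads, unpicklable for processes
        return x * 2
    with Pool(4) as pool:
        return pool.map(double, [1, 2, 3])

print(demo())  # [2, 4, 6]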
Example #2
 def runMultiProcessTrajectories(self, repeat):
     pool = Pool(processes=len(self.posIni))
     result = pool.map(partial(self.runNtrajectory, repeat=repeat),
                       [(x, y) for x, y in self.posIni])
     pool.close()
     pool.join()
     meanCost, meanTraj = 0., 0.
     for Cost, traj in result:
         meanCost += Cost
         meanTraj += traj
     size = len(result)
     return meanCost / size, meanTraj / size
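Example #2 (and #3 below, which is the same pattern) uses functools.partial to freeze the repeat keyword so pool.map can feed each (x, y) start position as the single remaining argument. A self-contained sketch of that binding; run_one is a hypothetical stand-in for runNtrajectory, not the original code:

from functools import partial
from multiprocessing import Pool

def run_one(pos, repeat):
    x, y = pos
    return x * repeat, y * repeat

if __name__ == '__main__':
    with Pool(2) as pool:
        print(pool.map(partial(run_one, repeat=3), [(1, 2), (4, 5)]))
    # [(3, 6), (12, 15)]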
Example #3
 def runMultiProcessTrajectories(self, repeat):
     pool = Pool(processes=len(self.posIni))
     result = pool.map(partial(self.nTraj, repeat=repeat),
                       [(x, y) for x, y in self.posIni])
     pool.close()
     pool.join()
     meanCost, meanTraj = 0, 0
     for CostCMAES, traj in result:
         meanCost += CostCMAES
         meanTraj += traj
     size = len(result)
     return meanCost / size, meanTraj / size
Example #4
    def filter(dirty_data):
        log("starting filter")
        tpool = Pool(processes=cpus)
        ret = []
        log("filtering deleted and not english")
        for line in tpool.map(Filter.__is_not_deleted_or_not_non_english,
                              dirty_data):
            if line[1]:
                ret.append(line[0])

        def clean_links_and_punctuation(comment):
            # Filter links word by word, then rejoin into a single comment.
            words = comment.split(" ")
            words = list(map(Filter.__filter_links, words))
            return " ".join(words)

        log("filtering links and punctuation")
        ret = tpool.map(clean_links_and_punctuation, ret)
        tpool.close()
        log("filter done")
        return ret
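Example #4 reuses one pool for two passes: a tagging pass whose (value, keep) pairs are filtered in the parent, then a transform pass over the survivors. A minimal sketch of that two-pass shape, with hypothetical stand-in predicates and a thread pool so the nested lambda is mappable:

from multiprocessing.dummy import Pool  # threads, so local functions are fine

def two_pass(lines):
    with Pool(4) as pool:
        # Pass 1: tag each line, keep only the survivors.
        tagged = pool.map(lambda s: (s, not s.startswith("#")), lines)
        kept = [s for s, keep in tagged if keep]
        # Pass 2: transform the survivors with the same pool.
        return pool.map(str.upper, kept)

print(two_pass(["keep me", "# drop me"]))  # ['KEEP ME']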
Example #5
 def run(self):
     nproc = PipeEnum.PARALLEL_N_PROCS.value
     nchunks = PipeEnum.PARALLEL_N_CHUNKS.value
     if nproc in self.kwargs:
         n_processes = self.kwargs[nproc]
         chunks = self.kwargs.get(nchunks, 1)
         # pool.map already returns a list; the with block also shuts the pool down.
         with Pool(n_processes) as pool:
             self.output = pool.map(
                 self.map_function, self.reader.data, chunksize=chunks)
     else:
         self.output = self.map_function(self.reader.data)
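Example #5 forwards a chunksize to pool.map; batching items into larger chunks cuts inter-process messaging overhead when each call is cheap. A small sketch, assuming nothing beyond the standard library:

from multiprocessing import Pool

def square(x):
    return x * x

if __name__ == '__main__':
    with Pool(4) as pool:
        # Items are shipped to workers 1000 at a time instead of one by one.
        out = pool.map(square, range(10000), chunksize=1000)
    print(out[:5])  # [0, 1, 4, 9, 16]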
Example #6
def clean_data():
    rows_per_loop = 100000
    log("")
    log("starting")

    dirty_db_path = ROOTDIR + dir_sep + "stage_2_clean.db"
    clean_db_path = ROOTDIR + dir_sep + "stage_3_cleaner.db"
    dirty_db_cursor = create_connection(dirty_db_path).cursor()
    clean_db = create_connection(clean_db_path)
    clean_db_cursor = clean_db.cursor()

    clean_db_cursor.execute("DELETE FROM bodies")
    clean_db_cursor.execute("delete from sqlite_sequence where name='bodies'")

    dirty_db_cursor.execute("select bodies from bodies")
    data = dirty_db_cursor.fetchmany(rows_per_loop)

    tpool = Pool(processes=4)
    loop_n = 1
    log("detected " + str(cpus) + " as cpu count")
    inserted = 0
    more_data = True
    while more_data:
        log("cleaning data")
        data = tpool.map(clean_line, data)
        data = list(map(lambda line: (line, ), data))

        log("inserting 100k rows")
        query = "insert into bodies (bodies) values (?)"
        clean_db_cursor.executemany(query, data)
        clean_db.commit()

        log("done loop, getting more data.")
        inserted += len(data)
        data = dirty_db_cursor.fetchmany(rows_per_loop)
        if len(data) < 1:
            more_data = False
            log("end of data")
        log("done " + str(locp_n) + " loops")
        locp_n += 1
    log("done")
    log("inserted " + str(inserted) + " rows")
Example #7
    def runNtrajectory(self, point, repeat):
        # Python 3 removed tuple parameters (PEP 3113); unpack explicitly.
        x, y = point
        costAll, trajTimeAll = np.zeros(repeat), np.zeros(repeat)
        for i in range(repeat):
            costAll[i], trajTimeAll[i] = self.runOneTrajectoryOpti(x, y)
        meanCost = np.mean(costAll)
        meanTrajTime = np.mean(trajTimeAll)
        self.costStore.append([x, y, meanCost])
        self.trajTimeStore.append([x, y, meanTrajTime])
        return meanCost, meanTrajTime
    
    def mappableTrajectoryFunction(self, x, y, _index):
        # The trial index from pool.map is ignored; it only drives the repeats.
        return self.runOneTrajectory(x, y)
    
    def runNtrajectoryMulti(self, point, repeat):
        x, y = point
        pool = Pool(processes=4)
        result = pool.map(partial(self.mappableTrajectoryFunction, x, y),
                          range(repeat))
        pool.close()
        pool.join()
        meanCost, meanTraj = 0., 0.
        for Cost, traj in result:
            meanCost += Cost
            meanTraj += traj
        size = len(result)
        return meanCost / size, meanTraj / size

    
    def runOneTrajectoryOpti(self, x, y):
        cost, trajTime, lastX = self.tm.runTrajectoryOpti(x, y)
        # runNtrajectory above consumes a (cost, trajTime) pair.
        return cost, trajTime
Example #8
def pre_stat(paras, df_microsatellites):
    path_pre_stat = paras["output"].rstrip("/") + "/" + get_value(
        "case") + ".stat"
    path_pre_stat_tmp = paras["output_tmp"].rstrip("/") + "/" + get_value(
        "case") + ".stat"
    file_all_stat = open(path_pre_stat, "w")
    file_all_stat.write("\t".join([
        "repeat_unit_length", "repeat_times", "num_forward", "num_reversed",
        "this_repeat_mean_mean", "this_repeat_mean_std",
        "this_repeat_std_mean", "this_repeat_std_std", "forward_prefix",
        "forward_ms", "forward_suffix", "reversed_prefix", "reversed_ms",
        "reversed_suffix"
    ]) + "\n")

    df_microsatellites_download_sample = microsatellites_sampling(
        df_microsatellites, paras)

    for repeat_unit, info in df_microsatellites_download_sample.items():
        for repeat_times, ms_infos in info.items():
            logger.info("Processing   repeat unit: " + str(repeat_unit) +
                        " repeat times: " + str(repeat_times))
            infos = []
            for _, ms_info in ms_infos.iterrows():
                # Attach per-run settings to each row before dispatching;
                # renamed to avoid shadowing built-in id and the outer info.
                ms_info["reference"] = paras["reference"]
                ms_info["prefix_len"] = paras["prefix_len"]
                ms_info["suffix_len"] = paras["suffix_len"]
                infos.append(ms_info)
            pool = Pool(processes=paras["threads"])
            res_infos = pool.map(process_one_ms, infos)
            pool.close()
            pool.join()
            suffix_str = "." + str(repeat_unit) + "." + str(repeat_times)
            file = open(path_pre_stat_tmp + suffix_str + ".repeat", "w")
            this_repeat_means = []
            this_repeat_stds = []
            num_forward = 0
            num_reversed = 0
            prefix_forward = []
            suffix_forward = []
            ms_forward = []
            prefix_reversed = []
            suffix_reversed = []
            ms_reversed = []
            for res in res_infos:
                if None not in res:
                    file.write("\t".join(map(str, res[:-2])) + "\n")
                    this_repeat_means.append(res[3])
                    this_repeat_stds.append(res[4])
                    prefix_forward.extend(res[-1]["prefix_forward"])
                    suffix_forward.extend(res[-1]["suffix_forward"])
                    ms_forward.extend(res[-1]["ms_forward"])
                    prefix_reversed.extend(res[-1]["prefix_reversed"])
                    suffix_reversed.extend(res[-1]["suffix_reversed"])
                    ms_reversed.extend(res[-1]["ms_reversed"])
                    num_forward += res[-1]["num_forward"]
                    num_reversed += res[-1]["num_reversed"]

            file.close()
            if num_forward + num_reversed < 2:
                continue
            this_repeat_mean_mean = np.mean(this_repeat_means)
            this_repeat_mean_std = np.std(this_repeat_means)
            this_repeat_std_mean = np.mean(this_repeat_stds)
            this_repeat_std_std = np.std(this_repeat_stds)
            pd.concat(
                [
                    pd.DataFrame(
                        [np.nanmean(np.array(prefix_forward), axis=0)]),
                    pd.DataFrame([np.nanmean(np.array(ms_forward), axis=0)]),
                    pd.DataFrame(
                        [np.nanmean(np.array(suffix_forward), axis=0)])
                ],
                axis=1,
            ).to_csv(path_pre_stat_tmp + suffix_str + ".forward.qual")
            pd.concat(
                [
                    pd.DataFrame(
                        [np.nanmean(np.array(prefix_reversed), axis=0)]),
                    pd.DataFrame([np.nanmean(np.array(ms_reversed), axis=0)]),
                    pd.DataFrame(
                        [np.nanmean(np.array(suffix_reversed), axis=0)])
                ],
                axis=1,
            ).to_csv(path_pre_stat_tmp + suffix_str + ".reversed.qual")
            forward_prefix = np.nanmean(prefix_forward)
            forward_ms = np.nanmean(ms_forward)
            forward_suffix = np.nanmean(suffix_forward)

            reversed_prefix = np.nanmean(prefix_reversed)
            reversed_ms = np.nanmean(ms_reversed)
            reversed_suffix = np.nanmean(suffix_reversed)
            this_info_list = list(
                map(str, [
                    repeat_unit, repeat_times, num_forward, num_reversed,
                    this_repeat_mean_mean, this_repeat_mean_std,
                    this_repeat_std_mean, this_repeat_std_std, forward_prefix,
                    forward_ms, forward_suffix, reversed_prefix, reversed_ms,
                    reversed_suffix
                ]))
            file_all_stat.write("\t".join(this_info_list) + "\n")
    file_all_stat.close()
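Example #8 builds a fresh Pool inside the doubly nested loop, once per (repeat_unit, repeat_times) group; the same result can be had by creating one pool up front and reusing it across groups, avoiding repeated worker startup. A sketch under that assumption, with work standing in for process_one_ms:

from multiprocessing import Pool

def work(item):                 # placeholder for process_one_ms
    return item * 2

if __name__ == '__main__':
    groups = {'a': [1, 2], 'b': [3, 4]}
    with Pool(4) as pool:       # one pool shared by every group
        results = {k: pool.map(work, v) for k, v in groups.items()}
    print(results)              # {'a': [2, 4], 'b': [6, 8]}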