Example #1
import numpy as np
import pandas as pd

# DEFAULT_WINDOW, DEFAULT_WINDOW_OVERLAP, chunkup and calc_step are assumed
# to be defined elsewhere in the same module.
def calc(samples, window=DEFAULT_WINDOW, overlap=DEFAULT_WINDOW_OVERLAP):
    """Calculation of shot noise model parameters.

    :param samples: A pandas dataframe with columns called 'distance' and 'force'.
    :param window: Size of moving window.
    :param overlap: Overlap factor in percent.
    :return: Pandas dataframe with the columns 'distance', 'force_median',
             'L2012_lambda', 'L2012_f0', 'L2012_delta', 'L2012_L'.
    """

    # Calculate spatial resolution of the distance samples as median of all
    # step sizes.
    spatial_res = np.median(np.diff(samples.distance.values))

    # Split dataframe into chunks
    chunks = chunkup(samples, window, overlap)
    result = []
    for center, chunk in chunks:
        f_median = np.median(chunk.force)
        sn = calc_step(spatial_res, chunk.force)
        result.append((center, f_median) + sn)
    return pd.DataFrame(result,
                        columns=[
                            'distance', 'force_median', 'L2012_lambda',
                            'L2012_f0', 'L2012_delta', 'L2012_L'
                        ])
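A minimal usage sketch for Example #1, assuming chunkup, calc_step, DEFAULT_WINDOW and DEFAULT_WINDOW_OVERLAP are defined in the same module; the toy data and the window/overlap values are purely illustrative:

import numpy as np
import pandas as pd

# Hypothetical input: 500 evenly spaced distance samples with random forces.
samples = pd.DataFrame({
    'distance': np.linspace(0.0, 1.0, 500),
    'force': np.random.default_rng(0).random(500),
})
# window and overlap (in percent) are made-up values for illustration.
params = calc(samples, window=50, overlap=50)
print(params[['distance', 'force_median', 'L2012_lambda']].head())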
Example #2
import numpy as np
import ujson

# read_pickle and lengthdbpath are assumed to come from the surrounding module.
def get_sample_map(delta_fname, x_coverage, average_read_length, rate_param):
    lengthdb = read_pickle(lengthdbpath)
    bin_size = int(rate_param / float(x_coverage))
    with open(delta_fname) as inf:
        delta = ujson.load(inf, precise_float=True)
    bacid_maps = dict()
    for _, mapngs in delta:
        for dest_id, pos1, pos2, used_koef, _ in mapngs:
            if dest_id not in bacid_maps:
                bacid_maps[dest_id] = np.zeros(
                    int(lengthdb[dest_id] / bin_size) + 1)
            ind1 = int((pos1 + (average_read_length / 2)) / bin_size)
            if pos2 >= 0:
                used_koef = used_koef / 2.0
                ind2 = int((pos2 + (average_read_length / 2)) / bin_size)
                bacid_maps[dest_id][ind2] += used_koef
            bacid_maps[dest_id][ind1] += used_koef
    return {dest_id: cov_map for dest_id, cov_map in bacid_maps.items()
            if np.median(cov_map) >= rate_param}
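To make the binning in Example #2 concrete: bin_size is rate_param / x_coverage, and each mapping position is shifted by half the average read length before integer division. A small sketch with made-up numbers:

# Illustrative values only, not taken from the original code.
x_coverage = 10
rate_param = 1000
average_read_length = 100

bin_size = int(rate_param / float(x_coverage))             # 100 bases per bin
pos1 = 1234                                                # a mapping position
ind1 = int((pos1 + (average_read_length / 2)) / bin_size)  # bin of the read centre
print(bin_size, ind1)                                      # -> 100 12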
Example #3
    def run_methods(self):
        results = defaultdict(list)
        # We only test the methods common to all converters
        # (The intended use is with a list of converters all
        # having the same methods, but different input files)
        # set() already makes a copy, so the [:] slices were redundant
        methods = set(self.converters[0].available_methods)
        for converter in self.converters[1:]:
            methods &= set(converter.available_methods)
        methods = sorted(methods)

        if self.include_dummy:
            methods += ['dummy']

        if self.to_include:
            methods = [x for x in methods if x in self.to_include]
        elif self.to_exclude:
            methods = [x for x in methods if x not in self.to_exclude]

        for method in methods:
            print("\nEvaluating method %s" % method)
            # key: converter.infile
            # value: list of times
            times = defaultdict(list)
            pb = Progress(self.N)
            for i in range(self.N):
                for converter in self.converters:
                    with Timer(times[converter.infile]):
                        converter(method=method)
                pb.animate(i + 1)
            # Normalize times so that each converter has comparable times
            mean_time = gmean(np.fromiter(chain(*times.values()), dtype=float))
            # median of ratios to geometric mean (c.f. DESeq normalization)
            scales = {
                conv: np.median(np.asarray(conv_times) / mean_time)
                for conv, conv_times in times.items()
            }
            for (conv, conv_times) in times.items():
                scale = scales[conv]
                results[method].extend(
                    [conv_time / scale for conv_time in conv_times])
        self.results = results
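The normalization at the end of Example #3 is a median-of-ratios scheme in the spirit of DESeq: all times are divided by their overall geometric mean, the per-file median of those ratios becomes that file's scale, and raw times are divided by the scale. A toy sketch with fabricated timings:

from itertools import chain

import numpy as np
from scipy.stats import gmean

# Fabricated timings (seconds) for two hypothetical input files.
times = {
    'small.sam': [1.0, 1.2, 1.1],
    'big.sam': [10.0, 9.0, 11.0],
}
mean_time = gmean(np.fromiter(chain(*times.values()), dtype=float))
scales = {f: np.median(np.asarray(t) / mean_time) for f, t in times.items()}
normalized = {f: [x / scales[f] for x in t] for f, t in times.items()}
print(scales)      # big.sam's scale is roughly 9x small.sam's
print(normalized)  # after scaling, both files land in the same range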
Example #4
        df_words = pd.DataFrame(matrix.toarray(), columns=feature_names, index=df_movies.index)
        # Computed but never used; presumably a leftover from interactive inspection.
        df_words.sum().sort_values().to_dict()

        # target = df_movies.award_noms_oscar >= OSCARS_MIN
        target = df_movies[target_feature]

        oa.create_wordcloud(df_words.loc[target == 0], 'nontarget')
        oa.create_wordcloud(df_words.loc[target == 1], 'target')
        classifier = RandomForestClassifier(
            n_estimators=n_estimators,
            min_samples_split=50,
            min_samples_leaf=15,
            max_depth=3,
        ).fit(df_words, target)

        cv = np.median(cross_val_score(classifier, df_words, target, scoring='roc_auc', cv=10))
        auc = roc_auc_score(target, classifier.predict_proba(df_words)[:, 1])
        mlflow.log_metric('auc_training', auc)
        mlflow.log_metric("auc_cv10_median", cv)

        df_importance = pd.DataFrame(classifier.feature_importances_, columns=['term'], index=feature_names)
        df_words_expl = df_words  # .copy() is skipped, so df_words is mutated below
        df_words_expl.loc[:, 'target'] = target
        df_words_expl = df_words_expl.groupby("target").mean().transpose()

        oa.create_importance_plot(df_importance, df_words_expl)

        mlflow.log_artifacts('./charts')
        del df_words_expl
        del df_movies
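Because .copy() is commented out above, df_words_expl is only an alias for df_words, so assigning the 'target' column also mutates the word matrix itself. A minimal demonstration of the aliasing (not from the original code):

import pandas as pd

df = pd.DataFrame({'a': [1, 2]})
alias = df                  # no .copy(): both names refer to the same object
alias.loc[:, 'b'] = [3, 4]  # adds a column through the alias
print('b' in df.columns)    # True: df was mutated as well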
    logging.debug("Checking the distances...")
    real_distances, real_distances_bearing = analise_distances(
        first_path, "0/", True)
    # stats tuple layout: 0 = np.max, 1 = np.min, 2 = np.mean, 3 = np.std,
    # 4 = np.median (see Example #6)

    # single graphs
    trajectories = len(real_distances[0].keys())
    num = np.arange(0, trajectories)
    real_total = []
    number_tra = []
    for tra in range(trajectories):
        x = np.arange(0, len(real_distances))
        median = []
        for el in real_distances:
            # el[tra] is the stats tuple from Example #6; index 1 is the
            # per-trajectory minimum (np.median of a scalar returns it as-is).
            median.append(np.median(np.array(el[tra][1])))

        median_bearing = []
        for el in real_distances_bearing:
            median_bearing.append(np.median(np.array(el[tra][1])))

        total_sum_median = []
        for i in range(len(median)):
            total_sum_median.append(median[i] * 100 + median_bearing[i])

        for el in total_sum_median:
            real_total.append(el)
            number_tra.append(tra)

    # x = np.arange(0, len(real_distances))
    # total_x = []
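The loop above relies on the shape produced by analise_distances (Example #6): a list with one dict per input file, mapping trajectory index to the tuple (max, min, mean, std, median). A sketch of unpacking those stats, assuming that shape:

for per_file in real_distances:
    for tra, stats in per_file.items():
        d_max, d_min, d_mean, d_std, d_median = stats
        print(tra, d_median)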
Example #6
import os
from math import fabs

import numpy as np
import tqdm

# sorted_nicely, rean_info, compute_bearing and compute_distance are assumed
# to be helpers defined elsewhere in the project.
def analise_distances(path, number, bigOrSmall):
    path = path + "/" + str(number) + "/"
    names = []
    for i in os.listdir(path):
        if bigOrSmall:
            name_to_check = "trajectory-generatedPoints-"
        else:
            name_to_check = "trajectory-generate-aSs-"
        if os.path.isfile(os.path.join(
                path, i)) and name_to_check in i and ".zip" in i:
            names.append(i)

    names = sorted_nicely(names)

    total_distances_angle = []
    total_distances = []

    logging.debug("Analysing Trajectories...")
    for name in tqdm.tqdm(names):

        trajectories_label, json_file = rean_info(path + name)

        # ----------- distance bearings

        # real points
        lat_real = []
        lng_real = []
        # generated points
        lat_generated = []
        lng_generated = []

        label_real = []
        label_generated = []
        label_trajectory = []

        # last point trajectory
        lat_last = []
        lng_last = []
        for labels in trajectories_label:
            for el in json_file[labels]["real"]:
                if el[0] not in lat_real:
                    lat_real.append(el[0])
                    lng_real.append(el[1])
                    label_real.append(json_file[labels]["id"])

            for el in json_file[labels]["generated"]:
                lat_generated.append(el[0])
                lng_generated.append(el[1])
                label_generated.append(json_file[labels]["id"])

            appo_lat = []
            appo_lgn = []
            for el in json_file[labels]["trajectory"]:
                appo_lat.append(el[0])
                appo_lgn.append(el[1])

            lat_last.append(appo_lat[-1])
            lng_last.append(appo_lgn[-1])
            label_trajectory.append(json_file[labels]["id"])

        distance_per_trajectories = {}

        # for each of the trajectories we have
        for i in range(len(label_real)):

            # compute real bearing for the current trajectory
            real_bearing = compute_bearing(lat_last[i], lng_last[i],
                                           lat_real[i], lng_real[i])

            # find index of the point generated corresponding to this trajectory
            index = [
                j for j, x in enumerate(label_generated) if x == label_real[i]
            ]

            index_last_point = [
                j for j, x in enumerate(label_trajectory) if x == label_real[i]
            ]

            distances = []
            for ind in index:
                bearing = compute_bearing(lat_last[index_last_point[0]],
                                          lng_last[index_last_point[0]],
                                          lat_generated[ind],
                                          lng_generated[ind])
                distances.append(fabs(bearing - real_bearing))
            array = np.array(distances)

            distance_per_trajectories[i] = (np.max(array), np.min(array),
                                            np.mean(array), np.std(array),
                                            np.median(array))
        total_distances_angle.append(distance_per_trajectories)

        # ----------- distance points

        # real points
        lat_real = []
        lng_real = []
        # generated points
        lat_generated = []
        lng_generated = []

        label_real = []
        label_generated = []
        for labels in trajectories_label:
            for el in json_file[labels]["real"]:
                if el[0] not in lat_real:
                    lat_real.append(el[0])
                    lng_real.append(el[1])
                    label_real.append(json_file[labels]["id"])

            for el in json_file[labels]["generated"]:
                if el[0] not in lat_generated:
                    lat_generated.append(el[0])
                    lng_generated.append(el[1])
                    label_generated.append(json_file[labels]["id"])

        distance_per_trajectories = {}
        # now, for every trajectory, compute the distances of the generated points
        for i in range(len(label_real)):
            index = [
                j for j, x in enumerate(label_generated) if x == label_real[i]
            ]
            distances = []
            for ind in index:
                distances.append(
                    float(
                        compute_distance(lat_real[i], lng_real[i],
                                         lat_generated[ind],
                                         lng_generated[ind])))

            array = np.array(distances)
            distance_per_trajectories[i] = (np.max(array), np.min(array),
                                            np.mean(array), np.std(array),
                                            np.median(array))
        total_distances.append(distance_per_trajectories)

    return total_distances, total_distances_angle
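A hedged usage sketch mirroring the call in Example #5; the folder name here is made up, and the file naming scheme follows the checks at the top of the function:

# Hypothetical layout:
#   experiment/0/trajectory-generatedPoints-1.zip
#   experiment/0/trajectory-generatedPoints-2.zip
# bigOrSmall=True selects "trajectory-generatedPoints-" archives,
# False selects "trajectory-generate-aSs-" ones.
total_distances, total_distances_angle = analise_distances("experiment", 0, True)
print(len(total_distances), len(total_distances_angle))  # one dict per archive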