示例#1
0
	def on_track(self, line):
		"""Tell whether the tracked position is still close enough to ``line``.

		Returns False when ``line`` is None. Otherwise ``line`` is unpacked
		into ``Line(*line)`` and the result is True only if the distance from
		that line to ``self.tracker.getPosition()`` is strictly below
		``THRESHOLD``. A False result means the caller had better re-navigate.
		"""
		if line is None:
			return False
		track = Line(*line)
		return track.distance(self.tracker.getPosition()) < THRESHOLD
示例#2
0
    def _select_optimal_threshold(self, thresholds: np.ndarray,
                                  spot_counts: List[int]) -> float:

        # calculate the gradient of the number of spots
        grad = np.gradient(spot_counts)
        self._grad = grad
        optimal_threshold_index = np.argmin(grad)

        # only consider thresholds > than optimal threshold
        thresholds = thresholds[optimal_threshold_index:]
        grad = grad[optimal_threshold_index:]

        # if all else fails, return 0.
        selected_thr = 0

        if len(thresholds) > 1:

            distances = []

            # create a line whose end points are the threshold and and corresponding gradient value
            # for spot_counts corresponding to the threshold
            start_point = Point(thresholds[0], grad[0])
            end_point = Point(thresholds[-1], grad[-1])
            line = Line(start_point, end_point)

            # calculate the distance between all points and the line
            for k in range(len(thresholds)):
                p = Point(thresholds[k], grad[k])
                dst = line.distance(p)
                distances.append(dst.evalf())

            # remove the end points
            thresholds = thresholds[1:-1]
            distances = distances[1:-1]

            # select the threshold that has the maximum distance from the line
            # if stringency is passed, select a threshold that is n steps higher, where n is the
            # value of stringency
            if distances:
                thr_idx = np.argmax(np.array(distances))

                if thr_idx + self.stringency < len(thresholds):
                    selected_thr = thresholds[thr_idx + self.stringency]
                else:
                    selected_thr = thresholds[thr_idx]

        return selected_thr
 def Choose_cluster_number_distance(self, dict1):
     """Return the 1-based position of the entry farthest from the chord.

     The items of ``dict1`` are sorted by key and each (key, value) pair is
     treated as a point. A line is drawn through the first and last points,
     and the position (counting from 1 in key order) of the point with the
     greatest distance to that line is logged and returned.
     NOTE(review): the returned value is the position, not the dict key;
     the two coincide only if the keys are 1, 2, 3, ... -- confirm with caller.
     """
     ordered = sorted(dict1.items(), key=lambda item: item[0])
     chord = Line(Point(ordered[0]), Point(ordered[-1]))

     # position (starting at 1) -> distance of that point from the chord
     dis = {
         position: chord.distance(Point(entry))
         for position, entry in enumerate(ordered, start=1)
     }

     # largest distance first; ties keep their original key order
     ranked = sorted(dis.items(), key=lambda item: item[1], reverse=True)
     cluster_number = ranked[0][0]
     logger.info("Best K value is" + str(cluster_number))
     return cluster_number
示例#4
0
def get_main_content(content):
    """Extract the main text of an HTML document.

    Boilerplate tags and HTML comments are removed, then every remaining
    text block whose stripped length is at least 50 characters becomes a
    two-feature sample ``[path score, text length]``. The samples are
    clustered with ``k_means`` and the texts belonging to the cluster with
    the largest total text length are returned.

    The number of clusters is chosen with an elbow heuristic: the third
    element returned by ``k_means`` (presumably the inertia -- confirm) is
    sampled for k = 1 .. min(len(samples), 10) - 1, and the interior sample
    farthest from the line joining the first and last samples gives k.
    With fewer than three samples, or a degenerate line, k stays 1.

    Args:
        content: HTML source. It is first passed through the
            ``StringHelper`` unclosed-tag replacement helpers.

    Returns:
        The selected text blocks joined by single spaces, or an empty
        string when no text block qualifies.
    """
    string_helper = StringHelper()
    content = string_helper.replace_unclose_br_tag(content)
    content = string_helper.replace_unclose_img_tag(content)
    content = string_helper.replace_unclose_input_tag(content)
    soup = BeautifulSoup(content, 'html.parser')
    soup_helper = SoupHelper()

    # Drop elements that never carry main content (same order as before).
    removable_tags = (
        'script', 'iframe', 'a', 'header', 'label', 'option', 'head',
        'footer', 'style', 'noscript', 'video', 'videolist', 'source',
        'track',
    )
    for tag_name in removable_tags:
        for node in soup(tag_name):
            node.extract()

    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    def get_node_position(node: str):
        # A node is named "<something>_<position>"; keep the trailing number.
        return int(node.split('_')[-1])

    def get_path_score(path: str) -> int:
        # Base-3 positional weighting: the last node of the path has weight
        # 3**0, its parent 3**1, and so on towards the root.
        return sum(
            (3 ** index) * get_node_position(node)
            for index, node in enumerate(reversed(path.split(' ')))
        )

    l_content = []
    l_path = []
    d_text_path = get_text_paths(soup)[0]
    for text_path in d_text_path:
        if text_path.strip() == 'text_node':
            continue

        text = d_text_path[text_path]
        if len(text.strip()) < 50:
            continue

        l_path.append(soup_helper.convert_text_path_to_path(text_path))
        l_content.append(text)

    # Nothing long enough to be content: the original code crashed with an
    # IndexError further down; an empty result is the natural answer.
    if not l_content:
        return ''

    l_content_length = [len(text) for text in l_content]
    X = np.array([
        [get_path_score(path), length]
        for path, length in zip(l_path, l_content_length)
    ])

    # Sample the clustering score for k = 1 .. total_case - 1. The k axis is
    # rescaled by a tenth of the k = 1 score so both axes have a comparable
    # magnitude for the distance computation below.
    plot_score = []
    plot_num = []
    num_rate = 0
    total_case = min(len(X), 10)
    for i in range(1, total_case):
        score = k_means(X, i)[2]
        if i == 1:
            num_rate = score / 10

        plot_score.append(round(score, 2))
        plot_num.append(i * num_rate)

    # Elbow detection. It needs at least one interior sample (three samples
    # in total) and two distinct end points, because sympy's Line refuses
    # identical points. Otherwise fall back to a single cluster.
    k = 1
    has_interior = len(plot_score) >= 3
    if has_interior and \
            (plot_num[0], plot_score[0]) != (plot_num[-1], plot_score[-1]):
        p1 = Point(plot_num[0], plot_score[0], evaluate=False)
        p2 = Point(plot_num[-1], plot_score[-1], evaluate=False)
        line = Line(p1, p2, evaluate=False)

        farthest = 0
        # Sample index i - 1 holds the score for k = i clusters, so i runs
        # over the interior samples only.
        for i in range(2, len(plot_score)):
            candidate = Point((plot_num[i - 1], plot_score[i - 1]),
                              evaluate=False)
            distance = float(line.distance(candidate))
            if farthest < distance:
                farthest = distance
                k = i

    l_label = k_means(X, k)[1]

    # Total text length per cluster; the cluster holding the most text wins
    # (the first such cluster on ties, as before).
    d_label_length = {}
    for label, length in zip(l_label, l_content_length):
        d_label_length[label] = d_label_length.get(label, 0) + length
    candidate_label = max(d_label_length, key=d_label_length.get)

    return ' '.join(
        text for text, label in zip(l_content, l_label)
        if label == candidate_label
    )
示例#5
0
def thr_calculator(filtered_img, min_distance, stringency):
    """
    Function used to calculate the threshold to use for the dots
    counting in a 2D image.

    Up to 100 evenly spaced thresholds between the image minimum and maximum
    are scanned, counting the local maxima above each one. The scan stops at
    the first threshold that yields 3 or fewer peaks. The threshold is then
    picked at the elbow of the gradient of the peak counts: the point with
    the biggest distance from the line joining the end points of the
    gradient curve (after discarding everything before the gradient
    minimum). Finally the peaks are recomputed at the selected threshold
    after removing connected objects with area < 6 or > 200 pixels.

    Parameters:
    -----------

    filtered_img: np.array float64
        preprocessed image used to count the dots.
    min_distance: int
        minimum distance that two maxima need to have in order to be defined as
        separate peaks.
    stringency: int
        integer used to select the stringency of the generated
        threshold. By adding stringency to the thr_idx we can select a Thr with higher
        value from the thr_array. It is applied only if the resulting index
        is still inside the trimmed threshold array.

    Returns:
    -----------

    counting_dict : dict
        dictionary containing all the counting infos. When no threshold can
        be determined (too few usable thresholds, non-positive selected
        threshold, or no peaks left after the size filter) every entry
        except thr_array and stringency is set to 0.
        selected_thr: float64
            Thr used for counting after application of the stringency.
        calculated_thr: float64
            Calculated Thr
        selected_peaks: int64
            2D coords of the peaks defined using the selected_thr.
        thr_array: float64
            Thr array tested; trimmed to the thresholds that produced more
            than 3 peaks whenever at least two of them did.
        trimmed_thr_array: float64
            thr_array from the gradient minimum onwards, without end points.
        peaks_coords: list
            list of the peak coords calculated for each tested Thr.
        trimmed_peaks_coords: list
            peaks_coords trimmed like trimmed_thr_array.
        total_peaks: list of int
            List of the peaks counts.
        thr_idx: int64
            index (in trimmed_thr_array) of the selected threshold.
        stringency: int64
            stringency used for the identification of the selected_peaks
        selected_peaks_int: float64
            image intensity at the selected peaks.
    """

    # List with the total peaks calculated for each threshold
    total_peaks = []

    # List of ndarrays with the coords of the peaks calculated for each threshold
    peaks_coords = []

    # Define the Thr array to be tested
    thr_array = np.linspace(filtered_img.min(), filtered_img.max(), num=100)

    def _no_result():
        # Result returned whenever no threshold can be determined. thr_array
        # is read at call time, so it reflects any trimming already applied.
        return {
            'selected_thr': 0,
            'calculated_thr': 0,
            'selected_peaks': 0,
            'thr_array': thr_array,
            'trimmed_thr_array': 0,
            'peaks_coords': 0,
            'trimmed_peaks_coords': 0,
            'total_peaks': 0,
            'thr_idx': 0,
            'stringency': stringency,
            'selected_peaks_int': 0,
        }

    # Calculate the number of peaks for each threshold. In this calculation
    # the size of the objects is not considered
    for thr in thr_array:
        # Peaks on the image border are kept (exclude_border=False)
        peaks = feature.peak_local_max(
            filtered_img,
            min_distance=min_distance,
            threshold_abs=thr,
            exclude_border=False,
            indices=True,
            num_peaks=np.inf,
            footprint=None,
            labels=None)
        # Stop the counting when 3 or fewer peaks are detected
        if len(peaks) <= 3:
            break
        peaks_coords.append(peaks)
        total_peaks.append(len(peaks))

    # Consider the case of no detected peaks or of a single Thr that
    # creates peaks: a gradient curve needs at least two counts
    if len(total_peaks) <= 1:
        return _no_result()

    # Trim the threshold array to the thresholds that were actually counted.
    # The number of counts is the index of the stopping threshold, and it
    # also covers the case in which the scan never stopped early.
    thr_array = thr_array[:len(total_peaks)]

    # Calculate the gradient of the number of peaks distribution
    grad = np.gradient(total_peaks)

    # Restructure the data in order to avoid considering the values that
    # precede the gradient minimum in the calculations
    grad_min_peak_coord = np.argmin(grad)
    trimmed_thr_array = thr_array[grad_min_peak_coord:]
    trimmed_grad = grad[grad_min_peak_coord:]

    if len(trimmed_thr_array) <= 1:
        return _no_result()

    # Trim the coords list in order to keep it aligned with the thresholds
    trimmed_peaks_coords = peaks_coords[grad_min_peak_coord:]

    # To determine the threshold we will determine the Thr with the biggest
    # distance to the line that joins the end points of the calculated
    # gradient
    p1 = Point(trimmed_thr_array[0], trimmed_grad[0])
    p2 = Point(trimmed_thr_array[-1], trimmed_grad[-1])
    s = Line(p1, p2)

    # Calculate the distance between all points and the line
    distances = [
        s.distance(Point(trimmed_thr_array[p], trimmed_grad[p])).evalf()
        for p in range(len(trimmed_thr_array))
    ]

    # Remove the end points from the lists
    trimmed_thr_array = trimmed_thr_array[1:-1]
    trimmed_peaks_coords = trimmed_peaks_coords[1:-1]
    trimmed_distances = distances[1:-1]

    if not trimmed_distances:
        return _no_result()

    # Determine the index of the calculated Thr. The distances are
    # converted to an array explicitly (the original author notes that
    # passing the plain list crashed on one of their systems).
    thr_idx = np.argmax(np.array(trimmed_distances))
    calculated_thr = trimmed_thr_array[thr_idx]

    # The calculated threshold usually causes oversampling of the number of
    # dots, so the stringency (int n) selects the Thr n steps further. It is
    # applied only if the trimmed_thr_array is long enough.
    if thr_idx + stringency < len(trimmed_thr_array):
        thr_idx = thr_idx + stringency
    selected_thr = trimmed_thr_array[thr_idx]

    # The image is thresholded only with a positive threshold. The original
    # code left the mask undefined otherwise and crashed; a non-positive
    # threshold is now reported as "no threshold found".
    if not selected_thr > 0:
        return _no_result()

    # Calculate the selected peaks after removal of the big and small objects
    img_mask = filtered_img > selected_thr
    labels = nd.label(img_mask)[0]
    for ob in measure.regionprops(labels):
        if ob.area < 6 or ob.area > 200:
            img_mask[ob.coords[:, 0], ob.coords[:, 1]] = 0

    # Relabel the cleaned mask and restrict the peak search to it
    labels = nd.label(img_mask)[0]
    selected_peaks = feature.peak_local_max(
        filtered_img,
        min_distance=min_distance,
        threshold_abs=selected_thr,
        exclude_border=False,
        indices=True,
        num_peaks=np.inf,
        footprint=None,
        labels=labels)

    if not selected_peaks.size:
        return _no_result()

    # Intensity counting of the max peaks
    selected_peaks_int = filtered_img[selected_peaks[:, 0],
                                      selected_peaks[:, 1]]

    return {
        'selected_thr': selected_thr,
        'calculated_thr': calculated_thr,
        'selected_peaks': selected_peaks,
        'thr_array': thr_array,
        'trimmed_thr_array': trimmed_thr_array,
        'peaks_coords': peaks_coords,
        'trimmed_peaks_coords': trimmed_peaks_coords,
        'total_peaks': total_peaks,
        'thr_idx': thr_idx,
        'stringency': stringency,
        'selected_peaks_int': selected_peaks_int,
    }