Exemplo n.º 1
0
def generate_feature_matrix(input_dir, input_files, input_files_id, format_,
                            caller, svtype_col_name, as_breakpoint,
                            definitions, output):
    """
    Build an SV-type feature matrix from BEDPE or VCF inputs and save it.

    Exactly one of ``input_dir`` and ``input_files`` has to be supplied.
    When both are ``None``, or both are given, the function returns ``None``
    without reading or writing anything.

    Parameters
    ----------
    input_dir:
        Directory handed to ``viola.read_bedpe_multi`` or
        ``viola.read_vcf_multi``.
    input_files:
        Comma-separated file paths; each one is read individually.
    input_files_id:
        Comma-separated names for the entries of ``input_files``. When
        ``None``, ``range(len(files))`` is used as the names.
    format_:
        ``'bedpe'`` selects the BEDPE readers; any other value selects the
        VCF readers.
    caller:
        Forwarded as ``variant_caller`` to the VCF readers.
    svtype_col_name:
        Forwarded as ``svtype_col_name`` to the BEDPE readers.
    as_breakpoint:
        For VCF input: forwarded to ``viola.read_vcf_multi``, or, for
        individually read files, triggers ``breakend2breakpoint()`` on each
        object.
    definitions:
        Forwarded to ``classify_manual_svtype``.
    output:
        Destination passed to ``to_csv`` (tab-separated).
    """
    dir_given = input_dir is not None
    files_given = input_files is not None
    if dir_given == files_given:
        # Neither source, or both at once: nothing sensible to do.
        return

    use_bedpe = format_ == 'bedpe'

    if dir_given:
        if use_bedpe:
            data = viola.read_bedpe_multi(input_dir,
                                          svtype_col_name=svtype_col_name)
        else:
            data = viola.read_vcf_multi(input_dir,
                                        variant_caller=caller,
                                        as_breakpoint=as_breakpoint)
    else:
        paths = input_files.split(',')
        if use_bedpe:
            loaded = [
                viola.read_bedpe(path, svtype_col_name=svtype_col_name)
                for path in paths
            ]
        else:
            loaded = []
            for path in paths:
                vcf = viola.read_vcf(path, variant_caller=caller)
                if as_breakpoint:
                    vcf = vcf.breakend2breakpoint()
                loaded.append(vcf)

        if input_files_id is None:
            names = range(len(loaded))
        else:
            names = input_files_id.split(',')

        # NOTE(review): VCF-derived objects are also wrapped in MultiBedpe
        # here, exactly as before -- confirm this is intended when
        # as_breakpoint is false.
        data = viola.MultiBedpe(loaded, names)

    feature_matrix = data.classify_manual_svtype(definitions=definitions)
    feature_matrix.to_csv(output, sep='\t')
Exemplo n.º 2
0
def test_classify_manual_svtype_from_file():
    """Classify SVs using a definition file and check both outputs.

    Two identical BEDPE objects are combined into a ``MultiBedpe``, classified
    with ``definitions=<path>``, and then:

    * the ``manual_sv_type`` table is compared with ``data_expected``
      (column order ignored via ``check_like=True``), and
    * the returned per-patient count matrix is compared with a hand-written
      expectation.
    """
    bedpe1 = viola.read_bedpe(StringIO(data))
    bedpe2 = viola.read_bedpe(StringIO(data))
    multibedpe = viola.MultiBedpe([bedpe1, bedpe2], ['bedpe1', 'bedpe2'])

    # The classification rules come from the file, so no in-code condition
    # list is needed; only the expected column labels are spelled out here.
    ls_names = [
        'small_del', 'large_del', 'small_dup', 'large_dup', 'small_inv', 'tra'
    ]
    path = os.path.join(HERE, '../bedpe/data/example_definition.txt')
    result = multibedpe.classify_manual_svtype(definitions=path)

    manual_sv_type = multibedpe.manual_sv_type
    manual_sv_type.set_index('id', inplace=True)
    manual_sv_type_expected = pd.read_csv(
        StringIO(data_expected),
        sep='\t',
        names=('id', 'value_idx', 'manual_sv_type'),
    ).set_index('id')
    pd.testing.assert_frame_equal(manual_sv_type,
                                  manual_sv_type_expected,
                                  check_like=True)

    # Both inputs are the same data, so both rows carry the same counts;
    # the trailing column is the 'others' bucket.
    result_expected = pd.DataFrame(
        [[2, 3, 1, 0, 2, 2, 2],
         [2, 3, 1, 0, 2, 2, 2]],
        index=pd.Index(['bedpe1', 'bedpe2'], name='patients'),
        columns=pd.Index(ls_names + ['others'], name='manual_sv_type'),
    )
    pd.testing.assert_frame_equal(result, result_expected)
Exemplo n.º 3
0
def test_classify_manual_svtype_exclude_empty():
    """Check classification counts when ``exclude_empty_cases=True``.

    Two populated and two empty BEDPE objects are interleaved in one
    ``MultiBedpe``. The expected count matrix lists only the two populated
    patients.
    """
    bedpe1 = viola.read_bedpe(StringIO(data))
    bedpe2 = viola.read_bedpe(StringIO(data))
    empty1 = viola.read_bedpe(StringIO(data_empty))
    empty2 = viola.read_bedpe(StringIO(data_empty))

    members = [bedpe1, empty1, bedpe2, empty2]
    member_names = ['bedpe1', 'empty1', 'bedpe2', 'empty2']
    multibedpe = viola.MultiBedpe(members, member_names)

    conditions = [small_del, large_del, small_dup, large_dup, small_inv, tra]
    class_names = [
        'small_del', 'large_del', 'small_dup', 'large_dup', 'small_inv', 'tra'
    ]
    counts = multibedpe.classify_manual_svtype(ls_conditions=conditions,
                                               ls_names=class_names,
                                               exclude_empty_cases=True)

    # Per-SV classification table, compared regardless of column order.
    observed_table = multibedpe.manual_sv_type
    observed_table.set_index('id', inplace=True)
    expected_table = pd.read_csv(
        StringIO(data_expected),
        sep='\t',
        names=('id', 'value_idx', 'manual_sv_type'),
    ).set_index('id')
    pd.testing.assert_frame_equal(observed_table,
                                  expected_table,
                                  check_like=True)

    # Per-patient count matrix; the last column is the 'others' bucket.
    row = [2, 3, 1, 0, 2, 2, 2]
    expected_counts = pd.DataFrame(
        [row, row],
        index=pd.Index(['bedpe1', 'bedpe2'], name='patients'),
        columns=pd.Index(class_names + ['others'], name='manual_sv_type'),
    )
    pd.testing.assert_frame_equal(counts, expected_counts)
Exemplo n.º 4
0
    def merge(self,
              ls_bedpe=None,
              ls_caller_names=None,
              threshold=100,
              linkage="complete",
              str_missing=True):
        """
        merge(ls_bedpe:list, ls_caller_names:list, threshold:float, linkage="complete", str_missing=True)
        Return a merged bedpe object from multiple callers' bedpe objects in ls_bedpe.

        Parameters
        ----------
        ls_bedpe:list, default=None
            A list of bedpe objects to be merged, in the same order as
            ls_caller_names. ``None`` is treated as an empty list. If self is
            not in the list, it is prepended.
        ls_caller_names:list
            A list of names of the bedpe objects to be merged, which should
            have self's name as the first element.
        threshold:float
            Two SVs whose difference of positions is under this threshold are
            considered to be identical. Passed to the clustering model as
            ``distance_threshold``.
        linkage:{'complete', 'average', 'single'}, default='complete'
            The linkage of hierarchical clustering.
            To keep the mutual distance of all SVs in each cluster below the
            threshold, "complete" is recommended.
        str_missing:boolean, default=True
            If True, all the missing strands are considered to be identical
            to the others.

        Returns
        ----------
        A merged bedpe object carrying three extra info tables:
        "mergedid" (cluster number per SV, numbered from 0 in order of first
        appearance), "originalid" and "caller".
        """
        # A None default avoids sharing one list object across calls.
        ls_bedpe = [] if ls_bedpe is None else list(ls_bedpe)
        if self not in ls_bedpe:
            ls_bedpe = [self] + ls_bedpe

        multibedpe = viola.MultiBedpe(ls_bedpe, ls_caller_names)
        distance_matrix = self._generate_distance_matrix_by_distance(
            multibedpe, penalty_length=3e9, str_missing=str_missing)
        # NOTE(review): recent scikit-learn releases renamed `affinity` to
        # `metric`; the keyword is kept as-is for compatibility with the
        # version this project targets -- confirm against the pinned version.
        hcl_clustering_model = AgglomerativeClustering(
            n_clusters=None,
            affinity="precomputed",
            linkage=linkage,
            distance_threshold=threshold)
        labels = hcl_clustering_model.fit_predict(X=distance_matrix)

        # Renumber the cluster labels as 0, 1, 2, ... in order of first
        # appearance.
        mergedid_dict = {}
        ls_mergedid = []
        for label in labels:
            ls_mergedid.append(
                mergedid_dict.setdefault(label, len(mergedid_dict)))

        positions_table = multibedpe.get_table("positions")
        N = len(positions_table)
        value_idx = pd.Series(np.zeros(N, dtype=int))
        df_mergedid = pd.DataFrame({
            "id": positions_table["id"],
            "value_idx": value_idx,
            "mergedid": pd.Series(ls_mergedid)
        })

        df_id = multibedpe.get_table("global_id")
        df_originalid = pd.DataFrame({
            "id": positions_table["id"],
            "value_idx": value_idx,
            "originalid": df_id["id"]
        })

        ############## Edited by Sugita ##################
        # Join each SV's patient_id to the patients table to get the name
        # recorded for it, stored below as the "caller" info table.
        df_patients = multibedpe.get_table("patients")
        df_id_patients = df_id.merge(df_patients,
                                     left_on="patient_id",
                                     right_on="id")
        caller = df_id_patients["patients"]
        df_caller = pd.DataFrame({
            "id": positions_table["id"],
            "value_idx": value_idx,
            "caller": caller
        })
        ############## /Edited by Sugita #################

        df_svpos = multibedpe._df_svpos
        odict_df_info = multibedpe._odict_df_info

        merged_bedpe = viola.Bedpe(df_svpos=df_svpos,
                                   odict_df_info=odict_df_info)
        merged_bedpe.add_info_table(table_name="mergedid", df=df_mergedid)
        merged_bedpe.add_info_table(table_name="originalid", df=df_originalid)
        merged_bedpe.add_info_table(table_name="caller", df=df_caller)

        return merged_bedpe
def test_read_bedpe_with_empty():
    """Combine two populated BEDPE inputs and one empty input into a MultiBedpe.

    NOTE(review): no assertions are visible in this block; as shown, the test
    passes as long as reading and combining do not raise -- confirm that
    nothing is missing after the constructor call.
    """
    bedpe1 = viola.read_bedpe(StringIO(data))
    bedpe2 = viola.read_bedpe(StringIO(data))
    bedpe_empty = viola.read_bedpe(StringIO(data_empty))
    multibedpe = viola.MultiBedpe([bedpe1, bedpe2, bedpe_empty], ['bedpe1', 'bedpe2', 'empty'])