# Example #1
0
 def load(self):
     """Load the annotation table referenced by the reference data.

     Reads the ``"annotation"`` entry of ``self.refdata.get_sequence_dict()``,
     parses that file with ``load_tsv`` and stores on the instance:

     * ``annotation_file`` - the entry taken from the sequence dictionary;
     * ``annotation_df`` - the table returned by ``load_tsv``;
     * ``_annotation_df`` - a copy of that table;
     * ``raw_nucleotide_fasta_headers`` - the ``"former_id"`` column as a list.

     Progress is reported on standard output via ``print``.
     """
     # Most reference sequences are short enough not to be split.
     sequence_dict = self.refdata.get_sequence_dict()
     self.annotation_file = sequence_dict["annotation"]
     print(f"Using the annotation file: '{self.annotation_file}'")

     self.annotation_df = load_tsv(self.annotation_file)
     print(f"Loaded annotation table with shape {self.annotation_df.shape}")

     # Keep a second, independent copy of the freshly loaded table.
     self._annotation_df = self.annotation_df.copy()

     former_ids = self.annotation_df["former_id"]
     self.raw_nucleotide_fasta_headers = former_ids.values.tolist()
        _namespace.separator,
        _namespace.output_file,
    )


if __name__ == '__main__':
    # Script entry point: replace values found in a text file with strings
    # assembled from columns of a table.
    (
        target_file,
        target_column_name,
        source_table_file,
        source_column_names,
        separator,
        out_file,
    ) = _parse_args()

    source_df = load_tsv(source_table_file)

    # Two-column lookup table:
    #   "target" - values of `target_column_name` (the text searched for);
    #   "source" - the requested columns joined with `separator`, with missing
    #              cells treated as empty strings (the replacement text).
    source_sub_df = pd.DataFrame({
        "target": source_df[target_column_name].copy(),
        "source": source_df.loc[:, source_column_names].fillna("").agg(
            separator.join, axis=1),
    })
    # Keep only the rows whose replacement text is non-empty.
    source_sub_df = source_sub_df.loc[source_sub_df["source"].map(bool), :]

    target_string = load_string(target_file)

    collector_string = str(target_string)
    # `counter` ends up as the number of processed table rows; it is not the
    # number of substitutions that actually matched something in the text.
    counter = 0
    for counter, (needle, replacement) in enumerate(
            zip(source_sub_df["target"], source_sub_df["source"]), start=1):
        collector_string = collector_string.replace(str(needle), replacement)
# Example #3
0
        choices=["inner", "outer"],
        help="(Optional) How to handle indexes on other axis (or axes)")
    parser.add_argument("-o", "--output", required=True, help="Output table")
    _namespace = parser.parse_args()
    return _namespace.input, _namespace.axis, _namespace.index, _namespace.join, _namespace.output


if __name__ == '__main__':
    # Script entry point: load every valid input table and concatenate them
    # into a single dataframe.
    input_tables, axis, index, join, output_table = parse_args()
    table_files = remove_empty_values(
        [i for i in input_tables if is_file_valid(i)])
    if len(table_files) == 0:
        raise ValueError("No valid tables!")

    dataframes = []
    for table_file in table_files:
        dataframe = load_tsv(table_file)
        # Tables without rows contribute nothing to the result.
        if dataframe.shape[0] == 0:
            continue
        # Remember which file every row came from.
        dataframe[PATH_COL_NAME] = table_file
        dataframes.append(dataframe)

    # An empty `index` argument means "do not set an index".
    is_index = len(index) > 0
    if is_index:
        for dataframe in dataframes:
            dataframe.set_index(index, inplace=True)

    print(
        f"Concatenate {len(dataframes)} dataframes with the shapes: {[i.shape for i in dataframes]}"
    )
    start = perf_counter()
    # `pd.concat` has no `required`/`help` keywords (those belong to
    # argparse); the parsed `axis` and `join` options are what it expects.
    # NOTE(review): assumes `parse_args` yields `axis` in a form pandas
    # accepts (0/1 or "index"/"columns") - confirm against the parser.
    out_df = pd.concat(dataframes, axis=axis, join=join)
    parser.add_argument("-t",
                        "--table",
                        required=True,
                        help="Table with values to fetch")
    parser.add_argument("-o", "--output", required=True, help="Output file")
    _namespace = parser.parse_args()
    return _namespace.input, _namespace.table, _namespace.output


if __name__ == '__main__':
    # Script entry point: strip ".gbk"/".gff" suffixes from a text and rename
    # every "geninfo_id" occurring in it to the matching "strain" value.
    text_file, combined_blast_result_file, out_file = parse_args()

    text_content = load_string(text_file)
    combined_blast_result_df = load_tsv(combined_blast_result_file).set_index(
        "geninfo_id")
    print(
        f"Loaded replacer table with the shape {combined_blast_result_df.shape}"
    )

    # Map each "geninfo_id" (the index) to its whitespace-stripped "strain".
    renaming_dict = combined_blast_result_df["strain"].map(
        lambda x: str(x).strip()).to_dict()

    # Raw string: "\." in a plain literal is an invalid escape sequence and
    # triggers a SyntaxWarning on current Python versions.
    text_content_replaced = re.sub(r"\.(gbk|gff)", "", text_content)

    # `counter` counts processed table entries, whether or not the key was
    # actually present in the text.
    counter = 0
    for renaming_key, renaming_value in renaming_dict.items():
        text_content_replaced = text_content_replaced.replace(
            str(renaming_key), str(renaming_value))
        counter += 1

    print(f"{counter} replacements were performed")
# Example #5
0
                        required=True,
                        help="Output file")
    _namespace = parser.parse_args()
    return (_namespace.rgi_dir, _namespace.card_version, _namespace.nbee_dir,
            _namespace.output_file)


if __name__ == '__main__':
    # Script entry point: merge all per-sample RGI tables into one dataframe
    # and register it as a sheet named after the CARD reference.
    rgi_dir, card_version, nbee_dir, out_file = _parse_args()

    # RGI
    rgi_tables = find_file_by_tail(dir_name=rgi_dir, multiple=True, tail=".txt")
    merged_rgi_df = pd.DataFrame()
    for rgi_table in rgi_tables:
        rgi_df = load_tsv(rgi_table)
        # Tables without rows are skipped entirely.
        if len(rgi_df) == 0:
            continue
        rgi_df = remove_longest_columns(rgi_df, CELL_SIZE_LIMIT)

        # Tag every row with the file's name and move that tag to the front.
        data_columns = rgi_df.columns.tolist()
        rgi_df["sample_name"] = filename_only(rgi_table)
        rgi_df = rgi_df[["sample_name", *data_columns]]

        print(
            f"Concatenate dataframes with shapes {rgi_df.shape}, {merged_rgi_df.shape}"
        )
        merged_rgi_df = pd.concat(
            [merged_rgi_df, rgi_df],
            axis=0,
            ignore_index=True,
        )

    # The sheet name is "card" plus the version, when a version was given.
    name_parts = remove_empty_values(["card", card_version])
    reference_name = "_".join(name_parts)
    print(f"Finished concatenating tables for '{reference_name}'")
    sheets = {reference_name: merged_rgi_df}