Exemplos de Utilities.dict2pd_series em Python

Linguagem de programação: Python

Namespace / nome do pacote: meta.scripts.Utilities

Classe / Tipo: Utilities

Método / Função: dict2pd_series

Exemplos em hotexamples.com: 4

Utilities.dict2pd_series em Python - 4 exemplos encontrados. Esses são os exemplos do mundo real mais bem avaliados de meta.scripts.Utilities.Utilities.dict2pd_series em Python extraídos de projetos de código aberto. Você pode avaliar os exemplos para nos ajudar a melhorar a qualidade deles.

Métodos Frequentes

Exibir / Ocultar

safe_findall(14)

dump_tsv(12)

scan_whole_dir(11)

load_tsv(10)

multi_core_queue(9)

remove_empty_values(8)

ends_with_slash(8)

dump_string(6)

single_core_queue(6)

left_merge(5)

merge_pd_series_list(4)

dict2pd_series(4)

dump_2d_array(3)

download_file(3)

filename_only(3)

is_file_valid(2)

remove_duplicate_sequences(2)

scrap_links_from_web_page(2)

load_string(2)

load_list(2)

combine_duplicate_rows(2)

get_n_majors_from_df(2)

concatenate_files(2)

flatten_2d_array(2)

dump_list(2)

get_most_similar_word_pairs(2)

get_time(1)

join_lines(1)

load_2d_array(1)

find_file_by_tail(1)

ls(1)

dump_dict(1)

decompress_file(1)

count_reads_statistics(1)

count_raw_reads_statistics(1)

count_assembly_statistics(1)

count_assembly_coverages(1)

Métodos Frequentes

safe_findall (14)

dump_tsv (12)

scan_whole_dir (11)

load_tsv (10)

multi_core_queue (9)

remove_empty_values (8)

ends_with_slash (8)

dump_string (6)

single_core_queue (6)

left_merge (5)

Métodos Frequentes

merge_pd_series_list (4)

dict2pd_series (4)

dump_2d_array (3)

download_file (3)

filename_only (3)

is_file_valid (2)

remove_duplicate_sequences (2)

scrap_links_from_web_page (2)

load_string (2)

load_list (2)

combine_duplicate_rows (2)

get_n_majors_from_df (2)

concatenate_files (2)

flatten_2d_array (2)

dump_list (2)

get_most_similar_word_pairs (2)

get_time (1)

join_lines (1)

load_2d_array (1)

find_file_by_tail (1)

Métodos Frequentes

combine_duplicate_rows (2)

get_n_majors_from_df (2)

concatenate_files (2)

flatten_2d_array (2)

dump_list (2)

get_most_similar_word_pairs (2)

get_time (1)

join_lines (1)

load_2d_array (1)

find_file_by_tail (1)

ls (1)

dump_dict (1)

decompress_file (1)

count_reads_statistics (1)

count_raw_reads_statistics (1)

count_assembly_statistics (1)

count_assembly_coverages (1)

Métodos Frequentes

ls (1)

dump_dict (1)

decompress_file (1)

count_reads_statistics (1)

count_raw_reads_statistics (1)

count_assembly_statistics (1)

count_assembly_coverages (1)

Exemplo n.º 1

0

Exibir arquivo

Arquivo: ReferenceDescriber.py Projeto: ivasilyev/curated_projects

def annotate(self):
    """Assemble ``self.merged_df`` from FASTA header tables and the VFs sheet.

    Steps, in order:

    1. Load the annotation TSV (``self.annotation_file``) and parse every
       ``former_id`` value with ``self._mp_parse_nfasta_header``.
    2. Collect the ``>`` header lines of ``self._raw_pfasta_file`` and parse
       them with ``self._mp_parse_pfasta_header``.
    3. Read the ``VFs`` sheet of ``VFs.xls`` located next to the protein
       FASTA file.
    4. Concatenate the three tables column-wise on ``vfdb_id``, keep rows with
       a non-empty ``former_id`` and left-merge the result onto the raw
       annotation table.

    Sets ``self._raw_nfasta_df``, ``self._processed_nfasta_df``,
    ``self._processed_pfasta_df`` and ``self.merged_df``; returns nothing.
    """
    # Table assembled from the nucleotide FASTA headers
    self._raw_nfasta_df = Utilities.load_tsv(self.annotation_file)
    raw_nfasta_headers = self._raw_nfasta_df["former_id"].values.tolist()
    processed_nfasta_headers = [
        Utilities.dict2pd_series(i)
        for i in Utilities.multi_core_queue(
            self._mp_parse_nfasta_header, raw_nfasta_headers)
    ]
    self._processed_nfasta_df = Utilities.merge_pd_series_list(
        processed_nfasta_headers).sort_values("former_id")
    # Zero-padding width = length of the LONGEST ID. The previous
    # `len(max(ids))` measured the lexicographically greatest string instead
    # (e.g. "9" beats "10"), which can yield a width that is too small.
    zf_len = max(
        len(i) for i in self._processed_nfasta_df["vfdb_id"].values.tolist())
    # Join table assembled from pFASTA headers
    raw_pfasta_headers = []
    with open(self._raw_pfasta_file, mode="r", encoding="utf-8") as _f:
        for _line in _f:
            if _line.startswith(">"):
                raw_pfasta_headers.append(re.sub("^>", "", _line).strip())
    # Drop empty headers and duplicates, keep a deterministic order
    raw_pfasta_headers = sorted(
        {i for i in raw_pfasta_headers if len(i) > 0})
    processed_pfasta_headers = [
        Utilities.dict2pd_series(i)
        for i in Utilities.multi_core_queue(
            self._mp_parse_pfasta_header, raw_pfasta_headers)
    ]
    self._processed_pfasta_df = Utilities.merge_pd_series_list(
        processed_pfasta_headers).sort_values("protein_header")
    self._processed_pfasta_df["vfdb_id"] = self._processed_pfasta_df[
        "vfdb_id"].str.zfill(zf_len)
    # Join provided table. Note the table file placed into the same dir with
    # the merged protein FASTA file
    vfs_table_file = os.path.join(
        os.path.dirname(self._raw_pfasta_file), "VFs.xls")
    vfs_df = pd.read_excel(
        vfs_table_file, sheet_name="VFs", header=1).fillna("")
    # Raw string: "\d" in a plain literal is an invalid escape sequence
    vfs_df["vfdb_id"] = vfs_df["VFID"].str.extract(r"VF(\d+)")[0].str.zfill(
        zf_len)
    self.merged_df = pd.concat(
        [
            i.set_index("vfdb_id").sort_index()
            for i in [
                self._processed_nfasta_df,
                self._processed_pfasta_df,
                vfs_df,
            ]
        ],
        axis=1,
        sort=False,
    ).sort_index()
    self.merged_df.index.names = ["vfdb_id"]
    # Keep only the IDs that are present in the nucleotide table
    self.merged_df = self.merged_df.loc[
        self.merged_df["former_id"].str.len() > 0].reset_index()
    self.merged_df = Utilities.left_merge(
        self._raw_nfasta_df, self.merged_df, "former_id")

Exemplo n.º 2

0

Exibir arquivo

def annotate(self):
    """Build the nucleotide, protein and merged annotation tables.

    Reads the tab-separated ``self.annotation_file``, parses its
    ``former_id`` values, then parses the ``>`` header lines of
    ``self._raw_pfasta_file`` and merges both tables.

    Sets ``self._raw_nfasta_df``, ``self._processed_nfasta_df``,
    ``self.nfasta_df``, ``self.pfasta_df`` and ``self.merged_df``; returns
    nothing.
    """
    # Process nucleotide FASTA
    self._raw_nfasta_df = pd.read_table(
        self.annotation_file, sep="\t", header=0)
    raw_nfasta_headers = self._raw_nfasta_df["former_id"].values.tolist()
    processed_nfasta_headers = [
        Utilities.dict2pd_series(i)
        for i in Utilities.multi_core_queue(
            self._mp_parse_nfasta_header, raw_nfasta_headers)
    ]
    self._processed_nfasta_df = Utilities.merge_pd_series_list(
        processed_nfasta_headers).sort_values("former_id")
    self.nfasta_df = Utilities.left_merge(
        self._raw_nfasta_df, self._processed_nfasta_df, "former_id")
    # Process protein FASTA.
    # The file is opened in a context manager so the handle is closed
    # deterministically; it used to be opened inside a comprehension and
    # was never closed explicitly.
    with open(self._raw_pfasta_file, mode="r", encoding="utf-8") as _f:
        stripped_headers = (
            re.sub("^>", "", _line).strip()
            for _line in _f
            if _line.startswith(">")
        )
        # Consumed inside the `with` block, while the file is still open
        raw_pfasta_headers = sorted(
            {i for i in stripped_headers if len(i) > 0})
    # NOTE(review): protein headers go through the *nucleotide* header
    # parser here - presumably both share one format; confirm.
    processed_pfasta_headers = [
        Utilities.dict2pd_series(i)
        for i in Utilities.multi_core_queue(
            self._mp_parse_nfasta_header, raw_pfasta_headers)
    ]
    self.pfasta_df = Utilities.merge_pd_series_list(
        processed_pfasta_headers).sort_values("former_id")
    # Give protein-specific columns distinct names before merging
    self.pfasta_df.rename(
        columns={
            "geninfo_id": "protein_geninfo_id",
            "refseq_id": "genpept_id",
            "description": "protein_description",
            "host": "protein_host",
        },
        inplace=True,
    )
    self.merged_df = Utilities.left_merge(
        self.nfasta_df, self.pfasta_df, "tadb_id", "category", "gene_symbol")
    self.merged_df = Utilities.combine_duplicate_rows(
        self.merged_df, "reference_id")

Exemplo n.º 3

0

Exibir arquivo

def query2fasta(self):
    """Collect FASTA records and annotation series for rows mentioning the gene.

    Every item of ``self._row_soups_list`` is converted with
    ``self.parse_table_row``. A row is kept when ``self._gene`` occurs
    (case-insensitively) in its "Name", "Description" or "Aliases" value and
    its "Sequence Location" splits on ".." into exactly two parts.

    Returns:
        dict with two parallel lists: "FASTAs_list" (results of
        ``self._get_fasta``) and "annotations_series_list" (``pd.Series``
        objects named after the corresponding FASTA header).
    """
    searched_fields = ("Name", "Description", "Aliases")
    fastas = []
    annotations = []
    for row_soup in self._row_soups_list:
        row = self.parse_table_row(row_soup)
        location_bounds = row["Sequence Location"].split("..")
        gene_query = self._gene.lower()
        gene_mentioned = any(
            gene_query in row[field].lower() for field in searched_fields)
        # Skip rows that do not mention the gene or lack a start..end pair
        if not gene_mentioned or len(location_bounds) != 2:
            continue
        sequence = self._get_fasta(row["Sequence ID"], location_bounds)
        fastas.append(sequence)
        annotations.append(
            pd.Series(Utilities.dict2pd_series(row), name=sequence.header))
    return {"FASTAs_list": fastas, "annotations_series_list": annotations}

Exemplo n.º 4

0

Exibir arquivo

def _mp_parse_nfasta_header(header):
    """Parse one pipe-delimited nucleotide FASTA header into annotation fields.

    Extracted fields: ``former_id`` (the header itself), ``genbank_id``,
    ``is_antisense_strand`` (whether the third ``|``-separated field starts
    with "-"), ``locus``, ``aro_id``, ``host`` (bracketed text in the last
    field), ``gene_description`` (last field without the bracketed host) and
    ``gene_symbol`` (shortest word of the description that is at least three
    characters long and is not "DNA"/"RNA"; first one wins on ties).

    Args:
        header: FASTA header string without the leading ">".

    Returns:
        The result of ``Utilities.dict2pd_series`` applied to the field dict.

    Raises:
        IndexError: if the header has fewer than three ``|``-separated fields.
    """
    _MIN_GENE_SYMBOL_LENGTH = 3
    _NON_GENE_SYMBOL_WORDS = ("DNA", "RNA")

    fields = header.split("|")
    output_dict = dict(former_id=header)
    # Raw strings below: "\|", "\d", "\-" and "\[" are invalid escape
    # sequences in plain string literals.
    output_dict["genbank_id"] = Utilities.safe_findall(
        r"^gb\|([^|]+)", header)
    output_dict["is_antisense_strand"] = fields[2].startswith("-")
    output_dict["locus"] = Utilities.safe_findall(r"\|(\d+\-\d+)", header)
    output_dict["aro_id"] = Utilities.safe_findall(r"\|ARO:(\d+)", header)
    gene_chunk = fields[-1]
    output_dict["host"] = Utilities.safe_findall(r"\[(.+)\]", gene_chunk)
    output_dict["gene_description"] = gene_chunk.replace(
        "[{}]".format(output_dict["host"]), "").strip()
    words = (
        word.strip() for word in output_dict["gene_description"].split(" "))
    # default="" keeps a description without any qualifying word from
    # aborting the run with "min() arg is an empty sequence"
    output_dict["gene_symbol"] = min(
        (
            word for word in words
            if len(word) >= _MIN_GENE_SYMBOL_LENGTH
            and word not in _NON_GENE_SYMBOL_WORDS
        ),
        key=len,
        default="",
    )
    return Utilities.dict2pd_series(output_dict)