def construct_graph(seq_dict, match_dict, threshold=90):
    """Cluster sequences into components linked by high-identity matches.

    The query and reference sequences of a match are merged in a
    ``UnionFind`` when either the match's ``q_global_identity`` or its
    ``r_global_identity`` is strictly greater than ``threshold``. After all
    matches are processed, every sequence is tagged with its component label
    and collected into a ``Component`` object per label.

    Args:
        seq_dict: Mapping of sequence name to sequence object. Each object
            must expose ``name`` and a dict-like ``label``; ``label`` is
            updated in place with the key ``'component'``.
        match_dict: Mapping whose values expose ``q_name``, ``r_name``,
            ``q_global_identity`` and ``r_global_identity``.
        threshold: Exclusive identity cut-off for linking two sequences.

    Returns:
        A tuple ``(uf, component_dict)`` where ``uf`` is the ``UnionFind``
        instance and ``component_dict`` maps each component label to the
        ``Component`` holding its member sequences.
    """
    uf = UnionFind(seq_dict)
    for match in match_dict.values():
        # Both names are resolved before the threshold test so that a match
        # referring to an unknown sequence fails here regardless of identity.
        q_seq = seq_dict[match.q_name]
        r_seq = seq_dict[match.r_name]
        if (match.q_global_identity > threshold
                or match.r_global_identity > threshold):
            uf.union(q_seq.name, r_seq.name)
    uf.rename_component()

    component_dict = {}
    for seq_name, seq in seq_dict.items():
        component_label = uf.component_label[seq_name]
        seq.label['component'] = component_label
        # Create the Component only the first time a label is seen instead of
        # building a throwaway object for every sequence.
        if component_label not in component_dict:
            component_dict[component_label] = Component(component_label)
        component_dict[component_label].add_member(seq)
    return uf, component_dict
def gene_isoform_analysis(ref_gtf, seq_dict):
    """Group transcripts by gene using the attribute column of a GTF file.

    The file is read as a header-less, tab-separated table with nine columns.
    Only rows whose last column begins with
    ``transcript_id "<name>"; gene_id "<name>"`` are used; all other rows are
    skipped. For each used row the transcript's sequence object is tagged
    with its gene name and added to that gene's ``Component`` (once per
    gene).

    Args:
        ref_gtf: Path or file-like object accepted by ``pandas.read_table``.
        seq_dict: Mapping of transcript name to sequence object. Each object
            must expose a dict-like ``label``, which is updated in place
            with the key ``'gene'``. A transcript named in the file but
            absent from this mapping makes the lookup fail.

    Returns:
        Dict mapping gene name to the ``Component`` holding the gene's
        transcript sequences.
    """
    table = pd.read_table(ref_gtf, sep='\t', header=None, low_memory=False)
    table.columns = [
        'chr', 'source', 'type', 'start', 'end',
        'score', 'strand', 'phase', 'header',
    ]

    # Raw string: a bare '\S' in a normal literal is an invalid escape
    # sequence. Compiled once here rather than on every row.
    attribute_pattern = re.compile(r'transcript_id "(\S+)"; gene_id "(\S+)"')

    gene_dict = {}
    # Only the attribute column is needed, so iterate it directly instead of
    # materialising a full row object per record.
    for attributes in table['header']:
        found = attribute_pattern.match(attributes)
        if not found:
            continue
        transcript_name, gene_name = found.groups()
        seq = seq_dict[transcript_name]
        seq.label['gene'] = gene_name

        gene = gene_dict.get(gene_name)
        if gene is None:
            gene = Component(gene_name)
            gene.add_member(seq)
            gene_dict[gene_name] = gene
        elif seq not in gene.member:
            # The same transcript appears on several rows; add it only once.
            gene.add_member(seq)
    return gene_dict