def convert_xmfa_to_gff3(xmfa_file, fasta_genomes, window_size=3, relative_to="1"):
    """Write sliding-window percent-identity tracks as bigWig files in ``out/``.

    Despite its name, this function produces bigWig output, not GFF3.

    For every LCB returned by ``parse_xmfa(xmfa_file)`` that contains the
    ``relative_to`` sequence and at least one other sequence, the identity
    between the reference and each other sequence is computed in a window
    around every interior alignment column.  One ``position<TAB>identity``
    line per column is written to the ``"temp"`` file handle of the
    corresponding genome entry.  Finally each genome other than
    ``relative_to`` has its temp file closed and converted with
    ``convert_to_bigwig`` into ``out/<record_id>.bigwig``.

    Args:
        xmfa_file: Passed straight to ``parse_xmfa``.
        fasta_genomes: Passed straight to ``_id_tn_dict`` to build the
            per-genome lookup (``record_id``, ``len`` and ``temp`` are read
            from its entries).
        window_size: Number of columns taken on each side of the current
            column; the slice is ``[i - window_size, i + window_size)``,
            clipped at 0.
        relative_to: ``"id"`` value of the reference sequence within an LCB.

    Raises:
        OSError: If the ``out`` directory cannot be created and does not
            already exist.
    """
    label_convert = _id_tn_dict(fasta_genomes)

    try:
        os.makedirs("out")
    except OSError:
        # A pre-existing directory is the expected, harmless case.  Any other
        # failure would only resurface later as a confusing write error, so
        # report it now with its real cause.
        if not os.path.isdir("out"):
            raise

    for lcb in parse_xmfa(xmfa_file):
        ids = [seq["id"] for seq in lcb]

        # Doesn't match part of our sequence
        if relative_to not in ids:
            continue

        # Skip sequences that are JUST our "relative_to" genome
        if len(ids) == 1:
            continue

        parent = [seq for seq in lcb if seq["id"] == relative_to][0]
        others = [seq for seq in lcb if seq["id"] != relative_to]

        # A 0..0 placement is skipped (presumably "not present in this LCB"
        # -- confirm against parse_xmfa).
        if parent["start"] == 0 and parent["end"] == 0:
            continue

        corrected_parent, corrected_targets = remove_gaps(
            parent["seq"], [other["seq"] for other in others])

        # Update the parent/others with corrected sequences
        parent["corrected"] = corrected_parent
        for other, corrected in zip(others, corrected_targets):
            other["corrected"] = corrected

        # The first and last columns are deliberately left out, as before.
        for i in range(1, len(corrected_parent) - 1):
            # These depend only on the column, not on the sequence compared.
            left_bound = max(0, i - window_size)
            right_bound = i + window_size
            parent_window = corrected_parent[left_bound:right_bound]
            position = abs(parent["start"]) + i

            for other in others:
                point_pid = percent_identity(
                    parent_window,
                    other["corrected"][left_bound:right_bound],
                )
                label_convert[other["id"]]["temp"].write(
                    "%s\t%s\n" % (position, point_pid))

    # NOTE(review): the temp handle of the `relative_to` entry (if it has one)
    # is never closed here -- confirm whether that is intended.
    for key in label_convert:
        # Ignore self-self
        if key == relative_to:
            continue

        other = label_convert[key]
        other["temp"].close()

        # Single (name, length) pair describing the reference genome.
        sizes = [(label_convert[relative_to]["record_id"], label_convert[relative_to]["len"])]
        bw_file = os.path.join("out", secure_filename(other["record_id"] + ".bigwig"))

        convert_to_bigwig(other["temp"].name, sizes, bw_file)
def convert_xmfa_to_gff3(xmfa_file, sequences=None, window_size=1000, protein=False):
    """Yield one record per genome, annotated with ``match`` features.

    Every sequence of every multi-sequence LCB takes a turn as the "parent".
    For each other sequence in the same LCB a ``match`` feature spanning the
    parent's start/end is added to the parent's record, with sub-features
    obtained from ``generate_subfeatures`` (sorted by start, then passed
    through ``reduce_subfeatures``).

    Args:
        xmfa_file: Passed straight to ``parse_xmfa``.
        sequences: Passed straight to ``id_tn_dict`` to build the genome
            lookup (its entries' ``record_id`` is read here).
        window_size: Forwarded to ``generate_subfeatures``.
        protein: Forwarded to ``generate_subfeatures`` as a keyword.

    Yields:
        The ``SeqRecord`` built for each key of the genome lookup, including
        those that received no features.
    """
    label_convert = id_tn_dict(sequences)
    lcbs = parse_xmfa(xmfa_file)

    # One record per genome; the sequence itself is a fixed placeholder.
    parent_records = {}
    for genome_key in label_convert.keys():
        parent_records[genome_key] = SeqRecord(
            Seq("ACTG", IUPAC.IUPACUnambiguousDNA),
            id=label_convert[genome_key]["record_id"],
        )

    for lcb_idx, lcb in enumerate(lcbs):
        ids = [entry["id"] for entry in lcb]

        # Skip sequences that are JUST a single genome
        if len(ids) == 1:
            continue

        for parent_id in ids:
            same_id = [entry for entry in lcb if entry["id"] == parent_id]
            parent = same_id[0]
            others = [entry for entry in lcb if entry["id"] != parent_id]

            # Entries placed at 0..0 are never used as a parent.
            if parent["start"] == 0 and parent["end"] == 0:
                continue

            for o_idx, other in enumerate(others):
                # A feature representing a region of synteny between the
                # parent and the given other sequence.
                span = FeatureLocation(parent["start"], parent["end"])
                qualifiers = {
                    "source": "progressiveMauve",
                    "Target": label_convert[other["id"]]["record_id"],
                    "Target_protein": other["comment"],
                    "ID": "m_%s_%s_%s_%s"
                    % (
                        lcb_idx,
                        o_idx,
                        label_convert[parent["id"]]["record_id"],
                        label_convert[other["id"]]["record_id"],
                    ),
                    "Name": other["comment"],
                }
                match_feature = SeqFeature(
                    span,
                    type="match",
                    strand=parent["strand"],
                    qualifiers=qualifiers,
                )

                windows = generate_subfeatures(parent, window_size, other, protein=protein)
                ordered = sorted(windows, key=lambda feat: feat.location.start)
                match_feature.sub_features = []
                match_feature.sub_features.extend(reduce_subfeatures(ordered))

                parent_records[parent["id"]].features.append(match_feature)

    for record in parent_records.values():
        yield record
def total_similarity(xmfa_file, sequences=None, dice=False):
    """Print a tab-separated pairwise similarity matrix to standard output.

    For every ordered pair of sequences in each LCB, the value returned by
    ``percent_identity`` is weighted by the length of the first sequence's
    LCB segment (``end - start + 1``) and accumulated.  Each accumulated
    total is then normalised by the first genome's length, or, when ``dice``
    is true, by the mean of both genome lengths (``2 * total / (len_i +
    len_j)``).  Pairs never seen together score 0 and the diagonal is fixed
    at 100.

    Args:
        xmfa_file: Passed straight to ``parse_xmfa``.
        sequences: Passed straight to ``_id_tn_dict``; must be non-empty.
        dice: Use the symmetric normalisation described above.

    Raises:
        Exception: If ``sequences`` is ``None`` or empty.
    """
    # The message promises a non-zero number of files, so an empty collection
    # must be rejected just like None.
    if not sequences:
        raise Exception("Must provide a non-zero number of sequence files")

    label_convert = _id_tn_dict(sequences)
    lcbs = parse_xmfa(xmfa_file)

    # Accumulated, length-weighted identity keyed by (name_i, name_j).
    table = {}

    for lcb in lcbs:
        # ignore LCBs containing only one sequence (or none): no pair to compare
        if len(lcb) < 2:
            continue

        # every ordered pair of sequences within the current LCB
        for i, j in itertools.permutations(range(len(lcb)), 2):
            similarity = percent_identity(lcb[i]['seq'], lcb[j]['seq'])

            pair = (label_convert[lcb[i]['id']]['id'],
                    label_convert[lcb[j]['id']]['id'])
            # length of the first sequence's segment in this LCB (inclusive)
            length_seq_lcb = lcb[i]['end'] - (lcb[i]['start'] - 1)
            table[pair] = table.get(pair, 0) + length_seq_lcb * similarity

    # finalize total percent similarity by dividing by length of parent sequence
    for i, j in itertools.product(label_convert, repeat=2):
        i_name = label_convert[i]['id']
        j_name = label_convert[j]['id']
        pair = (i_name, j_name)

        if i_name == j_name:
            # A genome is always reported as identical to itself.
            table[pair] = 100
        elif pair not in table:
            table[pair] = 0
        elif dice:
            table[pair] = 2 * table[pair] / (
                label_convert[i]['len'] + label_convert[j]['len'])
        else:
            table[pair] = table[pair] / label_convert[i]['len']

    # print table: header row, then one row per genome in sorted key order
    names = [label_convert[key]['id'] for key in sorted(label_convert.keys())]

    sys.stdout.write('\t' + '\t'.join(names) + '\n')
    for row_name in names:
        sys.stdout.write(row_name)
        for col_name in names:
            # Cells are looked up as (column, row), matching the original layout.
            sys.stdout.write('\t%0.2f' % table[(col_name, row_name)])
        sys.stdout.write('\n')
return new_lcbs if __name__ == '__main__': parser = argparse.ArgumentParser(description='Split XMFA alignments', prog='xmfa2smallerXmfa') parser.add_argument('xmfa_file', type=argparse.FileType("r"), help='XMFA File') parser.add_argument('--window_size', type=int, help='Window size for analysis', default=10) parser.add_argument('--threshold', type=float, help='All genomes must meet N percent similarity', default=0.7) args = parser.parse_args() # Write xmfa.to_xmfa( # Split split_lcbs( # Parse xmfa.parse_xmfa(args.xmfa_file), window_size=args.window_size, threshold=args.threshold, ))