def parse_trnascan(trnascan_file, identifiers):
    """Parse tRNAscan results into a MultiArray.

    The file is read in two phases: header lines (version, mode and the
    dashed separator that ends the header) followed by data rows with
    nine whitespace/tab separated columns.
    """
    content = file_io.read_file(trnascan_file)
    meta = {'file': trnascan_file}
    hits = defaultdict(list)
    in_header = True
    for raw_line in content.split('\n'):
        if in_header:
            cols = re.split(' +', raw_line)
            if len(cols) <= 1:
                continue
            if cols[1].startswith('v.'):
                meta['version'] = cols[1]
            elif cols[1] == 'Mode:':
                meta['mode'] = cols[2]
                meta['field_id'] = "trnascan_%s" % cols[2].lower()
            elif cols[1].startswith('------'):
                # Dashed rule marks the end of the header section.
                in_header = False
        else:
            cols = re.split(r' +|\t', raw_line)
            # Data rows have exactly nine columns; tRNA type is column 4,
            # anticodon is column 5, sequence name is column 0.
            if len(cols) == 9:
                hits[cols[0]].append([cols[4], cols[5]])
    if not identifiers.validate_list(list(hits.keys())):
        raise UserWarning('Contig names in the tRNAScan file did not match dataset identifiers.')
    # Keep one (possibly empty) list per identifier, in dataset order.
    values = [hits.get(seq_id, []) for seq_id in identifiers.values]
    return MultiArray(meta['field_id'],
                      values=values,
                      meta=meta,
                      headers=('tRNA_type', 'Anticodon'),
                      parents=['children'])
def parse_busco(busco_file, identifiers):  # pylint: disable=too-many-locals
    """Parse BUSCO results into a MultiArray.

    Handles both pre-v4 and v4+ BUSCO full-table layouts:
    pre-v4 tables start data at line 6 and name the sequence column
    "Contig" (or "Sequence"), while v4+ tables start at line 4, use
    "Sequence", and may suffix coordinates as "name:start-end".
    Raises UserWarning if contig names do not match the dataset
    identifiers.
    """
    data = file_io.read_file(busco_file)
    lines = data.split("\n")
    # Line 0 holds the BUSCO version, line 1 the lineage summary.
    version = lines[0].split(":")[1].strip()
    desc = re.split(r":\s*|\(|\)\s*|,\s*", lines[1])
    meta = {
        "version": version,
        "set": desc[1].strip(),
        # NOTE(review): takes the larger of two counts from the summary
        # line — presumably total vs. searched BUSCOs; confirm against
        # the BUSCO short-summary format.
        "count": max(int(desc[5].strip()), int(desc[7].strip())),
        "file": busco_file,
    }
    version = int(version.split(".")[0])
    if version < 4:
        rows = [re.split("\t", line) for line in lines[5:]]
        # Recover the odb lineage name from the recorded command line
        # (-l <path>/<lineage_odbN>/).
        meta["set"] = re.search(
            r"-l\s.*?\/*(\w+_odb\d+)\/", lines[2].split(":")[1].strip()
        )[1]
        columns = re.split(r"# |\t", lines[4])[1:]
        # Older tables use "Contig"; some variants use "Sequence".
        try:
            contig_index = columns.index("Contig")
        except ValueError:
            contig_index = columns.index("Sequence")
    else:
        rows = [re.split("\t", line) for line in lines[3:]]
        columns = re.split(r"# |\t", lines[2])[1:]
        contig_index = columns.index("Sequence")
    meta["field_id"] = "%s_busco" % meta["set"]
    busco_index = columns.index("Busco id")
    status_index = columns.index("Status")
    results = defaultdict(list)
    for row in rows:
        if len(row) > contig_index:
            if version < 4:
                contig = row[contig_index]
            else:
                # v4+ may append coordinates ("name:start-end") — keep
                # only the sequence name.
                contig = row[contig_index].split(":")[0]
            results[contig].append([row[busco_index], row[status_index]])
    if not identifiers.validate_list(list(results.keys())):
        raise UserWarning(
            "Contig names in the Busco file did not match dataset identifiers."
        )
    # One (possibly empty) list of [busco_id, status] pairs per identifier.
    values = [results[id] if id in results else [] for id in identifiers.values]
    busco_field = MultiArray(
        meta["field_id"],
        values=values,
        meta=meta,
        headers=("Busco id", "Status"),
        parents=["children"],
        category_slot=1,
    )
    return busco_field
def parse_busco(busco_file, identifiers):
    """Parse BUSCO results into a MultiArray.

    Expects the pre-v4 full-table layout: summary on lines 0-2, column
    header on line 4, data rows from line 5 onwards. Raises UserWarning
    if contig names do not match the dataset identifiers.
    """
    lines = file_io.read_file(busco_file).split('\n')
    # Line 1 holds the lineage summary; split it once and reuse.
    summary = re.split(r':|\(|\)', lines[1])
    meta = {
        'version': lines[0].split(':')[1].strip(),
        'set': summary[1].strip(),
        'count': int(summary[5].strip()),
        'command': lines[2].split(':')[1].strip(),
        'file': busco_file
    }
    # Recover the odb lineage name from the recorded command line.
    meta['set'] = re.search(r'-l\s.*?\/*(\w+_odb\d+)\/', meta['command'])[1]
    meta['field_id'] = "%s_busco" % meta['set']
    header_cols = re.split(r'# |\t', lines[4])[1:]
    idx_busco = header_cols.index('Busco id')
    idx_status = header_cols.index('Status')
    idx_contig = header_cols.index('Contig')
    by_contig = defaultdict(list)
    for record in (re.split('\t', line) for line in lines[5:]):
        # Skip truncated rows that have no contig column.
        if len(record) > idx_contig:
            by_contig[record[idx_contig]].append(
                [record[idx_busco], record[idx_status]])
    if not identifiers.validate_list(list(by_contig.keys())):
        raise UserWarning(
            'Contig names in the Busco file did not match dataset identifiers.'
        )
    # One (possibly empty) list of [busco_id, status] pairs per identifier.
    values = [by_contig.get(seq_id, []) for seq_id in identifiers.values]
    return MultiArray(meta['field_id'],
                      values=values,
                      meta=meta,
                      headers=('Busco id', 'Status'),
                      parents=['children'],
                      category_slot=1)
def parse(file, **kwargs):
    """Parse a blobDB YAML file into a list of Fields.

    NOTE(review): the original docstring read "Parse all synonym files.",
    which does not match this function — it loads a blobDB and emits
    identifier, GC, length, N-count, coverage and taxonomy fields.
    """
    blob_db = file_io.load_yaml(file)
    # Record the source assembly file on the dataset metadata.
    kwargs['meta'].assembly.update({'file': blob_db['assembly_f']})
    parsed = []
    identifiers = kwargs['dependencies']['identifiers']
    if not identifiers:
        # No identifiers supplied — derive them from the blobDB blob order.
        identifiers = Identifier('identifiers',
                                 meta={'field_id': 'identifiers'},
                                 values=blob_db['order_of_blobs'],
                                 parents=[])
        kwargs['meta'].assembly.update(
            {'scaffold-count': len(identifiers.values)})
        parsed.append(identifiers)
    values = values_from_blob_db(blob_db)
    kwargs['meta'].assembly.update({'span': sum(values['lengths'])})
    parsed.append(
        Variable('gc',
                 meta={
                     'preload': True,
                     'scale': 'scaleLinear',
                     'field_id': 'gc',
                     'name': 'GC',
                     'datatype': 'float',
                     'range': [min(values['gcs']), max(values['gcs'])]
                 },
                 values=values['gcs'],
                 parents=[]))
    _min = min(values['lengths'])
    parsed.append(
        Variable('length',
                 meta={
                     'field_id': 'length',
                     'preload': True,
                     'scale': 'scaleLog',
                     'name': 'Length',
                     # A log scale cannot show zero, so clamp when the
                     # minimum length is 0.
                     'clamp': 100 if _min == 0 else False,
                     'datatype': 'integer',
                     'range': [_min, max(values['lengths'])]
                 },
                 parents=[],
                 values=values['lengths']))
    parsed.append(
        Variable('ncount',
                 meta={
                     'field_id': 'ncount',
                     'scale': 'scaleLinear',
                     'name': 'N count',
                     'datatype': 'integer',
                     'range': [min(values['n_counts']),
                               max(values['n_counts'])]
                 },
                 values=values['n_counts'],
                 parents=[]))
    # Default plot axes, only if the caller has not already set them.
    if 'z' not in kwargs['meta'].plot:
        kwargs['meta'].plot.update({'z': 'length'})
    if 'x' not in kwargs['meta'].plot:
        kwargs['meta'].plot.update({'x': 'gc'})
    # Coverage ranges are shared across all coverage libraries.
    cov_range = [math.inf, -math.inf]
    read_cov_range = [math.inf, -math.inf]
    for cov_lib, cov_meta in blob_db['covLibs'].items():
        cov_file_name = field_name_from_path(blob_db['covLibs'][cov_lib]['f'])
        covs = values["%s_cov" % cov_lib]
        read_covs = values["%s_read_cov" % cov_lib]
        # Expand the shared ranges to cover this library's values.
        cov_range = [min(covs + [cov_range[0]]), max(covs + [cov_range[1]])]
        read_cov_range = [
            min(read_covs + [read_cov_range[0]]),
            max(read_covs + [read_cov_range[1]])
        ]
        if 'y' not in kwargs['meta'].plot:
            kwargs['meta'].plot.update({'y': "%s_cov" % cov_file_name})
        parsed.append(
            Variable("%s_cov" % cov_file_name,
                     values=covs,
                     meta={
                         'field_id': "%s_cov" % cov_file_name,
                         'file': cov_meta['f']
                     },
                     parents=cov.parent() + [
                         'children', {
                             'id': 'base_coverage',
                             'clamp': 1 if cov_range[0] == 0 else False,
                             'range': cov_range
                         }, 'children'
                     ]))
        parsed.append(
            Variable("%s_read_cov" % cov_file_name,
                     values=read_covs,
                     meta={
                         'field_id': "%s_read_cov" % cov_file_name,
                         'file': cov_meta['f'],
                         'reads_mapped': cov_meta['reads_mapped'],
                         'reads_unmapped': cov_meta['reads_unmapped']
                     },
                     parents=cov.parent() + [
                         'children', {
                             'id': 'read_coverage',
                             'datatype': 'integer',
                             'clamp': 1 if read_cov_range[0] == 0 else False,
                             'range': read_cov_range
                         }, 'children'
                     ]))
    # Taxonomy ranks are read from the first blob; assumes every blob
    # shares the same rank keys — TODO confirm for partial blobDBs.
    ranks = blob_db['dict_of_blobs'][identifiers.values[0]]['taxonomy'][
        blob_db['taxrules'][0]].keys()
    for tax_rule in blob_db['taxrules']:
        if 'cat' not in kwargs['meta'].plot:
            kwargs['meta'].plot.update({'cat': "%s_phylum" % tax_rule})
        hit_list = hits_from_blob_db(blob_db, tax_rule)
        parsed.append(
            MultiArray("%s_hits" % tax_rule,
                       values=hit_list,
                       meta={
                           'field_id': "%s_hits" % tax_rule,
                           'type': 'multiarray',
                           'datatype': 'mixed',
                           'preload': False,
                           'active': False,
                           'files':
                           [m['f'] for x, m in blob_db['hitLibs'].items()]
                       },
                       parents=hits.parent() + ['children', {
                           'id': tax_rule
                       }, 'children'],
                       category_slot=None,
                       headers=['taxid', 'score']))
        for rank in ranks:
            # Category field for the rank itself, then cindex/score
            # variables nested under it.
            field_id = "%s_%s" % (tax_rule, rank)
            parsed.append(
                Category(field_id,
                         values=values[field_id],
                         meta={'field_id': field_id},
                         parents=hits.parent() + ['children', {
                             'id': tax_rule
                         }, 'children']))
            parents = hits.parent() + [
                'children', {
                    'id': tax_rule
                }, 'children', {
                    'id': field_id
                }, 'data'
            ]
            field_id = "%s_%s_cindex" % (tax_rule, rank)
            parsed.append(
                Variable(field_id,
                         values=values[field_id],
                         meta={
                             'scale': 'scaleLinear',
                             'field_id': field_id,
                             'datatype': 'integer',
                             'range': [min(values[field_id]),
                                       max(values[field_id])],
                             'preload': False,
                             'active': False
                         },
                         parents=parents))
            field_id = "%s_%s_score" % (tax_rule, rank)
            _min = min(values[field_id])
            parsed.append(
                Variable(field_id,
                         values=values[field_id],
                         meta={
                             'scale': 'scaleLog',
                             'field_id': field_id,
                             # Log scale cannot include zero scores.
                             'clamp': 1 if _min == 0 else False,
                             'datatype': 'float',
                             'range': [_min, max(values[field_id])],
                             'preload': False,
                             'active': False
                         },
                         parents=parents))
    return parsed
def create_fields(results, taxrule, files, fields=None):
    """Store BLAST results as Fields.

    Builds one shared hit-positions MultiArray, then for each result a
    Category plus linked cindex/score Variables and positions/windows
    MultiArrays.

    Args:
        results: list of parsed results; each entry is a dict with
            "field_id", "values" and a "data" dict holding "hits",
            "cindex", "score", "positions" and optional "windows*" keys.
        taxrule: taxrule name used to group the fields.
        files: source filenames recorded in the hits field meta.
        fields: optional list to append to; a new list is created when None.

    Returns:
        The list of Field objects.
    """
    if fields is None:
        fields = []
    hits_id = "%s_%s" % (taxrule, "positions")
    fields.append(
        MultiArray(
            hits_id,
            values=results[0]["data"]["hits"],
            meta={
                "field_id": hits_id,
                "name": hits_id,
                "type": "multiarray",
                "datatype": "mixed",
                "preload": False,
                "active": False,
                "files": files,
            },
            parents=["children", {
                "id": taxrule
            }, "children"],
            category_slot=None,
            headers=[
                "taxid", "start", "end", "score", "subject", "index", "title"
            ],
        ))
    for result in results:
        main = Category(
            result["field_id"],
            values=result["values"],
            meta={
                "field_id": result["field_id"],
                "name": result["field_id"]
            },
            parents=["children", {
                "id": taxrule
            }, "children"],
        )
        fields.append(main)
        parents = [
            "children",
            {
                "id": taxrule
            },
            "children",
            {
                "id": result["field_id"]
            },
            "data",
        ]
        field_id = "%s_%s" % (result["field_id"], "cindex")
        fields.append(
            Variable(
                field_id,
                values=result["data"]["cindex"],
                meta={
                    "scale": "scaleLinear",
                    "field_id": field_id,
                    "name": field_id,
                    "datatype": "integer",
                    "range": [
                        min(result["data"]["cindex"]),
                        max(result["data"]["cindex"]),
                    ],
                    "preload": False,
                    "active": False,
                },
                parents=parents,
            ))
        field_id = "%s_%s" % (result["field_id"], "score")
        _min = min(result["data"]["score"])
        fields.append(
            Variable(
                field_id,
                values=result["data"]["score"],
                meta={
                    "scale": "scaleLog",
                    "field_id": field_id,
                    "name": field_id,
                    # Log scale cannot include zero, so clamp when needed.
                    "clamp": 1 if _min == 0 else False,
                    "datatype": "float",
                    "range": [_min, max(result["data"]["score"])],
                    "preload": False,
                    "active": False,
                },
                parents=parents,
            ))
        subfield = "positions"
        field_id = "%s_%s" % (result["field_id"], subfield)
        # FIX: the original `if len(result["data"][subfield]) > 1:` assigned
        # ["name"] on both branches — the conditional was dead code.
        headers = ["name"]
        fields.append(
            MultiArray(
                field_id,
                values=result["data"][subfield],
                fixed_keys=main.keys,
                meta={
                    "field_id": field_id,
                    "name": field_id,
                    "type": "multiarray",
                    "datatype": "string",
                    "preload": False,
                    "active": False,
                    "linked_field": hits_id,
                },
                parents=parents,
                category_slot=0,
                headers=headers,
            ))
        for subfield in result["data"].keys():
            if subfield.startswith("windows"):
                field_id = "%s_%s" % (result["field_id"], subfield)
                # FIX: same dead conditional removed here.
                headers = ["name"]
                fields.append(
                    MultiArray(
                        field_id,
                        values=result["data"][subfield],
                        fixed_keys=main.keys,
                        meta={
                            "field_id": field_id,
                            "name": field_id,
                            "type": "array",
                            "datatype": "string",
                            "preload": False,
                            "active": False,
                        },
                        parents=parents,
                        category_slot=0,
                        headers=headers,
                    ))
    return fields
def parse(files, **kwargs):
    """Parse BED/TSV window-stats files into a list of Fields.

    Reads per-sequence values (and optional sd/n stats and windowed
    values) either from a directory of TSV files (``--bedtsvdir``) or
    from BED files, then emits Identifier, Variable, Array and
    MultiArray fields.

    Args:
        files: a directory path or a list of BED/TSV file paths.
        kwargs: must provide "dependencies" (with "identifiers") and
            "meta" (with .assembly and .plot); flag keys select the
            input mode.

    Returns:
        List of parsed Field objects.
    """
    # FIX: the original condition was
    # `"--bedtsvdir" in kwargs or "--bedtsvdir" in kwargs` — the same
    # test twice. Simplified behavior-identically; NOTE(review): the
    # duplicate may have been a typo for a second flag — confirm against
    # the CLI definition.
    if "--bedtsvdir" in kwargs:
        if isinstance(files, str) and path.isdir(files):
            print("Reading all TSV files in %s" % files)
            files = glob("%s/*.tsv" % files)
        filename, all_windows, full = parse_tsvfiles(files)
        filenames = {"all": filename}
    else:
        if isinstance(files, str) and path.isdir(files):
            print("Reading all BED files in %s" % files)
            files = glob("%s/*.bed" % files)
        filenames, all_windows, full = parse_bedfiles(files)
    # Newer parsers return {"values", "sd", "n"} dicts; older ones
    # return the values mapping directly.
    full_n = {}
    full_sd = {}
    if isinstance(full, dict):
        full_sd = full["sd"]
        full_n = full["n"]
        full = full["values"]
    all_windows_n = {}
    all_windows_sd = {}
    if isinstance(all_windows, dict):
        all_windows_n = all_windows["n"]
        all_windows_sd = all_windows["sd"]
        all_windows = all_windows["values"]
    parsed = []
    settings = field_settings()
    identifiers = kwargs["dependencies"]["identifiers"]
    keys = []
    if "length" in full:
        keys = list(full["length"].keys())
        lengths = list(full["length"].values())
        kwargs["meta"].assembly.update({"span": sum(lengths)})
        if "z" not in kwargs["meta"].plot:
            kwargs["meta"].plot.update({"z": "length"})
    if "gc" in full and "x" not in kwargs["meta"].plot:
        kwargs["meta"].plot.update({"x": "gc"})
    if not identifiers:
        if not keys:
            print("ERROR: Unable to set identifiers")
            sys.exit(1)
        identifiers = Identifier(
            "identifiers",
            meta={"field_id": "identifiers"},
            values=keys,
            parents=[],
        )
        kwargs["meta"].assembly.update({"scaffold-count": len(identifiers.values)})
        parsed.append(identifiers)
    # Track the widest value range seen per field suffix so only the
    # widest-ranged field of each type is preloaded.
    ranges = {
        key: {"range": [math.inf, -math.inf], "meta": {}} for key in settings.keys()
    }
    for field, data in full.items():
        filename = filenames.get(field, filenames.get("all", ""))
        if data:
            values = []
            for seq_id in identifiers.values:
                values.append(data[seq_id] if seq_id in data else 0)
            if values:
                meta = {}
                parents = []
                suffix = field.split("_")[-1]
                if suffix in settings:
                    meta = deepcopy(settings[suffix]["meta"])
                    meta.update(
                        {
                            "field_id": field,
                            "file": filename,
                        }
                    )
                    if meta["datatype"] == "integer":
                        values = [int(value) for value in values]
                    value_range = [min(values), max(values)]
                    if "clamp" in meta and value_range[0] >= meta["clamp"]:
                        meta["clamp"] = False
                    parent_range = False
                    if "parents" in settings[suffix]:
                        parents = settings[suffix]["parents"]
                        for parent in parents:
                            if "range" in parent:
                                parent_range = True
                                parent["range"][0] = min(
                                    parent["range"][0], value_range[0]
                                )
                                parent["range"][1] = max(
                                    parent["range"][1], value_range[1]
                                )
                    if not parent_range:
                        if "range" in meta:
                            meta["range"][0] = min(meta["range"][0], value_range[0])
                            meta["range"][1] = max(meta["range"][1], value_range[1])
                        else:
                            meta["range"] = value_range
                    # Skip constant fields (no spread in values).
                    if meta["range"][1] <= meta["range"][0]:
                        continue
                    if "preload" in meta and meta["preload"] == 1:
                        if value_range[1] > ranges[suffix]["range"][1]:
                            # This field has the widest range so far —
                            # preload it and demote the previous holder.
                            meta["preload"] = True
                            if "plot_axis" in settings[suffix]:
                                kwargs["meta"].plot.update(
                                    {settings[suffix]["plot_axis"]: field}
                                )
                            if "preload" in ranges[suffix]["meta"]:
                                ranges[suffix]["meta"]["preload"] = False
                            ranges[suffix].update({"range": value_range, "meta": meta})
                        else:
                            meta["preload"] = False
                    if field.endswith("_%s" % suffix):
                        meta["name"] = "%s %s" % (
                            field.replace("_%s" % suffix, ""),
                            meta["name"],
                        )
                    parsed.append(
                        Variable(
                            field,
                            meta=meta,
                            values=values,
                            parents=parents,
                        )
                    )
                    if field in full_sd:
                        stats_values = []
                        for seq_id in identifiers.values:
                            # NOTE(review): membership is tested on
                            # `data` but full_sd/full_n are indexed —
                            # assumes they share keys with data; confirm.
                            values = (
                                [full_sd[field][seq_id], full_n[field][seq_id]]
                                if seq_id in data
                                else []
                            )
                            stats_values.append(values)
                        parsed.append(
                            Array(
                                "%s_stats" % field,
                                meta={
                                    "field_id": "%s_stats" % field,
                                    "name": "%s stats" % meta["name"],
                                    "type": "array",
                                    "datatype": "mixed",
                                },
                                values=stats_values,
                                parents=parents,
                                headers=["sd", "n"],
                            )
                        )
                    for window, windows in all_windows.items():
                        windows_sd = all_windows_sd.get(window, {})
                        windows_n = all_windows_n.get(window, {})
                        if field in windows:
                            window_values = []
                            headers = [field]
                            if field in windows_sd:
                                headers += ["sd", "n"]
                            for seq_id in identifiers.values:
                                seq_values = []
                                # NOTE(review): guard tests `data` but
                                # windows[field] is indexed — assumes
                                # identical key sets; confirm.
                                if seq_id in data:
                                    for idx, value in enumerate(
                                        windows[field][seq_id]
                                    ):
                                        if meta["datatype"] == "integer":
                                            value = int(value)
                                        if field in windows_sd:
                                            value = [
                                                value,
                                                windows_sd[field][seq_id][idx],
                                                windows_n[field][seq_id][idx],
                                            ]
                                        else:
                                            value = [value]
                                        seq_values.append(value)
                                window_values.append(seq_values)
                            windows_field = "%s_windows" % field
                            # The default 0.1 window keeps the plain
                            # "_windows" name; others get a suffix.
                            if str(window) != "0.1":
                                windows_field += "_%s" % str(window)
                            parsed.append(
                                MultiArray(
                                    windows_field,
                                    meta={
                                        "field_id": windows_field,
                                        "name": "%s windows %s"
                                        % (meta["name"], window),
                                        "type": "multiarray",
                                        "datatype": "mixed",
                                    },
                                    values=window_values,
                                    parents=parents,
                                    headers=headers,
                                )
                            )
    return parsed
def create_fields(results, taxrule, files, fields=None):
    """Store BLAST results as Fields.

    Builds one shared hit-positions MultiArray, then for each result a
    Category plus linked cindex/score Variables and a positions
    MultiArray.

    Args:
        results: list of parsed results; each entry is a dict with
            'field_id', 'values' and a 'data' dict holding 'hits',
            'cindex', 'score' and 'positions'.
        taxrule: taxrule name used to group the fields.
        files: source filenames recorded in the hits field meta.
        fields: optional list to append to; a new list is created when None.

    Returns:
        The list of Field objects.
    """
    if fields is None:
        fields = []
    hits_id = "%s_%s" % (taxrule, 'positions')
    fields.append(
        MultiArray(
            hits_id,
            values=results[0]['data']['hits'],
            meta={
                'field_id': hits_id,
                'name': hits_id,
                'type': 'multiarray',
                'datatype': 'mixed',
                'preload': False,
                'active': False,
                'files': files
            },
            parents=['children', {
                'id': taxrule
            }, 'children'],
            category_slot=None,
            headers=['taxid', 'start', 'end', 'score', 'subject', 'index']))
    for result in results:
        main = Category(result['field_id'],
                        values=result['values'],
                        meta={
                            'field_id': result['field_id'],
                            'name': result['field_id']
                        },
                        parents=['children', {
                            'id': taxrule
                        }, 'children'])
        fields.append(main)
        parents = [
            'children', {
                'id': taxrule
            }, 'children', {
                'id': result['field_id']
            }, 'data'
        ]
        field_id = "%s_%s" % (result['field_id'], 'cindex')
        fields.append(
            Variable(field_id,
                     values=result['data']['cindex'],
                     meta={
                         'scale': 'scaleLinear',
                         'field_id': field_id,
                         'name': field_id,
                         'datatype': 'integer',
                         'range': [
                             min(result['data']['cindex']),
                             max(result['data']['cindex'])
                         ],
                         'preload': False,
                         'active': False
                     },
                     parents=parents))
        field_id = "%s_%s" % (result['field_id'], 'score')
        _min = min(result['data']['score'])
        fields.append(
            Variable(field_id,
                     values=result['data']['score'],
                     meta={
                         'scale': 'scaleLog',
                         'field_id': field_id,
                         'name': field_id,
                         # Log scale cannot include zero, so clamp when needed.
                         'clamp': 1 if _min == 0 else False,
                         'datatype': 'float',
                         'range': [_min, max(result['data']['score'])],
                         'preload': False,
                         'active': False
                     },
                     parents=parents))
        subfield = 'positions'
        field_id = "%s_%s" % (result['field_id'], subfield)
        # FIX: the original `if len(result['data'][subfield]) > 1:` assigned
        # ['name'] on both branches — the conditional was dead code.
        headers = ['name']
        fields.append(
            MultiArray(field_id,
                       values=result['data'][subfield],
                       fixed_keys=main.keys,
                       meta={
                           'field_id': field_id,
                           'name': field_id,
                           'type': 'multiarray',
                           'datatype': 'string',
                           'preload': False,
                           'active': False,
                           'linked_field': hits_id
                       },
                       parents=parents,
                       category_slot=0,
                       headers=headers))
    return fields