def load_names(self):
    """Populate ``self.names`` with scientific names keyed by taxid.

    Reads ``names.dmp`` from ``self.directory``; only rows whose name
    class (column 3) is ``'scientific name'`` are kept.
    """
    names_path = os.path.abspath(os.path.join(self.directory, 'names.dmp'))
    for raw_line in file_io.stream_file(names_path):
        fields = self.parse_taxdump_row(raw_line)
        if not fields or fields[3] != 'scientific name':
            continue
        self.names[int(fields[0])] = fields[1]
def parse_blast(blast_file, results=None, index=0):
    """Parse a tab-separated BLAST file into a dict of hit lists.

    Each hit records subject, score, optional start/end coordinates
    (shifted by any ``_-_<offset>`` suffix on the query id) and the file
    index. Rows too short to carry coordinates fall back to a reduced
    layout with ``start``/``end`` set to None.
    """
    if results is None:
        results = defaultdict(list)
    for raw in file_io.stream_file(blast_file):
        fields = raw.rstrip().split('\t')
        # Query ids may carry a '_-_<offset>' suffix used to shift coordinates.
        seq_id, *suffix = fields[0].split('_-_')
        shift = int(suffix[0]) if suffix else 0
        try:
            hit = {
                'subject': fields[4],
                'score': float(fields[2]),
                'start': int(fields[9]) + shift,
                'end': int(fields[10]) + shift,
                'file': index,
            }
        except IndexError:
            # Short row without coordinate columns: subject sits at index 3.
            hit = {
                'subject': fields[3],
                'score': float(fields[2]),
                'start': None,
                'end': None,
                'file': index,
            }
        try:
            hit['taxid'] = int(fields[1])
        except ValueError:
            # Non-numeric taxid field.
            hit['taxid'] = 0
        results[seq_id].append(hit)
    return results
def load_ranks(self):
    """Load taxon ranks from ``nodes.dmp`` into ``self.ranks``.

    Maps each taxid (column 0) to its rank (column 2). Exits the
    program with an error message if the file cannot be streamed.
    """
    filename = os.path.abspath(os.path.join(self.directory, 'nodes.dmp'))
    try:
        for line in file_io.stream_file(filename):
            row = self.parse_taxdump_row(line)
            # Need at least 3 fields since the rank is row[2]. The previous
            # check (len(row) > 1) let 2-field rows through, where row[2]
            # raised an uncaught IndexError.
            if len(row) > 2:
                self.ranks[int(row[0])] = row[2]
    except TypeError:
        # NOTE(review): presumably stream_file returns None for a missing or
        # unreadable file, making the for-loop raise TypeError — confirm.
        print("ERROR: Unable to parse %s." % filename)
        exit(1)
def parse_synonyms(synonym_file, delimiter, columns, header, identifiers):
    """Parse a delimited synonyms file into an Array field.

    Args:
        synonym_file: path to the file, optionally given as ``prefix=path``
            to override the field-id prefix (otherwise the file stem is used).
        delimiter: delimiter specification passed to ``set_delimiter``.
        columns: comma-separated column names, or falsy to infer.
        header: truthy when the first line is a header row.
        identifiers: project identifiers object (``to_set()`` / ``values``).

    Returns:
        Array named ``<prefix>_synonyms`` holding one list of synonyms per
        identifier (empty list for identifiers with no synonyms).
    """
    meta = {}
    synonym_file, *prefix = synonym_file.split("=")
    prefix = prefix[0] if prefix else Path(synonym_file).stem
    meta["field_id"] = "%s_synonyms" % prefix
    by_id = {}
    ids = identifiers.to_set()
    lines = list(file_io.stream_file(synonym_file))
    columns = columns.split(",") if columns else []
    delimit = set_delimiter(delimiter, sample=lines[0])
    if header:
        header_row = lines[0].rstrip().replace('"', "")
        columns = parse_header_row(delimit, header_row, columns)
        lines = lines[1:]
    try:
        id_col = columns.index("identifier")
    except ValueError:
        # No declared identifier column; detect it from the data below.
        id_col = None
    for line in lines:
        row = re.split(delimit, line.rstrip().replace('"', ""))
        key = None
        names = []
        for i, value in enumerate(row):
            if id_col is None and value in ids:
                # First row containing a known identifier fixes the column.
                key = value
                id_col = i
            elif i == id_col:
                key = value
            else:
                names.append(value)
        if key is None:
            # Row with no recognisable identifier: previously stored under a
            # None key, silently clobbering earlier unmatched rows. Skip it.
            continue
        by_id[key] = names
    values = [by_id.get(id, []) for id in identifiers.values]
    if id_col is not None and id_col < len(columns):
        # Drop the identifier column from the headers. The guard avoids a
        # TypeError (del columns[None]) when no identifier column was ever
        # found, and an IndexError when columns were never declared.
        del columns[id_col]
    synonyms_field = Array(
        meta["field_id"],
        meta=meta,
        values=values,
        headers=columns,
        parents=["children"],
    )
    return synonyms_field
def parse_blast(blast_file, cols, results=None, index=0, evalue=1, bitscore=1):
    """Parse a tab-separated BLAST file into a dict of hit lists.

    Args:
        blast_file: path to the BLAST tabular output.
        cols: mapping of column names ('qseqid', 'sseqid', 'bitscore',
            'evalue', 'sstart', 'send', 'staxids') to column indices.
            NOTE: mutated in place for 4-column rows ('sseqid' -> 3).
        results: optional dict to extend; a new ``defaultdict(list)`` when None.
        index: file index recorded on each hit.
        evalue: maximum e-value to keep (not checked for 4-column rows).
        bitscore: minimum bitscore to keep.

    Returns:
        Mapping of query sequence id -> list of hit dicts with keys
        'subject', 'score', 'start', 'end', 'file' and 'taxid'.
    """
    if results is None:
        results = defaultdict(list)
    for line in file_io.stream_file(blast_file):
        row = line.rstrip().split('\t')
        score = float(row[cols['bitscore']])
        if score < bitscore:
            continue
        if len(row) == 4:
            # Minimal 4-column format: subject id is the last column and
            # there is no e-value column to filter on.
            cols['sseqid'] = 3
        elif evalue < float(row[cols['evalue']]):
            continue
        # Query ids may carry a '_-_<offset>' suffix used to shift coordinates.
        seq_id, *offset = row[cols['qseqid']].split('_-_')
        offset = int(offset[0]) if offset else 0
        try:
            hit = {
                'subject': row[cols['sseqid']],
                'score': score,
                'start': int(row[cols['sstart']]) + offset,
                'end': int(row[cols['send']]) + offset,
                'file': index,
            }
        except IndexError:
            # File without coordinate columns.
            hit = {
                'subject': row[cols['sseqid']],
                'score': score,
                'start': None,
                'end': None,
                'file': index,
            }
        try:
            # Keep only the first taxid of a ';'-separated list. The previous
            # try/except ValueError around a starred unpack was dead code:
            # str.split always returns at least one element.
            hit['taxid'] = int(row[cols['staxids']].split(';')[0])
        except ValueError:
            # Non-numeric taxid field.
            hit['taxid'] = 0
        results[seq_id].append(hit)
    return results
def load_ancestors(self):
    """Load ancestors from file.

    Reads ``taxidlineage.dmp`` and fills ``self.ancestors`` with, for each
    taxid, a mapping of rank name -> ancestral taxid at that rank, restricted
    to the ranks returned by ``self.list_ranks()``. Ranks absent from the
    lineage are filled with the NEGATED taxid of the most recently seen rank
    (or 0 before any rank is seen) — presumably a placeholder convention so
    downstream code can distinguish inferred from real assignments; confirm
    against consumers of ``self.ancestors``.
    """
    filename = os.path.abspath(os.path.join(self.directory, 'taxidlineage.dmp'))
    for line in file_io.stream_file(filename):
        row = self.parse_taxdump_row(line)
        # row[1] holds the space-separated lineage; skip root-level rows
        # with an empty lineage.
        if row[1]:
            taxid = int(row[0])
            # Keep only lineage taxids whose rank is one of the ranks of
            # interest; key the ancestor dict by rank name.
            self.ancestors[taxid] = {
                self.ranks[int(id)]: int(id)
                for id in row[1].split(' ')
                if self.ranks[int(id)] in self.list_ranks()
            }
            # The taxon itself is its own ancestor at its own rank.
            if self.ranks[taxid] in self.list_ranks():
                self.ancestors[taxid].update({self.ranks[taxid]: taxid})
            # Walk ranks in order: whenever a rank is present, remember its
            # taxid negated; gaps inherit that negative placeholder.
            last = 0
            for rank in self.list_ranks():
                if rank in self.ancestors[taxid]:
                    last = -self.ancestors[taxid][rank]
                else:
                    self.ancestors[taxid].update({rank: last})
def parse_synonyms(synonym_file, identifiers):
    """Build a synonyms Array from a tab-separated synonyms file.

    Each line is split on tabs (double quotes stripped); any field that
    matches a known identifier becomes the key and the remaining fields
    are stored as its synonyms. Identifiers with no matching line map to
    an empty list.
    """
    meta = {'field_id': "%s_synonyms" % Path(synonym_file).stem}
    known = identifiers.to_set()
    by_id = {}
    for raw in file_io.stream_file(synonym_file):
        key = None
        names = []
        for field in raw.rstrip().replace('"', '').split('\t'):
            if field in known:
                key = field
            else:
                names.append(field)
        by_id[key] = names
    values = [by_id.get(identifier, []) for identifier in identifiers.values]
    return Array(meta['field_id'], meta=meta, values=values, parents=['children'])
def parse_blast(blast_file, cols, results=None, index=0, evalue=1, bitscore=1):
    """Parse a tab-separated BLAST file into a dict of hit lists.

    Handles three layouts:
      * blastp-style query ids ("seqid:start-end=title[=fragmented]"),
        keeping only the best-scoring hit per query;
      * blastx/blastn rows with query coordinates;
      * minimal 4-column rows without coordinates or e-values.

    Args:
        blast_file: path to the BLAST tabular output.
        cols: mapping of column names to indices. Mutated in place:
            'sseqid' is forced to 3 for 4-column rows, and missing
            'qstart'/'qend' entries are backfilled from 'sstart'/'send'.
        results: optional dict to extend; a new ``defaultdict(list)`` when None.
        index: file index recorded on each hit.
        evalue: maximum e-value to keep (not checked for 4-column rows).
        bitscore: minimum bitscore to keep.

    Returns:
        Mapping of query sequence id -> list of hit dicts.
    """
    if results is None:
        results = defaultdict(list)
    # allow for mis-specified columns following documentation bug
    # (loop-invariant, so applied once up front rather than per row)
    if "sstart" in cols and "qstart" not in cols:
        cols["qstart"] = cols["sstart"]
    if "send" in cols and "qend" not in cols:
        cols["qend"] = cols["send"]
    bitscores = {}
    blastp = {}
    for line in file_io.stream_file(blast_file):
        row = line.rstrip().split("\t")
        score = float(row[cols["bitscore"]])
        if score < bitscore:
            continue
        if len(row) == 4:
            # Minimal 4-column format: subject id is the last column and
            # there is no e-value column to filter on.
            cols["sseqid"] = 3
        elif evalue < float(row[cols["evalue"]]):
            continue
        # Query ids may carry a '_-_<offset>' suffix used to shift coordinates.
        seq_id, *offset = row[cols["qseqid"]].split("_-_")
        offset = int(offset[0]) if offset else 0
        query = row[cols["qseqid"]]
        if ":" in query and "=" in query:
            # parse blastp: query id encodes "seqid:start-end=title[=fragmented]"
            parts = query.split("=")
            if query in bitscores and score <= bitscores[query]:
                continue  # keep only the best-scoring hit per query
            if len(parts) == 3 and parts[2] == "fragmented":
                continue  # skip fragmented hits entirely
            bitscores[query] = score
            # NOTE(review): assumes seqid itself contains no ':' or '-' — confirm.
            seq_id, start, end = re.split(r"[:-]", parts[0])
            hit = {
                "subject": row[cols["sseqid"]],
                "score": score,
                "start": int(start),
                "end": int(end),
                "file": index,
                "title": parts[1],
            }
        else:
            # parse blastx/blastn
            try:
                hit = {
                    "subject": row[cols["sseqid"]],
                    "score": score,
                    "start": int(row[cols["qstart"]]) + offset,
                    "end": int(row[cols["qend"]]) + offset,
                    "file": index,
                }
            except IndexError:
                # parse file without positions
                hit = {
                    "subject": row[cols["sseqid"]],
                    "score": score,
                    "start": None,
                    "end": None,
                    "file": index,
                }
        try:
            # Keep only the first taxid of a ';'-separated list. The previous
            # try/except ValueError around a starred unpack was dead code:
            # str.split always returns at least one element.
            hit["taxid"] = int(row[cols["staxids"]].split(";")[0])
        except ValueError:
            # no taxid in file
            hit["taxid"] = 0
        if bitscores:
            # blastp mode: defer appending until the best hit per query is known
            blastp[query] = hit
        else:
            results[seq_id].append(hit)
    if bitscores:
        for query, hit in blastp.items():
            # maxsplit=1 so a title containing ':' no longer raises ValueError
            seq_id, _rest = query.split(":", 1)
            results[seq_id].append(hit)
    return results