def read_ipr(io_buffer, whitelist=None):
    """Build "Dbxref" annotations from tab-separated lines.

    Every line of ``io_buffer`` is split on tabs and may contribute up to
    three annotations, all keyed by the stripped first column:

    * ``<COLUMN 3 UPPERCASED>:<column 4>`` when column 3 (stripped and
      lowercased) is found in ``whitelist``, or unconditionally when
      ``whitelist`` is empty or None;
    * column 13 (stripped) when it contains "GO:";
    * ``"InterPro:" + column 11`` (stripped) when it contains "IPR".

    Args:
        io_buffer: iterable of text lines.
        whitelist: optional collection of lowercase database names used to
            filter the column 3/4 annotation. None or empty disables the
            filter.

    Returns:
        A sorted list of Annotation objects with duplicates removed.
    """
    ipr_list = []
    for line in io_buffer:
        columns = line.split("\t")  # columns are assumed to be tab-separated
        mrna_id = columns[0].strip()

        # Both the database name (3) and the accession (4) must exist;
        # the whitelist is only consulted when one was actually given, so
        # the default of None no longer breaks the membership test.
        if len(columns) > 4:
            database = columns[3].strip()
            if not whitelist or database.lower() in whitelist:
                ipr_list.append(
                    Annotation(mrna_id, "Dbxref",
                               database.upper() + ":" + columns[4].strip()))

        # GO annotations are never filtered by the whitelist
        if len(columns) > 13 and "GO:" in columns[13]:
            ipr_list.append(
                Annotation(mrna_id, "Dbxref", columns[13].strip()))

        # IPR annotations are never filtered by the whitelist
        if len(columns) > 11 and "IPR" in columns[11]:
            ipr_list.append(
                Annotation(mrna_id, "Dbxref",
                           "InterPro:" + columns[11].strip()))

    # Sorting puts equal annotations next to each other, so dropping every
    # item equal to its predecessor removes all duplicates.
    ipr_list.sort()
    return [
        annotation for index, annotation in enumerate(ipr_list)
        if index == 0 or annotation != ipr_list[index - 1]
    ]
def read_sprot(blast_file, gff_file, fasta_file):
    """Combine blast, gff and fasta data into name/product annotations.

    For each mrna -> dbxref pair from the blast data, a "name" annotation
    is emitted on the mrna's parent gene and a "product" annotation on the
    mrna itself. Pairs whose dbxref is missing from the fasta data, or
    whose mrna is missing from the gff data, are reported on stdout and
    skipped.

    Returns:
        A list of Annotation objects, two per usable blast entry.
    """
    # retrieve relevant information from files
    fasta_info = get_fasta_info(fasta_file)  # dbxref -> (product, name)
    gff_info = get_gff_info(gff_file)        # mrna -> parent gene id
    blast_info = get_blast_info(blast_file)  # mrna -> dbxref

    annotations = []
    for mrna, dbxref in blast_info.items():
        # neither mismatch should occur, but just in case...
        if dbxref not in fasta_info:
            print(mrna + " has dbxref " + dbxref +
                  " that's not in the fasta. Skipping...")
        elif mrna not in gff_info:
            print(mrna + " not in gff. Skipping...")
        else:
            product = fasta_info[dbxref][0]
            gene_name = fasta_info[dbxref][1]
            gene_id = gff_info[mrna]
            annotations.extend([
                Annotation(gene_id, "name", gene_name),
                Annotation(mrna, "product", product),
            ])
    return annotations
def test_read_sprot_missing_gene_name(self):
    """read_sprot must still yield a gene name for a header without GN=.

    The fasta fixture is replaced by a single record whose header carries
    no "GN=" field; the expected gene name is "MRH4".
    """
    # NOTE(review): presumably the name is taken from the MRH4_EMENI entry
    # name when GN= is absent -- confirm against get_fasta_info.
    self.fasta_file = io.StringIO(
        '>sp|Q5AZY1|MRH4_EMENI ATP-dependent RNA helicase mrh4, mitochondrial '
        'OS=Emericella nidulans (strain FGSC A4 / ATCC 38163 / CBS 112.46 / '
        'NRRL 194 / M139) PE=3 SV=1\n'
        'MNRLGGLSLPLRPVCLFCRAQTSLALSPLQGGQAVRSIATGRLRRRARMTLSKDVAKSSL\n'
        'KPKRTDRGKLGPFPNMNQTRARVREDPRSRSPAALKRSGETEEKPAMNTESPLYKALKMQ\n'
        'TALAPISYGKRTAIKAKIAEITSFDAFTLLPIVRNSIFSQALPGIADAVPTPIQRVAIPR\n'
        'LLEDAPAKKQAKKVDDDEPQYEQYLLAAETGSGKTLAYLIPVIDAIKRQEIQEKEMEKKE\n'
        'EERKVREREENKKNQAFDLEPEIPPPSNAGRPRAIILVPTAELVAQVGAKLKAFAHTVKF\n'
        'RSGIISSNLTPRRIKSTLFNPAGIDILVSTPHLLASIAKTDPYVLSRVSHLVLDEADSLM\n'
        'DRSFLPISTEVISKAAPSLQKLIFCSATIPRSLDSQLRKLYPDIWRLTTPNLHAIPRRVQ\n'
        'LGVVDIQKDPYRGNRNLACADVIWSIGKSGAGSDEAGSPWSEPKTKKILVFVNEREEADE\n'
        'VAQFLKSKGIDAHSFNRDSGTRKQEEILAEFTEPAAVPTAEEILLARKQQQRENINIPFV\n'
        'LPERTNRDTERRLDGVKVLVTTDIASRGIDTLALKTVILYHVPHTTIDFIHRLGRLGRMG\n'
        'KRGRAVVLVGKKDRKDVVKEVREVWFGLDS')
    sprot_list = read_sprot(self.blast_file, self.gff_file, self.fasta_file)
    expected = [
        Annotation("g.4830", "name", "MRH4"),
        Annotation("m.4830", "product",
                   "ATP-dependent RNA helicase mrh4, mitochondrial"),
    ]
    # assertEquals is a deprecated alias that was removed in Python 3.12
    self.assertEqual(sprot_list, expected)
def build_annotations(self, subcorpus, lu, frame):
    """Build Annotation objects from the children of a subcorpus element.

    Each child of ``subcorpus`` is scanned for ``text`` and
    ``annotationSet`` elements (tag names are compared after removing
    ``self.replace_tag``). An Annotation is created for every
    annotationSet whose status is "MANUAL" or "AUTO_EDITED", and is then
    filled from its layers:

    * "FE" layer: each label adds a frame-element mapping, either to the
      text covered by its inclusive start/end offsets (also recorded as a
      span) or, when the offsets are absent, to the label's ``itype``;
    * "Sent" layer: a "Metaphor" label marks the annotation as metaphor;
    * "Target" layer: a "Target" label sets the target text.

    Args:
        subcorpus: XML element with a ``name`` attribute.
        lu: passed through to every Annotation.
        frame: passed through to every Annotation.

    Returns:
        List of the Annotation objects that were created.
    """
    # Kept under its own name: the frame-element names read further down
    # must not replace the subcorpus name used for later annotations.
    subcorpus_name = subcorpus.attrib['name']
    annotations = []
    # Elements are iterated directly; Element.getchildren() is gone from
    # xml.etree.ElementTree since Python 3.9.
    for child in subcorpus:
        if "metaphor" in child.attrib:
            print(child.attrib)
        for c2 in child:
            tag = c2.tag.replace(self.replace_tag, "")
            if tag == "text":
                # NOTE(review): the text element is expected to precede
                # the annotationSet elements that refer to it.
                sentence = c2.text
                continue
            if tag != "annotationSet":
                continue

            if len(c2.attrib) > 3:
                print(c2.attrib)
            status = c2.attrib['status']
            ID = int(c2.attrib['ID'])
            if status not in ["MANUAL", "AUTO_EDITED"]:
                continue

            new = Annotation(ID, status, sentence, subcorpus_name, lu, frame)
            for layer in c2:
                layer_name = layer.attrib['name']
                if layer_name == "FE":
                    for label in layer:
                        fe_name = label.attrib['name']
                        # Both offsets are required for a text span
                        if 'start' in label.attrib and 'end' in label.attrib:
                            start = int(label.attrib['start'])
                            end = int(label.attrib['end'])
                            # 'end' is inclusive, hence the + 1
                            new.add_fe_mapping(
                                fe_name, new.sentence[start:end + 1])
                            new.set_spans(fe_name, (start, end))
                        else:
                            new.add_fe_mapping(fe_name, label.attrib['itype'])
                elif layer_name == "Sent":
                    for label in layer:
                        if label.attrib['name'] == "Metaphor":
                            new.set_metaphor()
                elif layer_name == "Target":
                    for label in layer:
                        if label.attrib['name'] == "Target":
                            start = int(label.attrib['start'])
                            end = int(label.attrib['end'])
                            new.set_target(new.sentence[start:end + 1])
            annotations.append(new)
    return annotations
def test_to_sequence(self):
    """to_sequence must give back the sequence fed to to_decoder_input."""
    annotation = Annotation(["x", "y"], {})

    # Action sequence for: func(x, y)
    original = [
        Root, Call,
        Expr, Name, Str, "func", CLOSE_NODE,
        Expand2,
        Expr, Name, Str, "x", CLOSE_NODE,
        Expr, Name, Str, "y", CLOSE_NODE,
    ]

    _next_info, decoder_input = to_decoder_input(original, annotation,
                                                 grammar)
    restored = to_sequence(decoder_input, annotation, grammar)

    self.assertEqual(restored, original)
def find_objects_in_segmented_frames(frames: List[Tuple[int, np.array]],
                                     box_min_size: Tuple) -> List[Annotation]:
    """Turn the external contours of segmented frames into 'car' annotations.

    Args:
        frames: (frame index, image) pairs; each image is handed to
            cv2.findContours.
        box_min_size: (min width, min height); a bounding box is kept only
            when it is strictly larger than both values.

    Returns:
        One Annotation labelled 'car' per sufficiently large bounding box.
    """
    found = []
    for frame_idx, frame in frames:
        contours, _hierarchy = cv2.findContours(frame, cv2.RETR_EXTERNAL,
                                                cv2.CHAIN_APPROX_NONE)
        for contour in contours:
            left, top, width, height = cv2.boundingRect(contour)
            big_enough = (width > box_min_size[0]
                          and height > box_min_size[1])
            if not big_enough:
                continue
            # NOTE(review): 'width' and 'height' receive the right and
            # bottom edge coordinates, not the box extent -- confirm this
            # is what Annotation expects.
            box = Annotation(
                frame=frame_idx,
                left=left,
                top=top,
                width=left + width,
                height=top + height,
                label='car',
            )
            found.append(box)
    return found
def test_to_decoder_input(self):
    """Exercise to_decoder_input on complete, partial and invalid input."""

    def as_ints(array):
        # Compare array contents as plain nested lists of ints
        return array.astype(np.int32).tolist()

    annotation = Annotation(["x", "y"], {})

    # Reusable fragments of the action sequence for ``func(x, y)``
    callee = [Root, Call, Expr, Name, Str, "func", CLOSE_NODE]
    arg_x = [Expr, Name, Str, "x", CLOSE_NODE]
    arg_y = [Expr, Name, Str, "y", CLOSE_NODE]

    # func(x, y) -- a complete sequence: nothing left to expand
    complete = callee + [Expand2] + arg_x + arg_y
    next_node_type, decoder_input = to_decoder_input(complete, annotation,
                                                     grammar)
    self.assertEqual(next_node_type, None)
    self.assertEqual(
        as_ints(decoder_input.action),
        [[0, 0, 0], [1, 0, 0], [5, 0, 0], [2, 0, 0], [3, 0, 0], [0, 0, 0],
         [0, 2, 0], [4, 0, 0], [5, 0, 0], [2, 0, 0], [3, 0, 0], [0, 1, 0],
         [0, 2, 0], [5, 0, 0], [2, 0, 0], [3, 0, 0], [0, 0, 1], [0, 2, 0]])
    self.assertEqual(
        as_ints(decoder_input.action_type[:, 0]),
        [1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0])
    self.assertEqual(
        as_ints(decoder_input.action_type[:, 1]),
        [0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1])
    self.assertEqual(
        as_ints(decoder_input.action_type[:, 2]),
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0])
    self.assertEqual(
        as_ints(decoder_input.node_type),
        [0, 1, 2, 4, 5, 6, 6, 3, 2, 4, 5, 6, 6, 2, 4, 5, 6, 6])
    self.assertEqual(
        as_ints(decoder_input.parent_action),
        [0, 0, 1, 5, 2, 3, 3, 1, 4, 5, 2, 3, 3, 4, 5, 2, 3, 3])
    self.assertEqual(
        as_ints(decoder_input.parent_index),
        [-1, 0, 1, 2, 3, 4, 4, 1, 7, 8, 9, 10, 10, 7, 13, 14, 15, 15])

    # func(x, -- the second argument is still to be generated
    partial = callee + [Expand2] + arg_x
    next_info, _ = to_decoder_input(partial, annotation, grammar)
    self.assertEqual(next_info, (expr, 7))

    # func(x, y)foo -- a trailing action after a complete tree is rejected
    trailing = callee + [Expand2] + arg_x + arg_y + [Name]
    self.assertEqual(to_decoder_input(trailing, annotation, grammar), None)

    # func2(x, y) -- rejected
    other_callee = [Root, Call, Expr, Name, Str, "func2", CLOSE_NODE]
    renamed = other_callee + [Expand2] + arg_x + arg_y
    self.assertEqual(to_decoder_input(renamed, annotation, grammar), None)

    # func(--, y) -- the first argument lacks its leading Expr: rejected
    malformed = callee + [Expand2, Name, Str, "x", CLOSE_NODE] + arg_y
    self.assertEqual(to_decoder_input(malformed, annotation, grammar), None)

    # empty sequence -- the root is expected next
    outcome = to_decoder_input([], annotation, grammar)
    self.assertEqual(outcome[0], (ROOT, -1))

    # func( -- only the callee has been generated
    outcome = to_decoder_input(list(callee), annotation, grammar)
    self.assertEqual(outcome[0], (expr_, 1))

    # only Root and Call generated
    outcome = to_decoder_input([Root, Call], annotation, grammar)
    self.assertEqual(outcome[0], (expr, 1))
def test_annotation(self):
    """to_encoder_input maps query words to ids via the vocabulary.

    The third word, "unknown", is not a vocabulary key and is expected to
    come out as the id stored under "<unknown>".
    """
    vocabulary = {"word1": 1, "word2": 2, "<unknown>": 3}
    annotation = Annotation(["word1", "word2", "unknown"], {})

    encoded = to_encoder_input(annotation, vocabulary)
    query_ids = encoded.query.tolist()

    self.assertEqual(query_ids, [1, 2, 3])
def test_read_sprot(self):
    """read_sprot on the default fixtures yields one name and one product."""
    sprot_list = read_sprot(self.blast_file, self.gff_file, self.fasta_file)
    expected = [
        Annotation("g.4830", "name", "mrh4"),
        Annotation("m.4830", "product",
                   "ATP-dependent RNA helicase mrh4, mitochondrial"),
    ]
    # assertEquals is a deprecated alias that was removed in Python 3.12
    self.assertEqual(sprot_list, expected)