示例#1
0
文件: ipr.py 项目: darcyabjones/annie
def read_ipr(io_buffer, whitelist=None):
    """Build a sorted, duplicate-free list of "Dbxref" annotations.

    Every line of ``io_buffer`` is split on tab characters. The stripped
    first column is used as the feature id of each annotation produced for
    that line. Up to three annotations are created per line:

    * ``<column 3, upper-cased>:<column 4>`` when the line has at least five
      columns and column 3 passes the whitelist;
    * column 13 (stripped) when it contains ``"GO:"``;
    * ``"InterPro:" + column 11`` (stripped) when it contains ``"IPR"``.

    The whitelist never applies to the GO and IPR annotations.

    Args:
        io_buffer: iterable yielding tab-separated text lines.
        whitelist: optional container of database names; column 3 is
            stripped and lower-cased before the membership test. ``None``
            or an empty container accepts every database.

    Returns:
        A sorted list of ``Annotation`` objects without duplicates.
    """
    ipr_list = []
    for line in io_buffer:
        columns = line.split("\t")  # columns are assumed to be tab-separated
        feature_id = columns[0].strip()

        # The annotation text needs columns 3 AND 4, so require five columns;
        # shorter lines are skipped instead of raising IndexError.
        if len(columns) > 4:
            database = columns[3].strip()
            # Check "no whitelist" first so that whitelist=None never reaches
            # the membership test (which would raise TypeError).
            if not whitelist or database.lower() in whitelist:
                ipr_list.append(
                    Annotation(feature_id, "Dbxref",
                               database.upper() + ":" + columns[4].strip()))

        # GO annotations ignore the whitelist
        if len(columns) > 13 and "GO:" in columns[13]:
            ipr_list.append(
                Annotation(feature_id, "Dbxref", columns[13].strip()))

        # IPR annotations ignore the whitelist
        if len(columns) > 11 and "IPR" in columns[11]:
            ipr_list.append(
                Annotation(feature_id, "Dbxref",
                           "InterPro:" + columns[11].strip()))

    # After sorting, equal annotations are adjacent, so keeping only entries
    # that differ from the last kept one removes all duplicates.
    unique = []
    for annotation in sorted(ipr_list):
        if not unique or annotation != unique[-1]:
            unique.append(annotation)
    return unique
示例#2
0
文件: sprot.py 项目: ikmb/esga
def read_sprot(blast_file, gff_file, fasta_file):
    """Combine blast, gff and fasta lookups into name/product annotations.

    For every (mrna, dbxref) pair reported by ``get_blast_info`` two
    annotations are emitted, in this order: a "name" annotation on the
    parent gene id and a "product" annotation on the mrna itself. Pairs
    whose dbxref is missing from the fasta lookup, or whose mrna is missing
    from the gff lookup, are reported on stdout and skipped.
    """
    # Lookups are loaded in the order fasta, gff, blast.
    products_by_dbxref = get_fasta_info(fasta_file)  # dbxref -> (product, gene name, ...)
    gene_by_mrna = get_gff_info(gff_file)  # mrna -> parent gene id
    hits = get_blast_info(blast_file)  # mrna -> dbxref

    annotations = []
    for mrna_id, hit in hits.items():
        # Neither of these should happen, but be defensive about it.
        if hit not in products_by_dbxref:
            print(mrna_id + " has dbxref " + hit +
                  " that's not in the fasta. Skipping...")
            continue
        if mrna_id not in gene_by_mrna:
            print(mrna_id + " not in gff. Skipping...")
            continue

        fasta_entry = products_by_dbxref[hit]
        product_text = fasta_entry[0]
        gene_label = fasta_entry[1]
        parent_gene = gene_by_mrna[mrna_id]

        annotations.extend((
            Annotation(parent_gene, "name", gene_label),
            Annotation(mrna_id, "product", product_text),
        ))
    return annotations
示例#3
0
    def test_read_sprot_missing_gene_name(self):
        self.fasta_file = io.StringIO(\
                '>sp|Q5AZY1|MRH4_EMENI ATP-dependent RNA helicase mrh4, mitochondrial OS=Emericella nidulans (strain FGSC A4 / ATCC 38163 / CBS 112.46 / NRRL 194 / M139) PE=3 SV=1\n\
MNRLGGLSLPLRPVCLFCRAQTSLALSPLQGGQAVRSIATGRLRRRARMTLSKDVAKSSL\n\
KPKRTDRGKLGPFPNMNQTRARVREDPRSRSPAALKRSGETEEKPAMNTESPLYKALKMQ\n\
TALAPISYGKRTAIKAKIAEITSFDAFTLLPIVRNSIFSQALPGIADAVPTPIQRVAIPR\n\
LLEDAPAKKQAKKVDDDEPQYEQYLLAAETGSGKTLAYLIPVIDAIKRQEIQEKEMEKKE\n\
EERKVREREENKKNQAFDLEPEIPPPSNAGRPRAIILVPTAELVAQVGAKLKAFAHTVKF\n\
RSGIISSNLTPRRIKSTLFNPAGIDILVSTPHLLASIAKTDPYVLSRVSHLVLDEADSLM\n\
DRSFLPISTEVISKAAPSLQKLIFCSATIPRSLDSQLRKLYPDIWRLTTPNLHAIPRRVQ\n\
LGVVDIQKDPYRGNRNLACADVIWSIGKSGAGSDEAGSPWSEPKTKKILVFVNEREEADE\n\
VAQFLKSKGIDAHSFNRDSGTRKQEEILAEFTEPAAVPTAEEILLARKQQQRENINIPFV\n\
LPERTNRDTERRLDGVKVLVTTDIASRGIDTLALKTVILYHVPHTTIDFIHRLGRLGRMG\n\
KRGRAVVLVGKKDRKDVVKEVREVWFGLDS')
        sprot_list = read_sprot(self.blast_file, self.gff_file, self.fasta_file)
        expected = [Annotation("g.4830", "name", "MRH4"), Annotation("m.4830", "product", "ATP-dependent RNA helicase mrh4, mitochondrial")]
        self.assertEquals(sprot_list, expected)
示例#4
0
    def build_annotations(self, subcorpus, lu, frame):
        """Build Annotation objects from the annotation sets of a subcorpus.

        Walks the grandchildren of ``subcorpus``. A ``text`` element supplies
        the sentence used for the annotation sets that follow it; every
        ``annotationSet`` whose status is "MANUAL" or "AUTO_EDITED" becomes one
        ``Annotation``. The layers of that set are then applied by name:

        * "FE": each label is registered via ``add_fe_mapping``. Labels with
          both ``start`` and ``end`` map to the sentence slice (end inclusive)
          and also record their span; other labels map to their ``itype``.
        * "Sent": a label named "Metaphor" marks the annotation as metaphor.
        * "Target": a label named "Target" sets the target text.

        Args:
            subcorpus: XML element with a ``name`` attribute.
            lu: passed through unchanged to ``Annotation``.
            frame: passed through unchanged to ``Annotation``.

        Returns:
            List of the ``Annotation`` objects created, in document order.
        """
        # Kept under its own name: the frame-element loop below has a
        # per-label name that must not overwrite the subcorpus name.
        subcorpus_name = subcorpus.attrib['name']
        annotations = []
        # Iterating an element yields its direct children. This replaces
        # Element.getchildren(), which xml.etree dropped in Python 3.9.
        for child in subcorpus:
            if "metaphor" in child.attrib:
                print(child.attrib)
            for c2 in child:
                tag = c2.tag.replace(self.replace_tag, "")
                if tag == "text":
                    sentence = c2.text
                elif tag == "annotationSet":
                    if len(c2.attrib) > 3:
                        print(c2.attrib)
                    status = c2.attrib['status']
                    ID = int(c2.attrib['ID'])
                    if status not in ("MANUAL", "AUTO_EDITED"):
                        continue
                    # NOTE(review): relies on a "text" element having been
                    # seen before the first annotationSet -- confirm.
                    new = Annotation(ID, status, sentence, subcorpus_name,
                                     lu, frame)

                    for layer in c2:
                        if layer.attrib['name'] == "FE":
                            for label in layer:
                                fe_name = label.attrib['name']
                                # Both offsets are required to slice the
                                # sentence; test each key explicitly.
                                if ('start' in label.attrib
                                        and 'end' in label.attrib):
                                    start = int(label.attrib['start'])
                                    end = int(label.attrib['end'])
                                    # 'end' is inclusive, hence the + 1
                                    raw_text = new.sentence[start:end + 1]
                                    new.add_fe_mapping(fe_name, raw_text)
                                    new.set_spans(fe_name, (start, end))
                                else:
                                    new.add_fe_mapping(
                                        fe_name, label.attrib['itype'])
                        elif layer.attrib['name'] == "Sent":
                            for label in layer:
                                if label.attrib['name'] == "Metaphor":
                                    new.set_metaphor()
                        elif layer.attrib['name'] == "Target":
                            for label in layer:
                                if label.attrib['name'] == "Target":
                                    start = int(label.attrib['start'])
                                    end = int(label.attrib['end'])
                                    new.set_target(
                                        new.sentence[start:end + 1])

                    annotations.append(new)
        return annotations
示例#5
0
 def test_to_sequence(self):
     # Round trip: sequence -> decoder input -> sequence must give back the
     # original sequence.
     annotation = Annotation(["x", "y"], {})
     # Sequence for the source text: func(x, y)
     original = [
         Root, Call, Expr, Name, Str, "func", CLOSE_NODE, Expand2, Expr,
         Name, Str, "x", CLOSE_NODE, Expr, Name, Str, "y", CLOSE_NODE
     ]
     decoded = to_decoder_input(original, annotation, grammar)
     _, decoder_input = decoded
     restored = to_sequence(decoder_input, annotation, grammar)
     self.assertEqual(restored, original)
示例#6
0
def find_objects_in_segmented_frames(frames: List[Tuple[int, np.array]],
                                     box_min_size: Tuple) -> List[Annotation]:
    """Create one 'car' Annotation per sufficiently large external contour.

    For each ``(frame_idx, frame)`` pair the external contours of ``frame``
    are extracted and reduced to their bounding rectangles. A rectangle is
    kept only when its width is strictly greater than ``box_min_size[0]``
    and its height strictly greater than ``box_min_size[1]``.

    Returns the annotations in frame order, then contour order.
    """
    detections = []

    for frame_idx, frame in frames:
        contours, _ = cv2.findContours(frame, cv2.RETR_EXTERNAL,
                                       cv2.CHAIN_APPROX_NONE)

        for x, y, box_w, box_h in map(cv2.boundingRect, contours):
            if not (box_w > box_min_size[0] and box_h > box_min_size[1]):
                continue

            # NOTE(review): 'width'/'height' receive the right and bottom
            # coordinates (x + w, y + h), not the box extents -- confirm
            # this is what Annotation expects.
            detections.append(
                Annotation(
                    frame=frame_idx,
                    left=x,
                    top=y,
                    width=x + box_w,
                    height=y + box_h,
                    label='car',
                ))

    return detections
示例#7
0
    def test_to_decoder_input(self):
        annotation = Annotation(["x", "y"], {})

        def as_ints(array):
            # Convert to int32 and to nested Python lists for comparison.
            return array.astype(np.int32).tolist()

        # Immutable building blocks; every sequence below is a fresh list.
        call_func = (Root, Call, Expr, Name, Str, "func", CLOSE_NODE)
        arg_x = (Expr, Name, Str, "x", CLOSE_NODE)
        arg_y = (Expr, Name, Str, "y", CLOSE_NODE)

        # Complete sequence: func(x, y)
        sequence = [*call_func, Expand2, *arg_x, *arg_y]
        next_node_type, decoder_input = to_decoder_input(
            sequence, annotation, grammar)
        self.assertEqual(next_node_type, None)

        expected_action = [
            [0, 0, 0], [1, 0, 0], [5, 0, 0], [2, 0, 0], [3, 0, 0], [0, 0, 0],
            [0, 2, 0], [4, 0, 0], [5, 0, 0], [2, 0, 0], [3, 0, 0], [0, 1, 0],
            [0, 2, 0], [5, 0, 0], [2, 0, 0], [3, 0, 0], [0, 0, 1], [0, 2, 0],
        ]
        self.assertEqual(as_ints(decoder_input.action), expected_action)

        self.assertEqual(
            as_ints(decoder_input.action_type[:, 0]),
            [1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0])
        self.assertEqual(
            as_ints(decoder_input.action_type[:, 1]),
            [0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1])
        self.assertEqual(
            as_ints(decoder_input.action_type[:, 2]),
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0])

        self.assertEqual(
            as_ints(decoder_input.node_type),
            [0, 1, 2, 4, 5, 6, 6, 3, 2, 4, 5, 6, 6, 2, 4, 5, 6, 6])
        self.assertEqual(
            as_ints(decoder_input.parent_action),
            [0, 0, 1, 5, 2, 3, 3, 1, 4, 5, 2, 3, 3, 4, 5, 2, 3, 3])
        self.assertEqual(
            as_ints(decoder_input.parent_index),
            [-1, 0, 1, 2, 3, 4, 4, 1, 7, 8, 9, 10, 10, 7, 13, 14, 15, 15])

        # Partial sequence: func(x,
        sequence = [*call_func, Expand2, *arg_x]
        next_info, decoder_input = to_decoder_input(
            sequence, annotation, grammar)
        self.assertEqual(next_info, (expr, 7))

        # Trailing node after a complete sequence: func(x, y)foo
        sequence = [*call_func, Expand2, *arg_x, *arg_y, Name]
        outcome = to_decoder_input(sequence, annotation, grammar)
        self.assertEqual(outcome, None)

        # Different function token: func2(x, y)
        sequence = [
            Root, Call, Expr, Name, Str, "func2", CLOSE_NODE, Expand2,
            *arg_x, *arg_y
        ]
        outcome = to_decoder_input(sequence, annotation, grammar)
        self.assertEqual(outcome, None)

        # First argument lacks its leading Expr: func(--, y)
        sequence = [
            *call_func, Expand2, Name, Str, "x", CLOSE_NODE, *arg_y
        ]
        outcome = to_decoder_input(sequence, annotation, grammar)
        self.assertEqual(outcome, None)

        # Empty sequence
        outcome = to_decoder_input([], annotation, grammar)
        self.assertEqual(outcome[0], (ROOT, -1))

        # Only the callee so far: func(
        outcome = to_decoder_input([*call_func], annotation, grammar)
        self.assertEqual(outcome[0], (expr_, 1))

        # Only Root and Call
        outcome = to_decoder_input([Root, Call], annotation, grammar)
        self.assertEqual(outcome[0], (expr, 1))
示例#8
0
 def test_annotation(self):
     # "unknown" is not a key of the vocabulary, and the expected id for it
     # is 3, the id stored under "<unknown>".
     vocabulary = {"word1": 1, "word2": 2, "<unknown>": 3}
     annotation = Annotation(["word1", "word2", "unknown"], {})
     encoded = to_encoder_input(annotation, vocabulary)
     query_ids = encoded.query.tolist()
     self.assertEqual(query_ids, [1, 2, 3])
示例#9
0
 def test_read_sprot(self):
     # Expects one "name" annotation on the gene followed by one "product"
     # annotation on the mrna, built from the fixture files on self.
     sprot_list = read_sprot(self.blast_file, self.gff_file, self.fasta_file)
     expected = [
         Annotation("g.4830", "name", "mrh4"),
         Annotation("m.4830", "product",
                    "ATP-dependent RNA helicase mrh4, mitochondrial"),
     ]
     # assertEquals was a deprecated alias and no longer exists in
     # Python 3.12; assertEqual is the supported spelling.
     self.assertEqual(sprot_list, expected)