def read_ipr(io_buffer, whitelist=None):
    """Returns a list of Annotation objects, each carrying an mrna_id, the "Dbxref" key and an annotation value."""
    ipr_list = []
    for line in io_buffer:
        columns = line.split("\t")  # columns are assumed to be tab-separated
        # if the column exists and either no whitelist was given or the dbxref is in
        # the whitelist (ignoring whitespace padding and case); checking the whitelist
        # last avoids the TypeError the original raised on `x in None`
        if len(columns) > 3 and (not whitelist or columns[3].strip().lower() in whitelist):
            ipr_list.append(
                Annotation(columns[0].strip(), "Dbxref",
                           columns[3].strip().upper() + ":" + columns[4].strip()))
        # if the column exists (the whitelist does not apply to GO annotations)
        if len(columns) > 13 and columns[13].find("GO:") != -1:
            ipr_list.append(
                Annotation(columns[0].strip(), "Dbxref", columns[13].strip()))
        # if the column exists (the whitelist does not apply to IPR annotations)
        if len(columns) > 11 and columns[11].find("IPR") != -1:
            ipr_list.append(
                Annotation(columns[0].strip(), "Dbxref",
                           "InterPro:" + columns[11].strip()))
    # sort, then drop adjacent duplicates
    ipr_list = sorted(ipr_list)
    ipr_list = [
        ipr_list[i] for i in range(len(ipr_list))
        if i == 0 or ipr_list[i] != ipr_list[i - 1]
    ]
    return ipr_list
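# A minimal usage sketch for read_ipr, assuming Annotation is a value type that
# compares field-by-field (a namedtuple stand-in here) and feeding one fabricated
# InterProScan-style TSV row through io.StringIO. The column layout below is an
# assumption inferred from the indexes the function reads (0, 3, 4, 11, 13).
import io
from collections import namedtuple

Annotation = namedtuple("Annotation", ["feature_id", "key", "value"])  # stand-in (assumption)

row = "\t".join([
    "m.4830", "md5", "600", "Pfam", "PF00270", "DEAD box helicase",
    "1", "100", "1e-30", "T", "01-01-2024", "IPR011545",
    "DEAD/DEAH box helicase domain", "GO:0003676",
])

annotations = read_ipr(io.StringIO(row), whitelist=["pfam"])
for a in annotations:
    print(a)
# Expected: Dbxref annotations for PFAM:PF00270, GO:0003676 and InterPro:IPR011545.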
def read_sprot(blast_file, gff_file, fasta_file):
    # retrieve the relevant information from each file
    fasta_info = get_fasta_info(fasta_file)
    gff_info = get_gff_info(gff_file)
    blast_info = get_blast_info(blast_file)
    sprot_list = []
    for mrna, dbxref in blast_info.items():  # blast_info maps mrna's to dbxrefs
        # these two if's shouldn't occur, but just in case...
        if dbxref not in fasta_info:
            print(mrna + " has dbxref " + dbxref + " that's not in the fasta. Skipping...")
            continue
        if mrna not in gff_info:
            print(mrna + " not in gff. Skipping...")
            continue
        # fasta_info maps dbxrefs to products and gene names
        product = fasta_info[dbxref][0]
        gene_name = fasta_info[dbxref][1]
        # gff_info maps mrna's to their parent gene ids
        gene_id = gff_info[mrna]
        # add annotations to the annotation list
        sprot_list.append(Annotation(gene_id, "name", gene_name))
        sprot_list.append(Annotation(mrna, "product", product))
    return sprot_list
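# A minimal sketch of the data shapes read_sprot expects from its three helpers,
# inferred from how the function indexes each mapping; the concrete values below
# are fabricated for illustration only.
blast_info = {"m.4830": "sp|Q5AZY1|MRH4_EMENI"}   # mrna id -> dbxref
gff_info = {"m.4830": "g.4830"}                   # mrna id -> parent gene id
fasta_info = {                                    # dbxref  -> (product, gene name)
    "sp|Q5AZY1|MRH4_EMENI":
        ("ATP-dependent RNA helicase mrh4, mitochondrial", "mrh4"),
}
# Pairing them as read_sprot does yields one "name" and one "product" annotation:
#   Annotation("g.4830", "name", "mrh4")
#   Annotation("m.4830", "product", "ATP-dependent RNA helicase mrh4, mitochondrial")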
def test_read_sprot_missing_gene_name(self):
    self.fasta_file = io.StringIO(
        '>sp|Q5AZY1|MRH4_EMENI ATP-dependent RNA helicase mrh4, mitochondrial OS=Emericella nidulans (strain FGSC A4 / ATCC 38163 / CBS 112.46 / NRRL 194 / M139) PE=3 SV=1\n'
        'MNRLGGLSLPLRPVCLFCRAQTSLALSPLQGGQAVRSIATGRLRRRARMTLSKDVAKSSL\n'
        'KPKRTDRGKLGPFPNMNQTRARVREDPRSRSPAALKRSGETEEKPAMNTESPLYKALKMQ\n'
        'TALAPISYGKRTAIKAKIAEITSFDAFTLLPIVRNSIFSQALPGIADAVPTPIQRVAIPR\n'
        'LLEDAPAKKQAKKVDDDEPQYEQYLLAAETGSGKTLAYLIPVIDAIKRQEIQEKEMEKKE\n'
        'EERKVREREENKKNQAFDLEPEIPPPSNAGRPRAIILVPTAELVAQVGAKLKAFAHTVKF\n'
        'RSGIISSNLTPRRIKSTLFNPAGIDILVSTPHLLASIAKTDPYVLSRVSHLVLDEADSLM\n'
        'DRSFLPISTEVISKAAPSLQKLIFCSATIPRSLDSQLRKLYPDIWRLTTPNLHAIPRRVQ\n'
        'LGVVDIQKDPYRGNRNLACADVIWSIGKSGAGSDEAGSPWSEPKTKKILVFVNEREEADE\n'
        'VAQFLKSKGIDAHSFNRDSGTRKQEEILAEFTEPAAVPTAEEILLARKQQQRENINIPFV\n'
        'LPERTNRDTERRLDGVKVLVTTDIASRGIDTLALKTVILYHVPHTTIDFIHRLGRLGRMG\n'
        'KRGRAVVLVGKKDRKDVVKEVREVWFGLDS')
    sprot_list = read_sprot(self.blast_file, self.gff_file, self.fasta_file)
    expected = [Annotation("g.4830", "name", "MRH4"),
                Annotation("m.4830", "product",
                           "ATP-dependent RNA helicase mrh4, mitochondrial")]
    self.assertEqual(sprot_list, expected)
def test_to_sequence(self):
    annotation = Annotation(["x", "y"], {})

    # "func(x, y)"
    sequence = [
        Root, Call, Expr, Name, Str, "func", CLOSE_NODE, Expand2, Expr, Name,
        Str, "x", CLOSE_NODE, Expr, Name, Str, "y", CLOSE_NODE
    ]
    _, input = to_decoder_input(sequence, annotation, grammar)
    sequence2 = to_sequence(input, annotation, grammar)
    self.assertEqual(sequence2, sequence)
def find_objects_in_segmented_frames(frames: List[Tuple[int, np.ndarray]],
                                     box_min_size: Tuple[int, int]) -> List[Annotation]:
    annotations = []
    for frame_idx, frame in frames:
        contours, _ = cv2.findContours(frame, cv2.RETR_EXTERNAL,
                                       cv2.CHAIN_APPROX_NONE)
        for contour in contours:
            left, top, width, height = cv2.boundingRect(contour)
            if width > box_min_size[0] and height > box_min_size[1]:
                annotations.append(
                    Annotation(
                        frame=frame_idx,
                        left=left,
                        top=top,
                        # note: the width/height fields receive the box's right and
                        # bottom edge coordinates (left + width, top + height)
                        width=left + width,
                        height=top + height,
                        label='car',
                    ))
    return annotations
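# A minimal, runnable sketch exercising find_objects_in_segmented_frames on a
# synthetic binary mask. The Annotation dataclass below is a stand-in for the
# project's own class (an assumption based on the keyword arguments the function
# passes), and the two-value findContours return assumes OpenCV 4.
from dataclasses import dataclass
from typing import List, Tuple

import cv2
import numpy as np


@dataclass
class Annotation:  # stand-in for the project's Annotation class (assumption)
    frame: int
    left: int
    top: int
    width: int
    height: int
    label: str


# One 100x100 binary frame with a single 30x20 white blob at (10, 40).
mask = np.zeros((100, 100), dtype=np.uint8)
mask[40:60, 10:40] = 255

boxes = find_objects_in_segmented_frames([(0, mask)], box_min_size=(5, 5))
print(boxes)
# Expected: one 'car' annotation for frame 0 with left=10, top=40.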
def build_annotations(self, subcorpus, lu, frame):
    """ Builds annotations from a subcorpus. """
    name = subcorpus.attrib["name"]
    annotations = []
    for child in subcorpus:  # direct iteration; getchildren() is deprecated
        if "metaphor" in child.attrib.keys():
            print(child.attrib)
        for c2 in child:
            tag = c2.tag.replace(self.replace_tag, "")
            if tag == "text":
                sentence = c2.text
            elif tag == "annotationSet":
                if len(c2.attrib.keys()) > 3:
                    print(c2.attrib)
                status = c2.attrib["status"]
                ID = int(c2.attrib["ID"])
                if status in ["MANUAL", "AUTO_EDITED"]:
                    new = Annotation(ID, status, sentence, name, lu, frame)
                    for c3 in c2:
                        tag = c3.tag.replace(self.replace_tag, "")
                        if c3.attrib["name"] == "FE":
                            for c4 in c3:
                                tag = c4.tag.replace(self.replace_tag, "")
                                # use a separate variable so the subcorpus name
                                # is not overwritten for later annotations
                                fe_name = c4.attrib["name"]
                                # check both keys; the original
                                # `if "start" and "end" in c4.attrib` only tested "end"
                                if "start" in c4.attrib and "end" in c4.attrib:
                                    start, end = int(c4.attrib["start"]), int(c4.attrib["end"])
                                    raw_text = new.sentence[start : end + 1]
                                    new.add_fe_mapping(fe_name, raw_text)
                                    new.set_spans(fe_name, (start, end))
                                else:
                                    new.add_fe_mapping(fe_name, c4.attrib["itype"])
                        elif c3.attrib["name"] == "Sent":
                            for c4 in c3:
                                tag = c4.tag.replace(self.replace_tag, "")
                                if c4.attrib["name"] == "Metaphor":
                                    new.set_metaphor()
                        elif c3.attrib["name"] == "Target":
                            for c4 in c3:
                                if c4.attrib["name"] == "Target":
                                    start, end = int(c4.attrib["start"]), int(c4.attrib["end"])
                                    new.set_target(new.sentence[start : end + 1])
                    annotations.append(new)
    return annotations
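# The XML shape build_annotations walks, sketched from the tags and attributes
# the code reads (the namespace prefix is stripped via self.replace_tag). The
# element and attribute names come from the code itself; the exact nesting and
# the <sentence>/<layer>/<label> element names are assumptions:
#
# <subcorpus name="...">
#   <sentence>                                  <!-- a "metaphor" attribute is logged -->
#     <text>...</text>
#     <annotationSet ID="123" status="MANUAL">
#       <layer name="FE">
#         <label name="Agent" start="0" end="4"/>  <!-- or itype="..." when unrealized -->
#       </layer>
#       <layer name="Sent">
#         <label name="Metaphor"/>
#       </layer>
#       <layer name="Target">
#         <label name="Target" start="6" end="10"/>
#       </layer>
#     </annotationSet>
#   </sentence>
# </subcorpus>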
def main(argv=None):
    # hardcoded defaults
    RESULT_DIR = '%s%sresult' % (sys.path[0], os.sep)
    PARAM_FILE = '%s%sparameter.conf' % (sys.path[0], os.sep)
    STEPS = ['preprocessing', 'assembly', 'annotation', 'analysis']

    # get the starting time
    starting_time = time.time()

    # set up the argument parser for stdin arguments
    parser = argparse.ArgumentParser(add_help=True)

    # define arguments
    parser.add_argument('input', nargs='+', action='store',
                        help='single or paired input files in <fastq> format')
    parser.add_argument('--version', action='version', version='%(prog)s 0.5')
    parser.add_argument('-v', dest='verbose', action='store_true', default=False,
                        help='more detailed output (default = False)')
    parser.add_argument('-t', dest='threads', type=int, action='store',
                        default=multiprocessing.cpu_count() - 1,
                        help='number of threads to use (default = %d)'
                             % (multiprocessing.cpu_count() - 1))
    parser.add_argument('-p', dest='param', action='store', default=PARAM_FILE,
                        help='use alternative config file (default = parameter.conf)')
    parser.add_argument('-s', dest='skip', action='store', default='',
                        choices=['preprocessing', 'assembly', 'annotation', 'analysis'],
                        help='skip steps in the pipeline (default = None)')
    parser.add_argument('-o', dest='output', action='store', default=RESULT_DIR,
                        help='use alternative output folder')
    parser.add_argument('-a', dest='assembler', default='MetaVelvet',
                        choices=['metavelvet', 'flash', 'both'],
                        help='assembling program to use (default = MetaVelvet)')
    parser.add_argument('-c', dest='annotation', default='both',
                        choices=['metacv', 'blastn', 'both'],
                        help='classifier to use for annotation (default = both)')
    parser.add_argument('--use_contigs', dest='use_contigs', action='store_true',
                        default=False,  # was the string 'False', which is truthy
                        help='should MetaCV use assembled reads or raw reads (default = raw)')
    parser.add_argument('--notrimming', dest='trim', action='store_false', default=True,
                        help='trim and filter input reads? (default = True)')
    parser.add_argument('--noquality', dest='quality', action='store_false', default=True,
                        help='create no quality report (default = True)')
    parser.add_argument('--noreport', dest='krona', action='store_false', default=True,
                        help='create no pie chart with the annotated taxonomical data (default = True)')
    parser.add_argument('--merge', dest='merge_uncombined', action='store_true', default=False,
                        help='merge concatenated reads with non-concatenated ones (default = False)')

    args = parser.parse_args()

    # init the pipeline
    RESULT_DIR = args.output if args.output else RESULT_DIR

    # check if the param file exists
    if os.path.isfile(args.param):
        PARAM_FILE = args.param
    else:
        if os.path.isfile(PARAM_FILE):
            sys.stderr.write('ERROR 3: Parameter file could not be found!\n')
            sys.stderr.write('Using standard parameter file:\n%s\n\n' % (PARAM_FILE))
        else:
            raise ParamFileNotFound(args.param)

    # check if the input exists
    if not all(os.path.isfile(file) for file in args.input):
        raise InputNotFound(to_string(args.input))

    if __name__ == '__main__':
        # create the output dir and log folder
        create_outputdir(RESULT_DIR)
        create_outputdir(RESULT_DIR + os.sep + 'log')

        # create the global settings object
        settings = General(args.threads, args.verbose, args.skip, starting_time,
                           args.trim, args.quality, args.krona, args.use_contigs,
                           args.merge_uncombined, args.assembler, args.annotation, 1)

        # set up the input, outputs and important files
        files = FileSettings(absolute_path(args.input),
                             os.path.normpath(RESULT_DIR),
                             PARAM_FILE)
        exe = Executables(PARAM_FILE)

        # get all skipped steps
        skip = to_string(settings.get_skip())

        try:
            # START the modules of the pipeline and wait until completion
            if skip in 'preprocessing' and skip:
                skip_msg(skip)
            else:
                # init the preprocessing module
                pre = Preprocess(settings.get_threads(), settings.get_step_number(),
                                 settings.get_verbose(), settings.get_actual_time(),
                                 files.get_input(), files.get_logdir(),
                                 exe.get_FastQC(), settings.get_quality(),
                                 files.get_quality_dir(),
                                 parse_parameter(FastQC_Parameter(PARAM_FILE)),
                                 exe.get_TrimGalore(), settings.get_trim(),
                                 files.get_trim_dir(),
                                 parse_parameter(TrimGalore_Parameter(PARAM_FILE)))
                # run the preprocessing functions
                results = pre.manage_preprocessing()
                # update pipeline variables with the results
                settings.set_step_number(results[0])
                if len(results) > 1:
                    files.set_input(absolute_path(results[1]))
                    files.set_preprocessed_output(absolute_path(results[1]))

            if skip in 'assembly' and skip:
                skip_msg(skip)
            else:
                # init the assembly module
                assembly = Assembly(settings.get_threads(), settings.get_step_number(),
                                    settings.get_verbose(), settings.get_actual_time(),
                                    files.get_logdir(), files.get_input(),
                                    settings.get_assembler(),
                                    exe.get_Flash(), files.get_concat_dir(),
                                    parse_parameter(FLASH_Parameter(PARAM_FILE)),
                                    settings.get_merge_uncombined(),
                                    exe.get_Velveth(), exe.get_Velvetg(),
                                    exe.get_MetaVelvet(), files.get_assembly_dir(),
                                    Velveth_Parameter(PARAM_FILE).get_kmer(PARAM_FILE),
                                    parse_parameter(Velveth_Parameter(PARAM_FILE)),
                                    parse_parameter(Velvetg_Parameter(PARAM_FILE)),
                                    parse_parameter(MetaVelvet_Parameter(PARAM_FILE)))
                # run the assembly functions
                results = assembly.manage_assembly()
                # update pipeline variables with the results
                settings.set_step_number(results[0])
                files.set_input(absolute_path(results[1]))
                files.set_concatinated_output(absolute_path(results[2]))
                files.set_assembled_output(absolute_path(results[3]))

            if skip in 'annotation' and skip:
                skip_msg(skip)
            else:
                # init the annotation module
                anno = Annotation(settings.get_threads(), settings.get_step_number(),
                                  settings.get_verbose(), settings.get_actual_time(),
                                  files.get_logdir(), files.get_input(), files.get_raw(),
                                  settings.get_annotation(), settings.get_use_contigs(),
                                  exe.get_Blastn(), exe.get_Blastn_DB(), exe.get_Converter(),
                                  files.get_blastn_dir(),
                                  Blastn_Parameter(PARAM_FILE).outfmt,
                                  parse_parameter(Blastn_Parameter(PARAM_FILE)),
                                  exe.get_MetaCV(), exe.get_MetaCV_DB(),
                                  files.get_metacv_dir(),
                                  MetaCV_Parameter(PARAM_FILE).get_seq(),
                                  MetaCV_Parameter(PARAM_FILE).get_mode(),
                                  MetaCV_Parameter(PARAM_FILE).get_orf(),
                                  MetaCV_Parameter(PARAM_FILE).get_total_reads(),
                                  MetaCV_Parameter(PARAM_FILE).get_min_qual(),
                                  MetaCV_Parameter(PARAM_FILE).get_taxon(),
                                  MetaCV_Parameter(PARAM_FILE).get_name())
                # run the annotation functions
                results = anno.manage_annotation()
                settings.set_step_number(results[0])
                files.set_blastn_output(absolute_path(results[1]))
                files.set_metacv_output(absolute_path(results[2]))

            if skip in 'analysis' and skip:
                skip_msg(skip)
            else:
                # init the analysis module
                analysis = Analysis(settings.get_threads(), settings.get_step_number(),
                                    settings.get_verbose(), settings.get_actual_time(),
                                    files.get_logdir(), settings.get_annotation(),
                                    files.get_output(), files.get_parsed_db_dir(),
                                    files.get_annotated_db_dir(), files.get_subseted_db_dir(),
                                    files.get_krona_report_dir(),
                                    files.get_blastn_output(), files.get_metacv_output(),
                                    exe.get_Parser(),
                                    parse_parameter(blastParser_Parameter(PARAM_FILE)),
                                    blastParser_Parameter(PARAM_FILE).get_name(),
                                    exe.get_Annotate(),
                                    parse_parameter(Rannotate_Parameter(PARAM_FILE)),
                                    Rannotate_Parameter(PARAM_FILE).get_name(),
                                    Rannotate_Parameter(PARAM_FILE).get_taxon_db(),
                                    exe.get_Subset(),
                                    subsetDB_Parameter(PARAM_FILE).get_bitscore(),
                                    subsetDB_Parameter(PARAM_FILE).get_classifier(),
                                    subsetDB_Parameter(PARAM_FILE).get_rank(),
                                    subsetDB_Parameter(PARAM_FILE).get_taxon_db(),
                                    exe.get_Krona_Blast(),
                                    parse_parameter(Krona_Parameter(PARAM_FILE)),
                                    Krona_Parameter(PARAM_FILE).get_name(),
                                    settings.get_krona(), exe.get_Perl_lib())
                # run the analysis functions
                results = analysis.manage_analysis()
                files.set_parser_output(absolute_path(results[0]))
                files.set_annotated_output(absolute_path(results[1]))

        except KeyboardInterrupt:
            sys.stdout.write('\nERROR 1: Operation cancelled by user!\n')
            sys.exit(1)

        # print the ending message
        print_verbose('\nPIPELINE COMPLETE!\n\n')
        print_running_time(settings.get_actual_time())
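# An example invocation, inferred from the argparse definitions above; the
# script name "pipeline.py" is a placeholder:
#
#   python pipeline.py reads_R1.fastq reads_R2.fastq -t 8 -a metavelvet \
#       -c blastn -s preprocessing -o ./results --merge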
def test_to_decoder_input(self):
    annotation = Annotation(["x", "y"], {})

    # "func(x, y)"
    sequence = [
        Root, Call, Expr, Name, Str, "func", CLOSE_NODE, Expand2, Expr, Name,
        Str, "x", CLOSE_NODE, Expr, Name, Str, "y", CLOSE_NODE
    ]
    next_node_type, input = to_decoder_input(sequence, annotation, grammar)
    self.assertEqual(next_node_type, None)
    self.assertEqual(
        input.action.astype(np.int32).tolist(),
        [[0, 0, 0], [1, 0, 0], [5, 0, 0], [2, 0, 0], [3, 0, 0], [0, 0, 0],
         [0, 2, 0], [4, 0, 0], [5, 0, 0], [2, 0, 0], [3, 0, 0], [0, 1, 0],
         [0, 2, 0], [5, 0, 0], [2, 0, 0], [3, 0, 0], [0, 0, 1], [0, 2, 0]])
    self.assertEqual(
        input.action_type[:, 0].astype(np.int32).tolist(),
        [1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0])
    self.assertEqual(
        input.action_type[:, 1].astype(np.int32).tolist(),
        [0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1])
    self.assertEqual(
        input.action_type[:, 2].astype(np.int32).tolist(),
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0])
    self.assertEqual(
        input.node_type.astype(np.int32).tolist(),
        [0, 1, 2, 4, 5, 6, 6, 3, 2, 4, 5, 6, 6, 2, 4, 5, 6, 6])
    self.assertEqual(
        input.parent_action.astype(np.int32).tolist(),
        [0, 0, 1, 5, 2, 3, 3, 1, 4, 5, 2, 3, 3, 4, 5, 2, 3, 3])
    self.assertEqual(
        input.parent_index.astype(np.int32).tolist(),
        [-1, 0, 1, 2, 3, 4, 4, 1, 7, 8, 9, 10, 10, 7, 13, 14, 15, 15])

    # "func(x," — incomplete sequence: the next required node is reported
    sequence = [
        Root, Call, Expr, Name, Str, "func", CLOSE_NODE, Expand2, Expr, Name,
        Str, "x", CLOSE_NODE
    ]
    next_info, input = to_decoder_input(sequence, annotation, grammar)
    self.assertEqual(next_info, (expr, 7))

    # "func(x, y)foo" — trailing node after a complete tree is invalid
    sequence = [
        Root, Call, Expr, Name, Str, "func", CLOSE_NODE, Expand2, Expr, Name,
        Str, "x", CLOSE_NODE, Expr, Name, Str, "y", CLOSE_NODE, Name
    ]
    result = to_decoder_input(sequence, annotation, grammar)
    self.assertEqual(result, None)

    # "func2(x, y)"
    sequence = [
        Root, Call, Expr, Name, Str, "func2", CLOSE_NODE, Expand2, Expr, Name,
        Str, "x", CLOSE_NODE, Expr, Name, Str, "y", CLOSE_NODE
    ]
    result = to_decoder_input(sequence, annotation, grammar)
    self.assertEqual(result, None)

    # "func(--, y)" — the first argument's Expr node is missing
    sequence = [
        Root, Call, Expr, Name, Str, "func", CLOSE_NODE, Expand2, Name, Str,
        "x", CLOSE_NODE, Expr, Name, Str, "y", CLOSE_NODE
    ]
    result = to_decoder_input(sequence, annotation, grammar)
    self.assertEqual(result, None)

    # "" — an empty sequence requires the root node first
    result = to_decoder_input([], annotation, grammar)
    self.assertEqual(result[0], (ROOT, -1))

    # "func("
    result = to_decoder_input(
        [Root, Call, Expr, Name, Str, "func", CLOSE_NODE], annotation, grammar)
    self.assertEqual(result[0], (expr_, 1))

    # ""
    result = to_decoder_input([Root, Call], annotation, grammar)
    self.assertEqual(result[0], (expr, 1))
def test_annotation(self):
    a = Annotation(["word1", "word2", "unknown"], {})
    word_to_id = {"word1": 1, "word2": 2, "<unknown>": 3}
    result = to_encoder_input(a, word_to_id)
    self.assertEqual(result.query.tolist(), [1, 2, 3])
def test_read_sprot(self):
    sprot_list = read_sprot(self.blast_file, self.gff_file, self.fasta_file)
    expected = [Annotation("g.4830", "name", "mrh4"),
                Annotation("m.4830", "product",
                           "ATP-dependent RNA helicase mrh4, mitochondrial")]
    self.assertEqual(sprot_list, expected)