Example #1
def read_ipr(io_buffer, whitelist=None):
    """Returns a list of lists, each containing mrna_id, "Dbxref" and annotation."""
    ipr_list = []
    for line in io_buffer:
        columns = line.split("\t")  # columns are assumed to be tab-separated
        # keep the dbxref when no whitelist was given, or when the dbxref
        # (ignoring whitespace padding and case) is whitelisted; testing
        # `not whitelist` first avoids a TypeError when whitelist is None
        if len(columns) > 3 and (not whitelist or
                                 columns[3].strip().lower() in whitelist):
            ipr_list.append(
                Annotation(
                    columns[0].strip(), "Dbxref",
                    columns[3].strip().upper() + ":" + columns[4].strip()))
        # GO annotations bypass the whitelist
        if len(columns) > 13 and columns[13].find("GO:") != -1:
            ipr_list.append(
                Annotation(columns[0].strip(), "Dbxref", columns[13].strip()))
        # IPR annotations bypass the whitelist as well
        if len(columns) > 11 and columns[11].find("IPR") != -1:
            ipr_list.append(
                Annotation(columns[0].strip(), "Dbxref",
                           "InterPro:" + columns[11].strip()))

    # sort so duplicates become adjacent, then keep only first occurrences
    ipr_list = sorted(ipr_list)
    ipr_list = [
        ipr_list[i] for i in range(len(ipr_list))
        if i == 0 or ipr_list[i] != ipr_list[i - 1]
    ]

    return ipr_list
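
These snippets come from different projects, so each assumes its own Annotation class. A minimal, hypothetical stand-in for this example is sketched below: a named (id, key, value) triple whose value-based equality and ordering are exactly what the sort-and-dedup step above relies on. The TSV layout (dbxref name in column 3, its ID in column 4) follows the column indices used in read_ipr.

# Hypothetical Annotation stand-in plus a tiny smoke test for read_ipr.
from collections import namedtuple
import io

Annotation = namedtuple("Annotation", ["feature_id", "key", "value"])

tsv = ("m.1\tMD5\t100\tPfam\tPF00270\n"
       "m.1\tMD5\t100\tPfam\tPF00270\n")  # duplicate row on purpose
print(read_ipr(io.StringIO(tsv), whitelist=["pfam"]))
# -> [Annotation(feature_id='m.1', key='Dbxref', value='PFAM:PF00270')]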
Example #2
File: sprot.py Project: ikmb/esga
def read_sprot(blast_file, gff_file, fasta_file):
    #retrieve relevant information from files
    fasta_info = get_fasta_info(fasta_file)
    gff_info = get_gff_info(gff_file)
    blast_info = get_blast_info(blast_file)

    sprot_list = []
    for mrna, dbxref in blast_info.items():  # blast_info maps mRNA IDs to dbxrefs
        if dbxref not in fasta_info:  # neither this check nor the next should ever trigger, but guard anyway
            print(mrna + " has dbxref " + dbxref +
                  " that's not in the fasta. Skipping...")
            continue
        if mrna not in gff_info:
            print(mrna + " not in gff. Skipping...")
            continue

        # fasta_info maps dbxrefs to (product, gene name)
        product = fasta_info[dbxref][0]
        gene_name = fasta_info[dbxref][1]
        # gff_info maps mRNA IDs to their parent gene IDs
        gene_id = gff_info[mrna]

        #add annotations to annotation list
        sprot_list.append(Annotation(gene_id, "name", gene_name))
        sprot_list.append(Annotation(mrna, "product", product))
    return sprot_list
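
read_sprot only consumes the three helper results, so their shapes can be read off the loop above. A rough sketch, with values taken from the tests below (the exact dbxref key format is an assumption):

# Hypothetical helper outputs in the shapes read_sprot consumes;
# the real get_*_info functions build these mappings from the files.
blast_info = {"m.4830": "Q5AZY1"}        # mRNA ID -> dbxref (key format assumed)
fasta_info = {"Q5AZY1": ("ATP-dependent RNA helicase mrh4, mitochondrial",
                         "mrh4")}        # dbxref -> (product, gene name)
gff_info = {"m.4830": "g.4830"}          # mRNA ID -> parent gene ID

# With these mappings the loop would emit:
#   Annotation("g.4830", "name", "mrh4")
#   Annotation("m.4830", "product", "ATP-dependent RNA helicase mrh4, mitochondrial")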
Example #3
    def test_read_sprot_missing_gene_name(self):
        self.fasta_file = io.StringIO(\
                '>sp|Q5AZY1|MRH4_EMENI ATP-dependent RNA helicase mrh4, mitochondrial OS=Emericella nidulans (strain FGSC A4 / ATCC 38163 / CBS 112.46 / NRRL 194 / M139) PE=3 SV=1\n\
MNRLGGLSLPLRPVCLFCRAQTSLALSPLQGGQAVRSIATGRLRRRARMTLSKDVAKSSL\n\
KPKRTDRGKLGPFPNMNQTRARVREDPRSRSPAALKRSGETEEKPAMNTESPLYKALKMQ\n\
TALAPISYGKRTAIKAKIAEITSFDAFTLLPIVRNSIFSQALPGIADAVPTPIQRVAIPR\n\
LLEDAPAKKQAKKVDDDEPQYEQYLLAAETGSGKTLAYLIPVIDAIKRQEIQEKEMEKKE\n\
EERKVREREENKKNQAFDLEPEIPPPSNAGRPRAIILVPTAELVAQVGAKLKAFAHTVKF\n\
RSGIISSNLTPRRIKSTLFNPAGIDILVSTPHLLASIAKTDPYVLSRVSHLVLDEADSLM\n\
DRSFLPISTEVISKAAPSLQKLIFCSATIPRSLDSQLRKLYPDIWRLTTPNLHAIPRRVQ\n\
LGVVDIQKDPYRGNRNLACADVIWSIGKSGAGSDEAGSPWSEPKTKKILVFVNEREEADE\n\
VAQFLKSKGIDAHSFNRDSGTRKQEEILAEFTEPAAVPTAEEILLARKQQQRENINIPFV\n\
LPERTNRDTERRLDGVKVLVTTDIASRGIDTLALKTVILYHVPHTTIDFIHRLGRLGRMG\n\
KRGRAVVLVGKKDRKDVVKEVREVWFGLDS')
        sprot_list = read_sprot(self.blast_file, self.gff_file, self.fasta_file)
        expected = [Annotation("g.4830", "name", "MRH4"),
                    Annotation("m.4830", "product",
                               "ATP-dependent RNA helicase mrh4, mitochondrial")]
        self.assertEqual(sprot_list, expected)
Example #4
    def test_to_sequence(self):
        annotation = Annotation(["x", "y"], {})
        "func(x, y)"  # the program this action sequence encodes
        sequence = [
            Root, Call, Expr, Name, Str, "func", CLOSE_NODE, Expand2, Expr,
            Name, Str, "x", CLOSE_NODE, Expr, Name, Str, "y", CLOSE_NODE
        ]
        _, input = to_decoder_input(sequence, annotation, grammar)
        sequence2 = to_sequence(input, annotation, grammar)
        self.assertEqual(sequence2, sequence)
Example #5
def find_objects_in_segmented_frames(frames: List[Tuple[int, np.ndarray]],
                                     box_min_size: Tuple[int, int]) -> List[Annotation]:
    """Find bounding boxes of connected components in binary segmentation masks."""
    annotations = []

    for frame_idx, frame in frames:
        contours, _ = cv2.findContours(frame, cv2.RETR_EXTERNAL,
                                       cv2.CHAIN_APPROX_NONE)

        for contour in contours:
            left, top, width, height = cv2.boundingRect(contour)

            if width > box_min_size[0] and height > box_min_size[1]:
                annotations.append(
                    Annotation(
                        frame=frame_idx,
                        left=left,
                        top=top,
                        # note: these two fields receive the box's right and
                        # bottom edges (left + width, top + height)
                        width=left + width,
                        height=top + height,
                        label='car',
                    ))

    return annotations
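
A quick smoke test on a synthetic mask, assuming OpenCV 4's two-value findContours return (which the unpacking above already requires) and an Annotation type that accepts the keyword fields used in the call:

# One all-white rectangle in an otherwise empty binary frame.
import numpy as np
import cv2

mask = np.zeros((100, 100), dtype=np.uint8)
mask[20:60, 30:80] = 255  # rows 20..59, cols 30..79 -> a 50x40 box

boxes = find_objects_in_segmented_frames([(0, mask)], box_min_size=(10, 10))
print(boxes)  # one 'car' annotation: left=30, top=20, right=80, bottom=60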
Example #6
    def build_annotations(self, subcorpus, lu, frame):
        """ Builds annotations from a subcorpus. """
        name = subcorpus.attrib["name"]
        annotations = []
        sentence = None  # set by the <text> child before any annotationSet uses it
        for child in subcorpus:  # iterate directly; getchildren() is deprecated
            if "metaphor" in child.attrib:
                print(child.attrib)
            for c2 in child:
                tag = c2.tag.replace(self.replace_tag, "")
                if tag == "text":
                    sentence = c2.text
                elif tag == "annotationSet":
                    if len(c2.attrib) > 3:
                        print(c2.attrib)
                    status = c2.attrib["status"]
                    ID = int(c2.attrib["ID"])
                    if status in ["MANUAL", "AUTO_EDITED"]:
                        new = Annotation(ID, status, sentence, name, lu, frame)

                        for c3 in c2:
                            if c3.attrib["name"] == "FE":
                                for c4 in c3:
                                    # use a distinct variable so the subcorpus
                                    # name above is not clobbered
                                    fe_name = c4.attrib["name"]
                                    # an FE has an explicit span only when both
                                    # bounds are present; otherwise it carries
                                    # an "itype" attribute instead
                                    if "start" in c4.attrib and "end" in c4.attrib:
                                        start, end = int(c4.attrib["start"]), int(c4.attrib["end"])
                                        raw_text = new.sentence[start : end + 1]
                                        new.add_fe_mapping(fe_name, raw_text)
                                        new.set_spans(fe_name, (start, end))
                                    else:
                                        new.add_fe_mapping(fe_name, c4.attrib["itype"])
                            elif c3.attrib["name"] == "Sent":
                                for c4 in c3:
                                    if c4.attrib["name"] == "Metaphor":
                                        new.set_metaphor()
                            elif c3.attrib["name"] == "Target":
                                for c4 in c3:
                                    if c4.attrib["name"] == "Target":
                                        start, end = int(c4.attrib["start"]), int(c4.attrib["end"])
                                        new.set_target(new.sentence[start : end + 1])

                        annotations.append(new)
        return annotations
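
To make the traversal concrete, here is a hypothetical subcorpus fragment in the shape build_annotations walks; element and layer names are illustrative, and the real FrameNet XML carries a namespace prefix on every tag, which is what self.replace_tag strips.

# Illustrative input only: a <text> child plus an <annotationSet> whose
# children carry name="FE" / name="Target" groups of start/end labels.
import xml.etree.ElementTree as ET

subcorpus = ET.fromstring("""
<subcorpus name="grill-V-example">
  <sentence>
    <text>The committee grilled the witness.</text>
    <annotationSet status="MANUAL" ID="1">
      <layer name="FE">
        <label name="Speaker" start="0" end="12"/>
      </layer>
      <layer name="Target">
        <label name="Target" start="14" end="20"/>
      </layer>
    </annotationSet>
  </sentence>
</subcorpus>
""")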
Example #7
def main(argv = None):

  # hardcode defaults
  RESULT_DIR = '%s%sresult' % (sys.path[0], os.sep)
  PARAM_FILE = '%s%sparameter.conf' % (sys.path[0], os.sep)
  STEPS = ['preprocessing', 'assembly', 'annotation', 'analysis']

  # Get the starting time
  starting_time = time.time()

  # setup Argument Parser for stdin arguments
  parser = argparse.ArgumentParser(add_help = True)

  # define arguments
  parser.add_argument('input', nargs = '+', action = 'store', 
                      help = 'single or paired input files in <fastq> format')
  parser.add_argument('--version', action = 'version', version = '%(prog)s 0.5')
  parser.add_argument('-v', dest = 'verbose', action = 'store_true', default = False,
                      help = 'more detailed output (default = False)')
  parser.add_argument('-t', dest = 'threads', type = int, action = 'store', 
                      default = multiprocessing.cpu_count() - 1,
                      help = 'number of threads to use (default = %d)' 
                      % (multiprocessing.cpu_count() - 1))
  parser.add_argument('-p', dest = 'param', action = 'store', default = PARAM_FILE,
                      help = 'use alternative config file (default = parameter.conf)')
  parser.add_argument('-s', dest = 'skip', action = 'store', default = '', 
                      choices = ['preprocessing', 'assembly', 'annotation', 'analysis'],
                      help = 'skip steps in the pipeline (default = None)')
  parser.add_argument('-o', dest = 'output', action = 'store', default = RESULT_DIR,
                      help = 'use alternative output folder')
  parser.add_argument('-a', dest = 'assembler', default = 'MetaVelvet', 
                      choices = ['metavelvet', 'flash','both'],
                      help = 'assembling program to use (default = MetaVelvet)')
  parser.add_argument('-c', dest = 'annotation', default = 'both',
                      choices = ['metacv', 'blastn', 'both'],
                      help = 'classifier to use for annotation (default = both)')     
  parser.add_argument('--use_contigs', dest = 'use_contigs', action = 'store_true',
                      default = False,
                      help = 'should MetaCV use assembled reads or raw reads (default = raw)')
  parser.add_argument('--notrimming', dest = 'trim', action = 'store_false', default = True,
                      help = 'trim and filter input reads? (default = True)')
  parser.add_argument('--noquality', dest = 'quality', action = 'store_false', default = True,
                      help = 'create no quality report (default = True)')
  parser.add_argument('--noreport', dest = 'krona', action = 'store_false', default = True,
                      help = 'create no pie chart with the annotated taxonomical data (default = True)')
  parser.add_argument('--merge', dest = 'merge_uncombined', action = 'store_true', default = False,
                      help = 'merge concatenated reads with unconcatenated reads (default = False)')

  args = parser.parse_args()
  # init the Pipeline
  RESULT_DIR = args.output if args.output else RESULT_DIR
  # check whether the given parameter file exists
  if os.path.isfile(args.param):
      PARAM_FILE = args.param
  else:
      if os.path.isfile(PARAM_FILE):
          sys.stderr.write('ERROR 3: parameter file %s could not be found!\n' % args.param)
          sys.stderr.write('Falling back to the standard parameter file:\n%s\n\n' % PARAM_FILE)
      else:
          raise ParamFileNotFound(args.param)
    
  # check if input exists
  if not all(os.path.isfile(file) for file in args.input):
      raise InputNotFound(to_string(args.input))

  if __name__ == '__main__':   

    # create outputdir and log folder
    create_outputdir(RESULT_DIR)
    create_outputdir(RESULT_DIR + os.sep +'log')

    # create the global settings object
    settings = General(args.threads, args.verbose, args.skip, starting_time, args.trim, 
                       args.quality, args.krona, args.use_contigs, args.merge_uncombined, args.assembler, 
                       args.annotation, 1)

    # setup the input, outputs and important files
    files = FileSettings(absolute_path(args.input), os.path.normpath(RESULT_DIR), PARAM_FILE)

    exe = Executables(PARAM_FILE)
    # get the all skipped steps
    skip = to_string(settings.get_skip())

    try:
      print "hello"
      # START the modules of Pipeline and wait until completion
      if skip and skip in 'preprocessing':
          skip_msg(skip)
      else:
          # init the preprocessing module
          pre = Preprocess(settings.get_threads(), 
                           settings.get_step_number(),
                           settings.get_verbose(),
                           settings.get_actual_time(),
                           files.get_input(),
                           files.get_logdir(),
                           exe.get_FastQC(),
                           settings.get_quality(),
                           files.get_quality_dir(),
                           parse_parameter(FastQC_Parameter(PARAM_FILE)),
                           exe.get_TrimGalore(),
                           settings.get_trim(),
                           files.get_trim_dir(), 
                           parse_parameter(TrimGalore_Parameter(PARAM_FILE)))
          # run preprocessing functions
          results = pre.manage_preprocessing()
          # update pipeline variables with results
          settings.set_step_number(results[0])
          if len(results) > 1:
              files.set_input(absolute_path(results[1]))
              files.set_preprocessed_output(absolute_path(results[1]))

      if skip and skip in 'assembly':
        skip_msg(skip)
      else:
        # init the assembly module 
        assembly = Assembly(settings.get_threads(), 
                            settings.get_step_number(),
                            settings.get_verbose(),
                            settings.get_actual_time(),
                            files.get_logdir(),
                            files.get_input(),
                            settings.get_assembler(),
                            exe.get_Flash(),
                            files.get_concat_dir(),
                            parse_parameter(FLASH_Parameter(PARAM_FILE)),
                            settings.get_merge_uncombined(),
                            exe.get_Velveth(),
                            exe.get_Velvetg(),
                            exe.get_MetaVelvet(),
                            files.get_assembly_dir(),
                            Velveth_Parameter(PARAM_FILE).get_kmer(PARAM_FILE),
                            parse_parameter(Velveth_Parameter(PARAM_FILE)),
                            parse_parameter(Velvetg_Parameter(PARAM_FILE)),
                            parse_parameter(MetaVelvet_Parameter(PARAM_FILE)))
        # run assembly functions
        results = assembly.manage_assembly()
        # update pipeline variables with results
        settings.set_step_number(results[0])
        files.set_input(absolute_path(results[1]))
        files.set_concatinated_output(absolute_path(results[2]))
        files.set_assembled_output(absolute_path(results[3]))
  
      if skip and skip in 'annotation':
          skip_msg(skip)
      else:
          # init the annotation module
          anno = Annotation(settings.get_threads(), 
                            settings.get_step_number(),
                            settings.get_verbose(),
                            settings.get_actual_time(),
                            files.get_logdir(),
                            files.get_input(),
                            files.get_raw(),
                            settings.get_annotation(),
                            settings.get_use_contigs(),
                            exe.get_Blastn(),
                            exe.get_Blastn_DB(),
                            exe.get_Converter(),
                            files.get_blastn_dir(),
                            Blastn_Parameter(PARAM_FILE).outfmt,
                            parse_parameter(Blastn_Parameter(PARAM_FILE)),
                            exe.get_MetaCV(),
                            exe.get_MetaCV_DB(),
                            files.get_metacv_dir(),
                            MetaCV_Parameter(PARAM_FILE).get_seq(),
                            MetaCV_Parameter(PARAM_FILE).get_mode(),
                            MetaCV_Parameter(PARAM_FILE).get_orf(),
                            MetaCV_Parameter(PARAM_FILE).get_total_reads(),
                            MetaCV_Parameter(PARAM_FILE).get_min_qual(),
                            MetaCV_Parameter(PARAM_FILE).get_taxon(),
                            MetaCV_Parameter(PARAM_FILE).get_name())

          # run the annotation functions
          results = anno.manage_annotation()
          settings.set_step_number(results[0])
          files.set_blastn_output(absolute_path(results[1]))
          files.set_metacv_output(absolute_path(results[2]))
      
      if skip and skip in 'analysis':
          skip_msg(skip)
      else:
          # init the analysis module
          analysis = Analysis(settings.get_threads(),
                              settings.get_step_number(),
                              settings.get_verbose(),
                              settings.get_actual_time(),
                              files.get_logdir(),
                              settings.get_annotation(),
                              files.get_output(),
                              files.get_parsed_db_dir(),
                              files.get_annotated_db_dir(),
                              files.get_subseted_db_dir(),
                              files.get_krona_report_dir(),
                              files.get_blastn_output(),
                              files.get_metacv_output(),
                              exe.get_Parser(), 
                              parse_parameter(blastParser_Parameter(PARAM_FILE)),
                              blastParser_Parameter(PARAM_FILE).get_name(),
                              exe.get_Annotate(),
                              parse_parameter(Rannotate_Parameter(PARAM_FILE)),
                              Rannotate_Parameter(PARAM_FILE).get_name(),
                              Rannotate_Parameter(PARAM_FILE).get_taxon_db(),
                              exe.get_Subset(),
                              subsetDB_Parameter(PARAM_FILE).get_bitscore(),
                              subsetDB_Parameter(PARAM_FILE).get_classifier(),
                              subsetDB_Parameter(PARAM_FILE).get_rank(),
                              subsetDB_Parameter(PARAM_FILE).get_taxon_db(),
                              exe.get_Krona_Blast(),
                              parse_parameter(Krona_Parameter(PARAM_FILE)),
                              Krona_Parameter(PARAM_FILE).get_name(),
                              settings.get_krona(),
                              exe.get_Perl_lib())
          # run the analysis function
          results = analysis.manage_analysis()
          files.set_parser_output(absolute_path(results[0]))
          files.set_annotated_output(absolute_path(results[1]))    
        
    except KeyboardInterrupt:
      sys.stdout.write('\nERROR 1: Operation cancelled by user!\n')
      sys.exit(1)

    # print ending message
    print_verbose('\nPIPELINE COMPLETE!\n\n')
    print_running_time(settings.get_actual_time())
Example #8
    def test_to_decoder_input(self):
        annotation = Annotation(["x", "y"], {})

        "func(x, y)"
        sequence = [
            Root, Call, Expr, Name, Str, "func", CLOSE_NODE, Expand2, Expr,
            Name, Str, "x", CLOSE_NODE, Expr, Name, Str, "y", CLOSE_NODE
        ]
        next_node_type, input = to_decoder_input(sequence, annotation, grammar)
        self.assertEqual(next_node_type, None)
        self.assertEqual(
            input.action.astype(np.int32).tolist(),
            [[0, 0, 0], [1, 0, 0], [5, 0, 0], [2, 0, 0], [3, 0, 0], [0, 0, 0],
             [0, 2, 0], [4, 0, 0], [5, 0, 0], [2, 0, 0], [3, 0, 0], [0, 1, 0],
             [0, 2, 0], [5, 0, 0], [2, 0, 0], [3, 0, 0], [0, 0, 1], [0, 2, 0]])
        self.assertEqual(
            input.action_type[:, 0].astype(np.int32).tolist(),
            [1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0])
        self.assertEqual(
            input.action_type[:, 1].astype(np.int32).tolist(),
            [0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1])
        self.assertEqual(
            input.action_type[:, 2].astype(np.int32).tolist(),
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0])
        self.assertEqual(
            input.node_type.astype(np.int32).tolist(),
            [0, 1, 2, 4, 5, 6, 6, 3, 2, 4, 5, 6, 6, 2, 4, 5, 6, 6])
        self.assertEqual(
            input.parent_action.astype(np.int32).tolist(),
            [0, 0, 1, 5, 2, 3, 3, 1, 4, 5, 2, 3, 3, 4, 5, 2, 3, 3])
        self.assertEqual(
            input.parent_index.astype(np.int32).tolist(),
            [-1, 0, 1, 2, 3, 4, 4, 1, 7, 8, 9, 10, 10, 7, 13, 14, 15, 15])

        "func(x,"
        sequence = [
            Root, Call, Expr, Name, Str, "func", CLOSE_NODE, Expand2, Expr,
            Name, Str, "x", CLOSE_NODE
        ]
        next_info, input = to_decoder_input(sequence, annotation, grammar)
        self.assertEqual(next_info, (expr, 7))

        "func(x, y)foo"
        sequence = [
            Root, Call, Expr, Name, Str, "func", CLOSE_NODE, Expand2, Expr,
            Name, Str, "x", CLOSE_NODE, Expr, Name, Str, "y", CLOSE_NODE, Name
        ]
        result = to_decoder_input(sequence, annotation, grammar)
        self.assertEqual(result, None)

        "func2(x, y)"
        sequence = [
            Root, Call, Expr, Name, Str, "func2", CLOSE_NODE, Expand2, Expr,
            Name, Str, "x", CLOSE_NODE, Expr, Name, Str, "y", CLOSE_NODE
        ]
        result = to_decoder_input(sequence, annotation, grammar)
        self.assertEqual(result, None)

        "func(--, y)"
        sequence = [
            Root, Call, Expr, Name, Str, "func", CLOSE_NODE, Expand2, Name,
            Str, "x", CLOSE_NODE, Expr, Name, Str, "y", CLOSE_NODE
        ]
        result = to_decoder_input(sequence, annotation, grammar)
        self.assertEqual(result, None)

        ""
        result = to_decoder_input([], annotation, grammar)
        self.assertEqual(result[0], (ROOT, -1))

        "func("
        result = to_decoder_input(
            [Root, Call, Expr, Name, Str, "func", CLOSE_NODE], annotation,
            grammar)
        self.assertEqual(result[0], (expr_, 1))
        ""
        result = to_decoder_input([Root, Call], annotation, grammar)
        self.assertEqual(result[0], (expr, 1))
Example #9
    def test_annotation(self):
        a = Annotation(["word1", "word2", "unknown"], {})
        word_to_id = {"word1": 1, "word2": 2, "<unknown>": 3}
        result = to_encoder_input(a, word_to_id)
        self.assertEqual(result.query.tolist(), [1, 2, 3])
Example #10
    def test_read_sprot(self):
        sprot_list = read_sprot(self.blast_file, self.gff_file, self.fasta_file)
        expected = [Annotation("g.4830", "name", "mrh4"),
                    Annotation("m.4830", "product",
                               "ATP-dependent RNA helicase mrh4, mitochondrial")]
        self.assertEqual(sprot_list, expected)