예제 #1
0
    def build_seqsinfo_from_seqs(self, seqs):
        """Process in-memory sequences to disk and return the info dictionary describing them.

        Every sequence is keyed by its ``id`` attribute when it has one, otherwise
        by its 1-based position in ``seqs``. The created working directory is also
        stored on ``self.working_dir``.

          Args:
               seqs: list of sequences that are instances of :class:`SequenceStruct` class

          Returns:
               the value returned by ``self.seq_representer.prepare_seqs`` for the
               whole batch of sequences
        """
        # set up the working directory and remember it on the instance
        workdir = create_directory("working_dir", self.root_dir)
        self.working_dir = workdir
        representer = self.seq_representer

        # map every sequence to its identifier (own id if present, else its position)
        id_to_seq = {}
        for position, seq in enumerate(seqs, start=1):
            key = seq.id if hasattr(seq, 'id') else position
            id_to_seq[key] = seq

        # parse all sequences in a single call; the last argument is the unique_id flag
        return representer.prepare_seqs(id_to_seq, "reference_corpus", workdir, True)
예제 #2
0
    def build_seqsinfo_from_seqfile(self, seq_file, data_parser_options, num_seqs=numpy.inf):
        """Read sequences from a file one by one, process them to disk and return their info dictionary.

        Each sequence is keyed by its ``id`` attribute when it has one, otherwise
        by its 1-based reading position. A progress line is printed after every
        processed sequence. The created working directory is also stored on
        ``self.working_dir``.

          Args:
               seq_file: string representing the path to the sequence file
               data_parser_options: dictionary containing options to be passed
                                    to :func:`read_file` method of :class:`DataFileParser` class
               num_seqs: integer, maximum number of sequences to read from file
                         (default numpy.inf -- means read all file)

          Returns:
               dictionary accumulated from the per-sequence ``prepare_seqs`` results
        """
        representer = self.seq_representer
        # the corpus name carries a datetime suffix
        corpus = "reference_corpus_" + generate_datetime_str()
        workdir = create_directory("working_dir", self.root_dir)
        self.working_dir = workdir

        parsed_info = {}
        parser = DataFileParser()
        seq_stream = self.get_seqs_from_file(seq_file, parser, data_parser_options)
        for count, seq in enumerate(seq_stream, start=1):
            key = seq.id if hasattr(seq, 'id') else count
            # process a single sequence at a time (unique_id flag is False here)
            single_info = representer.prepare_seqs(
                {key: seq}, corpus, workdir, False, log_progress=False
            )
            parsed_info.update(single_info)
            print("{} sequences have been processed".format(count))
            # the check happens after processing, so at least one sequence is always handled
            if count >= num_seqs:
                break

        return parsed_info
예제 #3
0
 def build_seqsinfo(self, seq_file):
     """Process every sequence read from ``seq_file`` and return the merged info dictionary.

     Sequences are handled one at a time; each is keyed by its ``id`` attribute
     when present, otherwise by its 1-based reading position. A progress line
     is printed after every sequence, and the created working directory is
     stored on ``self.working_dir``.

     Args:
         seq_file: passed unchanged to ``self.get_seqs_from_file``

     Returns:
         dictionary accumulated from the per-sequence ``prepare_seqs`` results
     """
     representer = self.seq_representer
     # the corpus name carries a datetime suffix
     corpus = "reference_corpus_" + generate_datetime_str()
     workdir = create_directory("working_dir", self.root_dir)
     self.working_dir = workdir

     merged_info = {}
     for count, seq in enumerate(self.get_seqs_from_file(seq_file), start=1):
         key = seq.id if hasattr(seq, "id") else count
         # process a single sequence at a time (unique_id flag is False here)
         single_info = representer.prepare_seqs(
             {key: seq}, corpus, workdir, False, log_progress=False
         )
         merged_info.update(single_info)
         print("{} sequences have been processed".format(count))
     return merged_info
예제 #4
0
def build_model(model_type, template_config, num_seqs):
    """Build a CRF model of the requested type on generated sequences.

    Args:
        model_type: string naming the model; one of ``'HOCRFAD'``, ``'HOCRF'``,
                    ``'HOSemiCRFAD'``, ``'HOSemiCRF'`` or ``'FirstOrderCRF'``
        template_config: callable taking no arguments and returning the pair
                         ``(template_XY, template_Y)``
        num_seqs: value passed to ``seq_generator.generate_seqs``

    Returns:
        tuple ``(workflow, crf_m, data_split)`` holding the training workflow,
        the built CRF model and the data split dictionary

    Raises:
        ValueError: if ``model_type`` is not one of the supported names
    """
    # resolve the classes tied to the requested model type
    if model_type == 'HOCRFAD':
        modelrepr_class = HOCRFADModelRepresentation
        model_class = HOCRFAD
        fextractor_class = HOFeatureExtractor
    elif model_type == 'HOCRF':
        modelrepr_class = HOCRFModelRepresentation
        model_class = HOCRF
        fextractor_class = HOFeatureExtractor
    elif model_type == 'HOSemiCRFAD':
        modelrepr_class = HOSemiCRFADModelRepresentation
        model_class = HOSemiCRFAD
        fextractor_class = HOFeatureExtractor
    elif model_type == 'HOSemiCRF':
        modelrepr_class = HOSemiCRFModelRepresentation
        model_class = HOSemiCRF
        fextractor_class = HOFeatureExtractor
    elif model_type == 'FirstOrderCRF':
        modelrepr_class = FirstOrderCRFModelRepresentation
        model_class = FirstOrderCRF
        fextractor_class = FOFeatureExtractor
    else:
        # fail early with a clear message instead of hitting an unbound
        # local variable further down
        raise ValueError(
            "unknown model_type {!r}; expected one of 'HOCRFAD', 'HOCRF', "
            "'HOSemiCRFAD', 'HOSemiCRF', 'FirstOrderCRF'".format(model_type)
        )

    # init attribute extractor
    attr_extractor = AttributeExtractor()
    # load templates
    template_XY, template_Y = template_config()
    # init feature extractor
    fextractor = fextractor_class(template_XY, template_Y,
                                  attr_extractor.attr_desc)
    # generate data
    seqs = seq_generator.generate_seqs(num_seqs)
    # use all passed data as training data -- no splitting
    data_split_options = {'method': 'none'}
    # no feature filter
    fe_filter = None
    working_dir = create_directory('wd', current_dir)
    workflow = GenericTrainingWorkflow(attr_extractor, fextractor, fe_filter,
                                       modelrepr_class, model_class,
                                       working_dir)

    # since we are going to train using perceptron based methods
    full_parsing = True

    data_split = workflow.seq_parsing_workflow(data_split_options,
                                               seqs=seqs,
                                               full_parsing=full_parsing)

    # build and return a CRFs model
    # folder name will be f_0 as fold 0
    trainseqs_id = data_split[0]['train']
    crf_m = workflow.build_crf_model(trainseqs_id,
                                     "f_0",
                                     full_parsing=full_parsing)

    return (workflow, crf_m, data_split)
예제 #5
0
    def seq_parsing_workflow(self, seqs, split_options):
        """Parse sequences, extract their features, and split them into datasets.

        Sequences receive 1-based ids following their order in ``seqs``. The ids,
        the sequences info, the sequence representer and the working directory
        are stored on the instance. The resulting data split is dumped to a
        ``data_split`` file two directory levels above the global-features
        directory recorded for sequence 1.

        Args:
            seqs: indexable collection of sequences to parse
            split_options: options forwarded to ``self.split_dataset``

        Returns:
            the data split produced by ``self.split_dataset``
        """
        # working directory for everything written during parsing
        workdir = create_directory("working_dir", self.root_dir)
        self.working_dir = workdir

        # ids are the 1-based positions of the sequences
        id_to_seq = dict(enumerate(seqs, start=1))
        ids = list(id_to_seq)
        self.seqs_id = ids

        # attribute extractor -> feature extractor -> sequence representer
        attr_extractor = self.aextractor_class()
        scaling = self.scaling_method
        feature_extractor = self.fextractor_class(
            self.template_xy, self.template_y, attr_extractor.attr_desc
        )
        representer = SeqsRepresenter(attr_extractor, feature_extractor)

        # parse the sequences; the last argument is the unique_id flag
        info = representer.prepare_seqs(id_to_seq, "reference_corpus", workdir, True)

        # generate/scale attributes according to the configured scaling method
        representer.preprocess_attributes(ids, info, method=scaling)

        # global features F(X,Y); the flag is set for perceptron-style optimizers
        use_perceptron = self.optimization_options["method"] in {
            "COLLINS-PERCEPTRON",
            "SAPO",
        }
        representer.extract_seqs_globalfeatures(ids, info, use_perceptron)

        # keep references for later stages of the workflow
        self.seqs_info = info
        self.seq_representer = representer

        split = self.split_dataset(info, split_options)

        # persist the split next to the reference corpus
        corpus_dir = os.path.dirname(os.path.dirname(info[1]["globalfeatures_dir"]))
        ReaderWriter.dump_data(split, os.path.join(corpus_dir, "data_split"))
        return split
예제 #6
0
def profile_test(model_type, scaling_method, optimization_options, run_config,
                 test_type):
    """Run ``test_crfs`` under cProfile and write the statistics to disk.

    The five arguments are forwarded unchanged to ``test_crfs``. The profile
    output is saved as ``profile_out`` inside a ``profiling`` directory
    created under ``root_dir``.
    """
    import cProfile

    # names visible to the profiled statement
    statement_locals = dict(
        model_type=model_type,
        scaling_method=scaling_method,
        optimization_options=optimization_options,
        run_config=run_config,
        test_type=test_type,
    )
    statement_globals = {'test_crfs': test_crfs}

    out_path = os.path.join(create_directory('profiling', root_dir), "profile_out")
    statement = 'test_crfs(model_type, scaling_method, optimization_options, run_config, test_type)'
    cProfile.runctx(statement, statement_globals, statement_locals, filename=out_path)
예제 #7
0
    def test_workflow(self, seqs):
        """Build a CRF model from ``seqs`` and store the pieces on the instance.

        The sequences are deep-copied, assigned 1-based ids following their
        order in ``seqs``, parsed, and used to create the model
        representation and the CRF model. The working directory, sequence
        representer, sequence ids, sequences info, CRF model and model
        representation are stored in underscore-prefixed instance attributes.

        Args:
            seqs: indexable collection of sequences used to build the model
        """
        corpus_name = "reference_corpus"
        working_dir = create_directory("working_dir", self.root_dir)
        self._working_dir = working_dir
        unique_id = True
        templateY = self.template_Y
        templateXY = self.template_XY
        modelrepr_class = self.model_repr_class
        model_class = self.model_class
        fextractor_class = self.fextractor_class
        scaling_method = self.scaling_method

        attr_extractor = NERSegmentAttributeExtractor()
        f_extractor = fextractor_class(templateXY, templateY,
                                       attr_extractor.attr_desc)
        seq_representer = SeqsRepresenter(attr_extractor, f_extractor)

        # id k maps to the k-th sequence; indexing with i - 1 would wrap around
        # and assign the last sequence to id 1
        seqs_dict = {
            seq_id: deepcopy(seq) for seq_id, seq in enumerate(seqs, start=1)
        }
        seqs_info = seq_representer.prepare_seqs(seqs_dict, corpus_name,
                                                 working_dir, unique_id)
        seqs_id = list(seqs_info.keys())

        seq_representer.preprocess_attributes(seqs_id,
                                              seqs_info,
                                              method=scaling_method)
        seq_representer.extract_seqs_globalfeatures(seqs_id, seqs_info)
        model = seq_representer.create_model(seqs_id, seqs_info,
                                             modelrepr_class, self.filter_obj)
        seq_representer.extract_seqs_modelactivefeatures(seqs_id,
                                                         seqs_info,
                                                         model,
                                                         "",
                                                         learning=True)
        crf_model = model_class(model, seq_representer, seqs_info)

        # keep everything around for the individual test scenarios
        self._seq_representer = seq_representer
        self._seqs_id = seqs_id
        self._seqs_info = seqs_info
        self._crf_model = crf_model
        self._model = model