示例#1
0
 def set_reader(self, key=None, reader=None):
     """Attach a reader to one part of the vector configuration.

     Each call configures the single vector part identified by *key*. The
     key must already exist in ``self.vector_configuration``; its current
     value is replaced by *reader*.

     :param key: The configuration key to be configured.
     :param reader: The reader object to store for the key. It must be an
         instance of ``base.fileReader``.
     :return: Nothing.
     :raise noneValueError: If the key is None.
     :raise KeyError: If the key does not exist in the vector configuration.
     :raise noneValueError: If the reader is None.
     :raise TypeError: If the reader is not a ``base.fileReader`` instance.
     """
     # Identity tests: ``== None`` would defer to a custom ``__eq__``.
     if key is None:
         raise exp.noneValueError('Configuration key cannot be "None"')
     if key not in self.vector_configuration:
         raise KeyError(
             'The provided key doesnot exist.\nFound: {}'.format(key))
     if reader is None:
         raise exp.noneValueError('The reader object cannot be "None"')
     # NOTE(review): code consuming this configuration may expect
     # ``base.vectorReader`` values -- confirm ``base.fileReader`` is the
     # intended type for this check.
     if not isinstance(reader, base.fileReader):
         raise TypeError(
             'The reader must be a fileReader object.\nFound: {}'.format(
                 type(reader)))
     self.vector_configuration[key] = reader
示例#2
0
 def setValue(self, annotation_key=None, value=None):
     """Store *value* under *annotation_key* in the annotation map.

     :param annotation_key: Key to set; must be one of the keys returned by
         ``self.getAnnotationKeyList()``.
     :param value: The value to store; any object except None.
     :return: Nothing.
     :raise noneValueError: If the key or the value is None.
     :raise KeyError: If the key is not a known annotation key.
     """
     # ``is None`` rather than ``== None``: equality would call the
     # argument's own ``__eq__``, which may not return a plain bool.
     if annotation_key is None:
         raise exp.noneValueError('Annotation Key cannot be "None"')
     if annotation_key not in self.getAnnotationKeyList():
         raise KeyError(
             'Invalid key passed.\nFound: {}'.format(annotation_key))
     if value is None:
         raise exp.noneValueError('Annotation value cannot be "None"')
     self.annotation_map[annotation_key] = value
示例#3
0
    def get_vector(self, sentence_id=None, token_id=None):
        """Return the stored vector of one token in one sentence.

        :param sentence_id: Key of the sentence in ``self.sentence_map``.
        :param token_id: Key of the token inside that sentence's map.
        :return: The value stored for the token.
        :raise noneValueError: If the sentence ID or the token ID is None.
        :raise KeyError: If the sentence ID or the token ID is unknown.
        """
        if sentence_id is None:
            raise exp.noneValueError('Sentence ID cannot be "None"')
        if token_id is None:
            raise exp.noneValueError('Token ID cannot be "None"')
        # Membership is tested on the dict itself: ``in d.keys()`` builds
        # and scans a list on Python 2.
        if sentence_id not in self.sentence_map:
            raise KeyError(
                'The provided sentence ID doesnot exist.\nFound: {}'.format(
                    sentence_id))
        token_map = self.sentence_map[sentence_id]
        if token_id not in token_map:
            raise KeyError(
                'The provided token ID doesnot exist.\nFound: {}'.format(
                    token_id))
        return token_map[token_id]
示例#4
0
 def set_one_hot_reader(self, key=None, dimension_multiplier=1.3):
     """Configure *key* with a one-hot reader built from the file's elements.

     The elements for the key are fetched with
     ``self.file_reader.get_key_elements``. A list yields a
     ``listOneHotVectorReader`` and a dict yields a
     ``classMembersOneHotVectorReader``.

     :param key: Configuration key; must exist in the vector configuration.
     :param dimension_multiplier: Passed to the reader; None means 1.3.
     :return: Nothing.
     :raise noneValueError: If the key is None.
     :raise KeyError: If the key does not exist in the vector configuration.
     :raise smallerValueError: If the multiplier is smaller than 1.0.
     :raise TypeError: If the key elements are neither a list nor a dict.
     """
     if key == None:
         raise exp.noneValueError('Configuration key cannot be "None"')
     if key not in self.vector_configuration.keys():
         raise KeyError(
             'The provided key doesnot exist.\nFound: {}'.format(key))

     if dimension_multiplier == None:
         dimension_multiplier = 1.3
     elif dimension_multiplier < 1.0:
         raise exp.smallerValueError(
             'Dimension multiplier cannot be smaller than 1.0.\nFound: {}'
             .format(dimension_multiplier))

     vocabulary = self.file_reader.get_key_elements(key=key)
     # Pick the reader class from the shape of the vocabulary.
     if isinstance(vocabulary, list):
         reader_class = listOneHotVectorReader
     elif isinstance(vocabulary, dict):
         reader_class = classMembersOneHotVectorReader
     else:
         raise TypeError(
             'Invalid Key Element data type found, expected list or dict.\nFound: {}'
             .format(type(vocabulary)))
     self.vector_configuration[key] = reader_class(vocabulary,
                                                   dimension_multiplier)
示例#5
0
 def updateReader(self, update_elements=None):
     """Extend the class/member vocabulary with additional entries.

     A class that is not yet known is registered with the supplied members
     and gets a dimension equal to the number of those members. For a class
     that already exists, only members not yet present are appended, and
     only if the class stays within its vector dimension.

     :param dict update_elements: Map from class name to a list of members.
     :return: Nothing.
     :raise noneValueError: If the map is None.
     :raise TypeError: If the map is not a dict.
     :raise zeroLengthValueError: If the map is empty.
     :raise greaterValueError: If an existing class would exceed its
         dimension. That class is left unchanged; classes processed earlier
         in the same call keep their updates.
     """
     if update_elements is None:
         raise exp.noneValueError(
             'Additional class list map cannot be "None"')
     if not isinstance(update_elements, dict):
         raise TypeError(
             'Additional class list map must be a dict object.\nFound: <{}>'
             .format(type(update_elements)))
     if not update_elements:
         raise exp.zeroLengthValueError(
             'Additional class list map cannot be empty.')

     for class_name, new_members in update_elements.items():
         if class_name not in self.classes:
             self.classes.append(class_name)
             # Copy so later in-place extensions do not alter the caller's
             # list.
             self.elements[class_name] = list(new_members)
             self.dimension[class_name] = len(new_members)
             continue

         current = self.elements.setdefault(class_name, [])
         # Collect genuinely new members, skipping repeats inside the
         # update itself as well as members already stored.
         additions = []
         for member in new_members:
             if member not in current and member not in additions:
                 additions.append(member)
         limit = self.dimension.get(class_name)
         new_size = len(current) + len(additions)
         # Check before mutating so a failed update leaves the class intact.
         if new_size > limit:
             raise exp.greaterValueError(
                 'Exceeding vector dimension limit for the class: {}.\n{}(old):{}(new)'
                 .format(class_name, limit, new_size))
         current.extend(additions)
示例#6
0
 def __init__(self, class_list_map=None, dimension_multiplier=1.3):
     """Build a one-hot reader over several classes of members.

     :param dict class_list_map: Map from class name to its list of members.
     :param dimension_multiplier: Factor applied to each class's member
         count to obtain that class's vector dimension; None means 1.3.
     :raise noneValueError: If the class list map is None.
     :raise TypeError: If the class list map is not a dict.
     :raise zeroLengthValueError: If the class list map is empty.
     :raise smallerValueError: If the multiplier is smaller than 1.0.
     """
     # test element list ---------------------------------------------------
     if class_list_map is None:
         raise exp.noneValueError('The class list map cannot be "None"')
     if not isinstance(class_list_map, dict):
         raise TypeError(
             'The class list map must be a dict object.\nFound: <{}>'.
             format(type(class_list_map)))
     if not class_list_map:
         raise exp.zeroLengthValueError(
             'The class list map cannot be empty.')
     # Own copies: the vocabulary is extended in place later, which must
     # not leak into the caller's data.
     self.elements = {
         name: list(members)
         for name, members in class_list_map.items()
     }
     # A real list (not a keys view) so new classes can be appended.
     self.classes = list(class_list_map.keys())
     # test dimension ------------------------------------------------------
     if dimension_multiplier is None:
         dimension_multiplier = 1.3
     elif dimension_multiplier < 1.0:
         raise exp.smallerValueError(
             'Dimension multiplier cannot be smaller than 1.\nFound: {}'.
             format(dimension_multiplier))
     self.dimension = {
         name: int(round(len(members) * dimension_multiplier))
         for name, members in self.elements.items()
     }
示例#7
0
 def get_vector(self, key=None):
     """Return the embedding vector stored in the data file for *key*.

     The key's line offset is looked up in ``self.vector_map``. The line is
     read from ``self.file_pointer`` and split on whitespace; every field
     after the first is returned as a float array. A key without an offset
     yields a zero vector of ``self.getDimension()`` entries.

     :param key: The vector search key.
     :return: A numpy array holding the vector.
     :raise noneValueError: If the key is None.
     """
     if key is None:
         raise exp.noneValueError('Vector search key cannot be "None".')
     # ``.get`` returns None for a missing key instead of raising KeyError,
     # so the fallback has to test the result explicitly.
     offset = self.vector_map.get(key)
     if offset is None:
         return npzeros(self.getDimension())
     self.file_pointer.seek(offset)
     fields = self.file_pointer.readline().split()
     # The first field is skipped; the remaining fields are the components.
     return nparray(fields[1:], dtype=float)
示例#8
0
 def __init__(self, file_reader=None, vector_config=None, window_width=10):
     """Set up the readers and data matrices for a data file.

     A ``vec.CoNLLFileVector`` is created around *file_reader*. Without a
     *vector_config*, one-hot readers are set for ``utils.TOKEN`` and
     ``utils.GPOS``; otherwise every entry of *vector_config* is passed to
     the vector reader's ``set_reader``. The data matrices are then
     populated.

     :param file_reader: A ``base.fileReader`` instance.
     :param dict vector_config: Optional map from configuration key to
         reader object.
     :param window_width: Stored as ``self.window_width``; at least 2.
     :raise noneValueError: If the file reader (or the created vector
         reader) is None.
     :raise TypeError: If the file reader, the created vector reader or
         the vector configuration has the wrong type.
     :raise ValueError: If the window width is less than 2.
     """
     if file_reader == None:
         raise exp.noneValueError('File reader cannot be "None"')
     if not isinstance(file_reader, base.fileReader):
         raise TypeError(
             'File reader must be a fileReader onject.\nFound: {}'.format(
                 type(file_reader)))
     self.input_reader = file_reader

     # initiate a vector reader using the file reader object
     self.vector_reader = vec.CoNLLFileVector(file_reader)
     if self.vector_reader == None:
         raise exp.noneValueError('File vector reader cannot be "None"')
     if not isinstance(self.vector_reader, base.fileVectorReader):
         raise TypeError(
             'File vector reader must be a fileVectorReader onject.\nFound: {}'
             .format(type(self.vector_reader)))

     # configure file vector reader by setting specific vector readers
     if vector_config == None:
         self.vector_reader.set_one_hot_reader(utils.TOKEN)
         self.vector_reader.set_one_hot_reader(utils.GPOS)
     elif not isinstance(vector_config, dict):
         raise TypeError(
             'Vector configuration must be a dictionary.\nFound: {}'
             .format(type(vector_config)))
     else:
         for part_key in vector_config.keys():
             self.vector_reader.set_reader(part_key,
                                           vector_config.get(part_key))

     minimum_width = 2
     if window_width < minimum_width:
         raise ValueError(
             'Window width cannot be less than {}.\nFound: {}'.format(
                 minimum_width, window_width))
     self.window_width = window_width

     self.relation_reader = vec.listOneHotVectorReader(
         element_list=self.input_reader.get_key_elements(
             key=utils.RELATION))
     self.input_data_matrix = {}
     self.output_data_matrix = {}
     self.__populate_data_metrix()
示例#9
0
 def get_vector(self, key=None):
     """Return the one-hot vector for *key*.

     The vector has ``self.dimension`` entries, all zero except a 1.0 at
     the position of *key* in ``self.elements``.

     :param key: The vector search key.
     :return: A numpy array holding the one-hot vector.
     :raise noneValueError: If the key is None.
     :raise KeyError: If the key is not in the vocabulary.
     """
     if key is None:
         raise exp.noneValueError('Vector search key cannot be "None"')
     # ``index`` on a list reports a missing item with ValueError (never
     # KeyError); translate it into the KeyError this method promises.
     try:
         position = self.elements.index(key)
     except ValueError:
         raise KeyError(
             'Key doesnot exist in the vocabulary.\nFound: {}'.format(key))
     vector = npzeros(self.dimension)
     vector[position] = 1.0
     return vector
示例#10
0
 def set_current_sentence(self, sentence_id=None):
     """Make *sentence_id* the current sentence.

     :param int sentence_id: A sentence number between 1 and
         ``self.metadata.get_sentence_count()``, both inclusive.
     :return: Nothing.
     :raise noneValueError: If the sentence ID is None.
     :raise TypeError: If the sentence ID is not an int.
     :raise ValueError: If the sentence ID is outside the valid range.
     """
     if sentence_id == None:
         raise exp.noneValueError('Sentence ID cannot be "None"')
     if not isinstance(sentence_id, int):
         raise TypeError('Sentence ID must be a integer.\nFound: {}'.format(
             sentence_id))
     first_id = 1
     out_of_range = (sentence_id < first_id
                     or sentence_id > self.metadata.get_sentence_count())
     if out_of_range:
         raise ValueError(
             'Sentence ID must be between {} and {}.\nFound: {}'.format(
                 first_id, self.metadata.get_sentence_count(), sentence_id))
     self.current_sentence = sentence_id
示例#11
0
 def generate_vector(self, sentence_id=None):
     """Build the token-ID to vector map for one sentence.

     The file reader is moved to *sentence_id*, every token of that
     sentence is converted to the concatenation of the vectors produced by
     the configured readers (in sorted key order), and the file reader is
     moved back to the sentence it was on before the call.

     :param sentence_id: The sentence to vectorize; passed to the file
         reader's ``set_current_sentence``.
     :return: dict mapping each token's ``utils.TID`` value to its vector.
     :raise noneValueError: If no reader is configured or the sentence is
         None.
     :raise TypeError: If a configured value is not a ``base.vectorReader``,
         or the sentence is not a list of ``base.annotatedString``.
     """
     previous_sentence_id = self.file_reader.get_current_sentence_id()
     configured = list(self.vector_configuration.values())
     if all(reader is None for reader in configured):
         raise exp.noneValueError(
             'All values of vector configuration is "None"')
     if any(reader is not None
            and not isinstance(reader, base.vectorReader)
            for reader in configured):
         raise TypeError(
             'Invalid vector configuration value found.\nFound: {}'.format(
                 self.vector_configuration))
     vector_keys = sorted(
         key for key, reader in self.vector_configuration.items()
         if reader is not None)

     self.file_reader.set_current_sentence(sentence_id)
     try:
         sentence = self.file_reader.get_current_sentence()
         if sentence is None:
             raise exp.noneValueError('Sentence cannot be "None"')
         if not isinstance(sentence, list):
             raise TypeError('Sentence must be a list.\nFound: {}'.format(
                 type(sentence)))
         if not all(isinstance(tok, base.annotatedString)
                    for tok in sentence):
             # The offending types are formatted into the message; they
             # used to be passed as a second, unformatted argument.
             raise TypeError(
                 'Sentence must be a list of "annotatedString".\nFound: {}'
                 .format([type(tok) for tok in sentence]))
         sentence_map = {}
         for tok in sentence:
             parts = [
                 self.vector_configuration[key].get_vector(
                     tok.getValue(key)) for key in vector_keys
             ]
             sentence_map[tok.getValue(utils.TID)] = npcat(parts)
     finally:
         # Always put the reader back, including when validation fails.
         self.file_reader.set_current_sentence(previous_sentence_id)

     # Every token is built from the same readers, so any vector gives the
     # dimension; this no longer requires a token with ID 1.
     if self.vector_dimension is None and sentence_map:
         self.vector_dimension = len(next(iter(sentence_map.values())))
     return sentence_map
示例#12
0
 def vectorize(self):
     """Vectorize every sentence of the file into ``self.sentence_map``.

     The file reader is reset, then each sentence is turned into a map from
     token ``utils.TID`` value to the concatenation of the vectors of all
     configured readers (in sorted key order). The map is stored under the
     reader's current sentence ID. The loop stops when the reader raises
     ``exp.lastElementWarning``.

     :return: Nothing.
     :raise noneValueError: If no reader is configured or a sentence is
         None.
     :raise TypeError: If a configured value is not a ``base.vectorReader``,
         or a sentence is not a list of ``base.annotatedString``.
     """
     self.file_reader.reset()
     configuration = self.vector_configuration

     if all([reader == None for reader in configuration.values()]):
         raise exp.noneValueError(
             'All values of vector configuration is "None"')
     if any([
             reader != None and not isinstance(reader, base.vectorReader)
             for reader in configuration.values()
     ]):
         raise TypeError(
             'Invalid vector configuration value found.\nFound: {}'.format(
                 self.vector_configuration))

     active_keys = sorted([
         name for name in configuration.keys()
         if configuration.get(name) != None
     ])

     sentence = self.file_reader.get_current_sentence()
     while True:
         if sentence == None:
             raise exp.noneValueError('Sentence cannot be "None"')
         if not isinstance(sentence, list):
             raise TypeError('Sentence must be a list')
         if not all(
             [isinstance(tok, base.annotatedString) for tok in sentence]):
             raise TypeError('Sentence must be a list of "annotatedString"')

         token_vectors = {}
         for tok in sentence:
             token_vectors[tok.getValue(utils.TID)] = npcat([
                 configuration.get(name).get_vector(tok.getValue(name))
                 for name in active_keys
             ])
         self.sentence_map[
             self.file_reader.get_current_sentence_id()] = token_vectors

         # The reader signals the end of the file with a warning exception.
         try:
             sentence = self.file_reader.get_next_sentence()
         except exp.lastElementWarning:
             break
示例#13
0
 def __init__(self, token=None, token_def=utils.CONLL_TOKEN_DEFINITION):
     """Build the annotation map of a token from its raw fields.

     Fields are paired by position with *token_def*. Fields defined as
     ``utils.NOT_IN_USE`` are skipped, ``utils.TID`` and
     ``utils.RELATION_HEAD`` fields are converted with ``int``, and all
     other fields are lower-cased.

     :param list token: The raw field values of the token.
     :param list token_def: The field definition for each position.
     :raise noneValueError: If the token or the definition is None.
     :raise TypeError: If the token or the definition is not a list.
     :raise unequalValueError: If both lists differ in length.
     """
     if token == None:
         raise exp.noneValueError('Token cannot be "None"')
     if not isinstance(token, list):
         raise TypeError('Token must be a list.\nFound: {}'.format(
             type(token)))
     if token_def == None:
         raise exp.noneValueError('Token Definition cannot be "None"')
     if not isinstance(token_def, list):
         raise TypeError(
             'Token Definition must be a list.\nFound: {}'.format(
                 type(token_def)))
     if len(token) != len(token_def):
         raise exp.unequalValueError(
             'Token list size doesnot match the definition list size.\n{}(Token):{}(Definition)'
             .format(len(token), len(token_def)))

     annotations = {}
     for field, raw_value in zip(token_def, token):
         if field != utils.NOT_IN_USE:
             if field in [utils.TID, utils.RELATION_HEAD]:
                 annotations[field] = int(raw_value)
             else:
                 annotations[field] = raw_value.lower()
     self.annotation_map = annotations
示例#14
0
 def get_dataset(self, **kwargs):
     """Randomly split the data points into named subsets.

     Every keyword argument names a subset and gives the fraction of the
     data points it receives; the fractions must add up to 1. Without
     keyword arguments a single subset named ``'all'`` receives everything.
     Data points left over after the per-subset counts are truncated to
     whole numbers are all added to one randomly chosen subset.

     :param kwargs: ``name=fraction`` pairs, each fraction in ``[0, 1]``.
     :return: dict mapping each subset name to ``[inputs, outputs]``, two
         parallel lists taken from ``self.input_data_matrix`` and
         ``self.output_data_matrix``.
     :raise noneValueError: If a fraction is None.
     :raise TypeError: If a fraction is not a number.
     :raise smallerValueError: If a fraction is below 0.
     :raise greaterValueError: If a fraction is above 1.
     :raise ValueError: If the fractions do not add up to 1.
     """
     import sys  # local import: only the debug message below needs it

     subsets = {}
     if not kwargs:
         kwargs['all'] = 1.0
         subsets['all'] = [[], []]
     else:
         for name, fraction in kwargs.items():
             if fraction is None:
                 raise exp.noneValueError(
                     'Dataset split value cannot be "None"')
             elif isinstance(fraction, bool) or not isinstance(
                     fraction, (int, float)):
                 raise TypeError(
                     'Dataset split value must be of numaric type.\nFound: {}'
                     .format(type(fraction)))
             elif fraction < 0:
                 raise exp.smallerValueError(
                     'Dataset split value cannot be less than 0.\nFound: {}'
                     .format(fraction))
             elif fraction > 1.0:
                 raise exp.greaterValueError(
                     'Dataset split value cannot be greater than 1.\nFound: {}'
                     .format(fraction))
             else:
                 subsets[name] = [[], []]
         total = sum(kwargs.values())
         # Tolerance instead of ``!= 1.0``: e.g. 0.7 + 0.2 + 0.1 is not
         # exactly 1.0 in binary floating point.
         if abs(total - 1.0) > 1e-9:
             raise ValueError(
                 'The sum of dataset split valuse must be equal to 1.0.\nFound: {}'
                 .format(total))

     # Work on a copy; indices are popped from this list below.
     remaining = list(self.__get_datapoint_index_list())
     split_counts = {
         name: int(len(remaining) * fraction)
         for name, fraction in kwargs.items()
     }
     # debug
     if sum(split_counts.values()) < len(remaining):
         sys.stdout.write(
             '>>>DEBUG: Real Total ( {} ) :: Split Total ( {} )\n'.format(
                 len(remaining), sum(split_counts.values())))

     for name, count in split_counts.items():
         count = min(count, len(remaining))
         for _ in range(count):
             index = remaining.pop(randint(0, len(remaining) - 1))
             subsets[name][0].append(self.input_data_matrix.get(index))
             subsets[name][1].append(self.output_data_matrix.get(index))

     if remaining:
         # ``list(...)`` so the keys can be indexed on Python 3 as well.
         names = list(split_counts.keys())
         lucky = names[randint(0, len(names) - 1)]
         for index in remaining:
             subsets[lucky][0].append(self.input_data_matrix.get(index))
             subsets[lucky][1].append(self.output_data_matrix.get(index))
     return subsets
示例#15
0
 def updateReader(self, update_elements=None):
     """Append elements that are not yet in the vocabulary.

     :param list update_elements: Candidate elements; those already in
         ``self.elements`` and repeats within the list are ignored.
     :return: Nothing.
     :raise noneValueError: If the list is None.
     :raise TypeError: If the argument is not a list.
     :raise zeroLengthValueError: If the list is empty.
     :raise greaterValueError: If the vocabulary would grow beyond
         ``self.dimension``; the vocabulary is left unchanged in that case.
     """
     if update_elements is None:
         raise exp.noneValueError(
             'Additional element list cannot be "None"')
     if not isinstance(update_elements, list):
         raise TypeError(
             'Additional element list must be a list object.\nFound: <{}>'.
             format(type(update_elements)))
     if not update_elements:
         raise exp.zeroLengthValueError(
             'Additional element list cannot be empty.')

     # Collect genuinely new elements, skipping repeats inside the update
     # itself as well as elements that are already stored.
     additions = []
     for element in update_elements:
         if element not in self.elements and element not in additions:
             additions.append(element)
     new_size = len(self.elements) + len(additions)
     # Check before mutating so a failed update leaves the reader intact.
     if new_size > self.dimension:
         raise exp.greaterValueError(
             'Exceeding vector dimension limit.\n{}(old):{}(new)'.
             format(self.dimension, new_size))
     self.elements.extend(additions)
示例#16
0
 def __init__(self, element_list=None, dimension_multiplier=1.3):
     """Build a one-hot reader over a list of elements.

     :param list element_list: The vocabulary; it is copied.
     :param dimension_multiplier: Factor applied to the vocabulary size to
         obtain the vector dimension; None means 1.3.
     :raise noneValueError: If the element list is None.
     :raise TypeError: If the element list is not a list.
     :raise zeroLengthValueError: If the element list is empty.
     :raise smallerValueError: If the multiplier is smaller than 1.0.
     """
     # test element list ---------------------------------------------------
     if element_list is None:
         raise exp.noneValueError('Element list cannot be "None"')
     if not isinstance(element_list, list):
         raise TypeError(
             'Element list must be a list object.\nFound: <{}>'.format(
                 type(element_list)))
     if not element_list:
         raise exp.zeroLengthValueError('Element list cannot be empty.')
     # Own copy: the vocabulary is extended in place later, which must not
     # leak into the caller's list.
     self.elements = list(element_list)
     # test dimension ------------------------------------------------------
     if dimension_multiplier is None:
         dimension_multiplier = 1.3
     elif dimension_multiplier < 1.0:
         raise exp.smallerValueError(
             'Dimension multiplier cannot be smaller than 1.\nFound: {}'.
             format(dimension_multiplier))
     self.dimension = int(round(len(self.elements) * dimension_multiplier))
示例#17
0
 def get_key_elements(self, key=None):
     """Return the metadata collection that belongs to a configuration key.

     :param int key: One of ``utils.TOKEN``, ``utils.LEMMA``,
         ``utils.GPOS``, ``utils.POS``, ``utils.MORPH`` or
         ``utils.RELATION``.
     :return: Whatever the matching ``self.metadata`` getter returns.
     :raise noneValueError: If the key is None.
     :raise TypeError: If the key is not an int.
     :raise KeyError: If the key is none of the known keys.
     """
     if key == None:
         raise exp.noneValueError('Element key cannot be "None"')
     if not isinstance(key, int):
         raise TypeError('Configuration key must be int.\nFound: {}'.format(
             type(key)))

     # (key constant, name of the metadata getter), tried in this order.
     getters = (
         (utils.TOKEN, 'get_token_list'),
         (utils.LEMMA, 'get_lemma_list'),
         (utils.GPOS, 'get_generic_pos_list'),
         (utils.POS, 'get_pos_list'),
         (utils.MORPH, 'get_morphological_class_value_map'),
         (utils.RELATION, 'get_relation_list'),
     )
     for known_key, getter_name in getters:
         if key == known_key:
             return getattr(self.metadata, getter_name)()
     raise KeyError('Unidentified key detected.\nFound: {}'.format(key))
示例#18
0
 def __init__(self, file_reader=None):
     """Create a file vector object with an empty vector configuration.

     :param file_reader: A ``base.fileReader`` instance; stored as
         ``self.file_reader``.
     :raise noneValueError: If the file reader is None.
     :raise TypeError: If the file reader is not a ``base.fileReader``.
     """
     # test reader ---------------------------------------------------------
     if file_reader == None:
         raise exp.noneValueError('Data file reader cannot be "None"')
     if not isinstance(file_reader, base.fileReader):
         raise TypeError(
             'Reader must be a fileReader object.\nFound: <{}>'.format(
                 type(file_reader)))
     self.file_reader = file_reader

     # initiate base configuration: every vector part starts without a
     # reader (None) ---------------------------------------------------------
     self.vector_configuration = dict.fromkeys(
         (utils.TOKEN, utils.LEMMA, utils.GPOS, utils.POS, utils.MORPH))

     # nothing vectorized yet ------------------------------------------------
     self.sentence_map = {}
     self.vector_dimension = None
示例#19
0
 def get_vector(self, key=None):
     """Return the concatenated one-hot vector for a class/member map.

     One segment of ``self.dimension[c]`` zeros is created for every class
     in ``self.classes``. If *key* names a member for that class, the
     member's position in ``self.elements[c]`` is set to 1.0.

     :param dict key: Map from class name to the selected member.
     :return: The concatenation of all class segments.
     :raise noneValueError: If the key is None.
     :raise TypeError: If the key is not a dict.
     :raise KeyError: If a class or one of its members is unknown.
     """
     if key == None:
         raise exp.noneValueError('Vector search key cannot be "None"')
     if not isinstance(key, dict):
         raise TypeError(
             'Vector search key must be a dict object.\nFound: <{}>'.format(
                 type(key)))

     # Validate the whole request before building anything.
     for class_name, member in key.items():
         if class_name not in self.classes:
             raise KeyError(
                 'Class doesnot exist in the vocabulary.\nFound: {}'.format(
                     class_name))
         if member not in self.elements.get(class_name):
             raise KeyError(
                 'A value for the class::{} doesnot exist.\nFound: {}'.
                 format(class_name, member))

     segments = []
     for class_name in self.classes:
         segment = npzeros(self.dimension.get(class_name))
         if class_name in key:
             hot = self.elements.get(class_name).index(key.get(class_name))
             segment[hot] = 1.0
         segments.append(segment)
     return npcat(segments)
示例#20
0
def doesTheFileExist(file_path=None):
    """Check that *file_path* is a usable path to an existing file.

    :param str file_path: The full path of the file to be tested.
    :return: True.
    :rtype: bool
    :raise noneValueError: If the file path is None.
    :raise TypeError: If the file path is not a string.
    :raise zeroLengthValueError: If the file path is a zero length string.
    :raise pathTypeIOError: If the path is a directory.
    :raise newFileIOError: If the parent directory exists but the file
        does not.
    :raise invalidPathIOError: If neither the file nor its parent directory
        exists.

    >>> doesTheFileExist(file_path='/my/own/path/fakedata.conll')  # doctest: +SKIP
    True
    """
    # ``basestring`` only exists on Python 2, where it covers both ``str``
    # and ``unicode``; fall back to ``str`` elsewhere.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    if file_path is None:
        raise exp.noneValueError('None was passed as file path')
    if not isinstance(file_path, string_types):
        raise TypeError('File path is not a string.\nFound: <{}>'.format(
            type(file_path)))
    if not file_path:
        raise exp.zeroLengthValueError('Empty string was passes as file path')
    # The directory test must come before the existence test: a directory
    # does exist, so testing it only for missing paths could never match.
    if os.path.isdir(file_path):
        raise exp.pathTypeIOError(
            'Path is not a file\nFound: <{}>'.format(file_path))
    if not os.path.exists(file_path):
        if os.path.isdir(os.path.dirname(file_path)):
            raise exp.newFileIOError(
                'Found parent directory but the file deos not exist')
        raise exp.invalidPathIOError(
            'Path does not exist.\nFound: <{}>'.format(file_path))

    return True
示例#21
0
 def __init__(self,
              input_file=None,
              meta_file=None,
              save_meta=True):  # DEF:: START --------
     """Analyze an input file and manage its metadata file.

     The input file is verified, its path is split into location, name and
     extension, and the distribution maps are initialised empty. A metadata
     file location is then chosen, loading of existing metadata is
     attempted, ``self.analyze`` is run on the input file, and the metadata
     is optionally saved.

     :param input_file: Path of the input file; checked with
         ``utils.doesTheFileExist``.
     :param meta_file: Path of the metadata file, or a directory to place
         it in. When unusable, a path next to the input file is used.
     :param bool save_meta: Whether to save the metadata after analysis.
     :raise IOError: If the input file check fails for any reason.
     :raise noneValueError: If ``save_meta`` is None.
     :raise TypeError: If ``save_meta`` is not a bool.

     NOTE(review): this block uses Python 2 only syntax (``print >>``,
     ``StandardError``).
     """
     # check input file path -----------------------------------------------
     # NOTE(review): the bare ``except`` turns every failure, including
     # KeyboardInterrupt, into a generic IOError and drops the cause.
     try:
         utils.doesTheFileExist(file_path=input_file)
     except:
         raise IOError('Input File I/O error.')
     # split the file path and the file name -------------------------------
     head, tail = os.path.split(input_file)
     # class variables -----------------------------------------------------
     self.file_location = head  #-------------------------------------------- input file path
     self.file_name, self.file_extension = os.path.splitext(
         tail)  #--------- input file name and extension
     self.file_hash_value = None  #------------------------------------------ hash value of the input file (for change in file monitoring)
     self.sentence_configuration = {
     }  #------------------------------------- sentence number to file pointer offset map (starting with sentence 0)
     self.token_distribution_map = {}
     self.lemma_distribution_map = {}
     self.gpos_distribution_map = {}
     self.pos_distribution_map = {}
     self.morphology_distribution_map = {}
     self.relation_distribution_map = {}
     # select the metadata file ... either the provided path or local path -
     #   newFileIOError      -> keep meta_file, but skip the loading step
     #   TypeError / ValueError / invalidPathIOError
     #                       -> fall back to a file next to the input file
     #   pathTypeIOError     -> meta_file is used as a directory and the
     #                          metadata file name is appended to it
     skip_loading = False
     try:
         utils.doesTheFileExist(file_path=meta_file)
     except exp.newFileIOError:
         skip_loading = True
     except (TypeError, ValueError, exp.invalidPathIOError):
         meta_file = self.file_location + '/' + self.file_name + utils.META_EXTENSION
     except exp.pathTypeIOError:
         meta_file = meta_file + '/' + self.file_name + utils.META_EXTENSION
     # analyze or load metadata --------------------------------------------
     # Any Warning (including the skipStepWarning raised here on purpose)
     # and any StandardError from loading is reported on stderr and
     # otherwise ignored.
     try:
         if skip_loading:
             raise exp.skipStepWarning
         # attempting to load metadata -------------------------------------
         self.load_metadate(
             meta_file=meta_file,
             current_hash=utils.generate_hash(source_file=input_file))
     except Warning:
         print >> sys.stderr, 'WARNING: running analysis ...'
         print >> sys.stderr, 'Metadata: {}'.format(meta_file)
     except StandardError as e:
         print >> sys.stderr, 'WARNING: Failed loading metadata file ... running analysis.'
         print >> sys.stderr, e
     # running analysis ----------------------------------------------------
     # NOTE(review): the analysis runs unconditionally, also when the
     # metadata was loaded without error -- confirm this is intended.
     self.analyze(in_file=input_file)
     # saving metadata -----------------------------------------------------
     # NOTE(review): save_meta is validated only after the analysis has run.
     if save_meta == None:
         raise exp.noneValueError('save_meta option flag cnnot be "None"')
     elif not isinstance(save_meta, bool):
         raise TypeError(
             'save_meta option flag must be bool type.\nFound: <{}>'.format(
                 type(save_meta)))
     elif save_meta:
         # A failed save is reported on stderr but does not abort
         # construction.
         try:
             self.save_metadata(meta_file=meta_file)
         except Exception as e:
             print >> sys.stderr, 'WARNING: Metadata file not saved... re-analysis will be needed next time.'
             print >> sys.stderr, e