def set_reader(self, key=None, reader=None):
    """*The key method to manipulate the vector configuration. Each call
    may configure one vector part specified by the key. If custom
    embedding is selected as the vector type, a valid embedding data
    file with correct format value (default is Word2Vec Text Format)
    must be provided.*

    :param int key: The vector-configuration key to be configured.
    :param fileReader reader: The reader object that will produce the
        vectors for the given key (see note).
    :return: Nothing.
    :raise noneValueError: If the key is None.
    :raise noneValueError: If the reader is None.
    :raise KeyError: If the key does not exist in the vector
        configuration.
    :raise TypeError: If the reader object is not of valid type.

    .. Note:: fileReader is the generic base class from which the
        concrete reader classes accepted here are derived.
    """
    # validate the key first: it must be an existing configuration key
    if key is None:
        raise exp.noneValueError('Configuration key cannot be "None"')
    if key not in self.vector_configuration:
        raise KeyError(
            'The provided key doesnot exist.\nFound: {}'.format(key))
    # validate the reader: a non-None instance of the reader base class
    if reader is None:
        raise exp.noneValueError('The reader object cannot be "None"')
    if not isinstance(reader, base.fileReader):
        raise TypeError(
            'The reader must be a fileReader object.\nFound: {}'.format(
                type(reader)))
    self.vector_configuration[key] = reader
def setValue(self, annotation_key=None, value=None):
    """Store *value* under *annotation_key* in the annotation map.

    :param annotation_key: one of the keys known to this annotation.
    :param value: the value to store; must not be None.
    :raise noneValueError: if the key or the value is None.
    :raise KeyError: if the key is not a known annotation key.
    """
    # guard clauses: reject missing keys, unknown keys, missing values
    if annotation_key == None:
        raise exp.noneValueError('Annotation Key cannot be "None"')
    if annotation_key not in self.getAnnotationKeyList():
        raise KeyError(
            'Invalid key passed.\nFound: {}'.format(annotation_key))
    if value == None:
        raise exp.noneValueError('Annotation value cannot be "None"')
    self.annotation_map[annotation_key] = value
def get_vector(self, sentence_id=None, token_id=None):
    """Fetch the pre-computed vector of one token.

    :param int sentence_id: ID of the sentence holding the token.
    :param int token_id: ID of the token inside that sentence.
    :return: The vector stored for the (sentence, token) pair.
    :raise noneValueError: if either ID is None.
    :raise KeyError: if either ID is unknown.
    """
    if sentence_id == None:
        raise exp.noneValueError('Sentence ID cannot be "None"')
    if token_id == None:
        raise exp.noneValueError('Token ID cannot be "None"')
    if sentence_id not in self.sentence_map.keys():
        raise KeyError(
            'The provided sentence ID doesnot exist.\nFound: {}'.format(
                sentence_id))
    token_map = self.sentence_map.get(sentence_id)
    if token_id not in token_map.keys():
        raise KeyError(
            'The provided token ID doesnot exist.\nFound: {}'.format(
                token_id))
    return token_map.get(token_id)
def set_one_hot_reader(self, key=None, dimension_multiplier=1.3):
    """Attach a one-hot vector reader to one configuration key.

    The elements for *key* are pulled from the file reader: a list of
    elements yields a listOneHotVectorReader while a class/member dict
    yields a classMembersOneHotVectorReader.

    :param int key: configuration key to set up.
    :param float dimension_multiplier: over-allocation factor for the
        vector dimension (None falls back to the 1.3 default).
    :raise noneValueError: if the key is None.
    :raise KeyError: if the key is unknown.
    :raise smallerValueError: if the multiplier is below 1.0.
    :raise TypeError: if the key elements are neither list nor dict.
    """
    if key == None:
        raise exp.noneValueError('Configuration key cannot be "None"')
    if key not in self.vector_configuration.keys():
        raise KeyError(
            'The provided key doesnot exist.\nFound: {}'.format(key))
    # normalize the multiplier before building the reader
    if dimension_multiplier == None:
        dimension_multiplier = 1.3
    elif dimension_multiplier < 1.0:
        raise exp.smallerValueError(
            'Dimension multiplier cannot be smaller than 1.0.\nFound: {}'
            .format(dimension_multiplier))
    key_elements = self.file_reader.get_key_elements(key=key)
    # dispatch on the element container type
    if isinstance(key_elements, list):
        reader = listOneHotVectorReader(key_elements,
                                        dimension_multiplier)
    elif isinstance(key_elements, dict):
        reader = classMembersOneHotVectorReader(key_elements,
                                                dimension_multiplier)
    else:
        raise TypeError(
            'Invalid Key Element data type found, expected list or dict.\nFound: {}'
            .format(type(key_elements)))
    self.vector_configuration[key] = reader
def updateReader(self, update_elements=None):
    """Merge additional class-to-member-list entries into this reader.

    New classes are adopted wholesale (their dimension becomes the new
    list length); existing classes get only the previously unseen
    members appended, which must stay within the already fixed
    per-class dimension.

    :param dict update_elements: map of class name to member list.
    :raise noneValueError: if the map is None.
    :raise TypeError: if the map is not a dict.
    :raise zeroLengthValueError: if the map is empty.
    :raise greaterValueError: if an existing class outgrows its
        dimension.
    """
    if update_elements is None:
        raise exp.noneValueError(
            'Additional class list map cannot be "None"')
    if not isinstance(update_elements, dict):
        raise TypeError(
            'Additional class list map must be a dict object.\nFound: <{}>'
            .format(type(update_elements)))
    if not len(update_elements):
        raise exp.zeroLengthValueError(
            'Additional class list map cannot be empty.')
    for k in update_elements.keys():
        if k not in self.classes:
            # unseen class: adopt it and fix its dimension
            self.classes.append(k)
            self.elements[k] = update_elements.get(k)
            self.dimension[k] = len(update_elements.get(k))
        else:
            # known class: append only genuinely new members
            current = self.elements.get(k, [])
            current.extend(
                [e for e in update_elements.get(k) if e not in current])
            self.elements[k] = current
            if len(current) > self.dimension.get(k):
                # BUG FIX: the message used to format the whole
                # dimension dict and the class count instead of the
                # per-class old dimension and new size.
                raise exp.greaterValueError(
                    'Exceeding vector dimension limit for the class: {}.\n{}(old):{}(new)'
                    .format(k, self.dimension.get(k), len(current)))
def __init__(self, class_list_map=None, dimension_multiplier=1.3):
    """Build a class-wise one-hot reader from a class->member-list map.

    :param dict class_list_map: map of class name to its member list.
    :param float dimension_multiplier: over-allocation factor applied
        to every per-class dimension (None falls back to 1.3).
    :raise noneValueError: if the map is None.
    :raise TypeError: if the map is not a dict.
    :raise zeroLengthValueError: if the map is empty.
    :raise smallerValueError: if the multiplier is below 1.0.
    """
    # validate the element map --------------------------------------------
    if class_list_map == None:
        raise exp.noneValueError('The class list map cannot be "None"')
    if not isinstance(class_list_map, dict):
        raise TypeError(
            'The class list map must be a dict object.\nFound: <{}>'.
            format(type(class_list_map)))
    if not len(class_list_map):
        raise exp.zeroLengthValueError(
            'The class list map cannot be empty.')
    self.elements = class_list_map
    self.classes = class_list_map.keys()
    # validate the multiplier ---------------------------------------------
    if dimension_multiplier == None:
        dimension_multiplier = 1.3
    elif dimension_multiplier < 1.0:
        raise exp.smallerValueError(
            'Dimension multiplier cannot be smaller than 1.\nFound: {}'.
            format(dimension_multiplier))
    # per-class vector dimension, over-allocated by the multiplier
    self.dimension = {}
    for cls in sorted(self.elements.keys()):
        self.dimension[cls] = int(
            round(len(self.elements.get(cls)) * dimension_multiplier))
def get_vector(self, key=None):
    """Look up the embedding vector for *key* in the embedding file.

    Unknown keys yield an all-zero vector of the reader's dimension
    (out-of-vocabulary fallback) instead of raising.

    :param str key: the token to look up.
    :return: numpy array holding the token's embedding values.
    :raise noneValueError: if the key is None.
    """
    if key is None:
        raise exp.noneValueError('Vector search key cannot be "None".')
    # BUG FIX: dict.get() never raises KeyError, so the old
    # 'except KeyError' fallback was dead code and seek(None) crashed
    # on unknown keys; test membership explicitly instead.
    if key not in self.vector_map:
        return npzeros(self.getDimension())
    self.file_pointer.seek(self.vector_map[key])
    # line format: '<token> <v1> <v2> ...' -> drop the token, parse the
    # values (BUG FIX: the old code called .strip() on the split list)
    return nparray(self.file_pointer.readline().strip().split()[1:],
                   dtype=float)
def __init__(self, file_reader=None, vector_config=None, window_width=10):
    """Wire up a training-data generator around a CoNLL file reader.

    Builds a CoNLLFileVector over *file_reader*, configures its per-key
    vector readers (one-hot TOKEN/GPOS by default, or the readers given
    in *vector_config*), prepares the relation one-hot reader and then
    populates the input/output data matrices.

    :param fileReader file_reader: reader over the annotated input file.
    :param dict vector_config: optional {key: reader} overrides.
    :param int window_width: context window size, must be >= 2.
    :raise noneValueError: if the file reader (or the created vector
        reader) is None.
    :raise TypeError: if an argument has the wrong type.
    :raise ValueError: if window_width is below 2.
    """
    if file_reader == None:
        raise exp.noneValueError('File reader cannot be "None"')
    elif not isinstance(file_reader, base.fileReader):
        # NOTE(review): 'onject' typo in this message — left unchanged
        # in this documentation-only edit.
        raise TypeError(
            'File reader must be a fileReader onject.\nFound: {}'.format(
                type(file_reader)))
    else:
        self.input_reader = file_reader
    # initiate a vector reader using the file reader object
    self.vector_reader = vec.CoNLLFileVector(file_reader)
    if self.vector_reader == None:
        raise exp.noneValueError('File vector reader cannot be "None"')
    elif not isinstance(self.vector_reader, base.fileVectorReader):
        raise TypeError(
            'File vector reader must be a fileVectorReader onject.\nFound: {}'
            .format(type(self.vector_reader)))
    # configure file vector reader by setting specific vector readers;
    # without an explicit config, TOKEN and GPOS get one-hot readers
    if vector_config == None:
        self.vector_reader.set_one_hot_reader(utils.TOKEN)
        self.vector_reader.set_one_hot_reader(utils.GPOS)
    else:
        if not isinstance(vector_config, dict):
            raise TypeError(
                'Vector configuration must be a dictionary.\nFound: {}'
                .format(type(vector_config)))
        else:
            for k in vector_config.keys():
                self.vector_reader.set_reader(k, vector_config.get(k))
    #self.vector_reader.vectorize()
    if window_width < 2:
        raise ValueError(
            'Window width cannot be less than {}.\nFound: {}'.format(
                2, window_width))
    else:
        self.window_width = window_width
    # one-hot reader over the relation label set (the training target)
    self.relation_reader = vec.listOneHotVectorReader(
        element_list=self.input_reader.get_key_elements(
            key=utils.RELATION))
    self.input_data_matrix = {}
    self.output_data_matrix = {}
    # NOTE(review): the called method spells 'metrix' — defined
    # elsewhere in the class; name kept as-is here.
    self.__populate_data_metrix()
def get_vector(self, key=None):
    """Return the one-hot vector for *key*.

    :param key: element to encode; must be in self.elements.
    :return: numpy array of length self.dimension with a single 1.0 at
        the element's position.
    :raise noneValueError: if the key is None.
    :raise KeyError: if the key is not in the vocabulary.
    """
    if key is None:
        raise exp.noneValueError('Vector search key cannot be "None"')
    try:
        vpos = self.elements.index(key)
    except ValueError:
        # BUG FIX: list.index raises ValueError, not KeyError — the old
        # 'except KeyError' handler never fired and the raw ValueError
        # escaped to the caller.
        raise KeyError(
            'Key doesnot exist in the vocabulary.\nFound: {}'.format(key))
    vector = npzeros(self.dimension)
    vector[vpos] = 1.0
    return vector
def set_current_sentence(self, sentence_id=None):
    """Move the reader's cursor to the sentence with *sentence_id*.

    :param int sentence_id: 1-based sentence index.
    :raise noneValueError: if the ID is None.
    :raise TypeError: if the ID is not an integer.
    :raise ValueError: if the ID is outside the valid range.
    """
    if sentence_id == None:
        raise exp.noneValueError('Sentence ID cannot be "None"')
    if not isinstance(sentence_id, int):
        raise TypeError('Sentence ID must be a integer.\nFound: {}'.format(
            sentence_id))
    upper = self.metadata.get_sentence_count()
    if not (1 <= sentence_id <= upper):
        raise ValueError(
            'Sentence ID must be between {} and {}.\nFound: {}'.format(
                1, upper, sentence_id))
    self.current_sentence = sentence_id
def generate_vector(self, sentence_id=None):
    """Build and return the {token-id: vector} map for one sentence.

    The reader's current-sentence cursor is saved, moved to
    *sentence_id* and restored before returning, so sequential
    iteration done elsewhere is not disturbed.

    :param int sentence_id: ID of the sentence to vectorize.
    :return: dict mapping token ID to its concatenated vector.
    :raise noneValueError: if no vector reader is configured or the
        sentence is None.
    :raise TypeError: if the configuration or the sentence is malformed.
    """
    # save the cursor so it can be restored at the end
    curSentID = self.file_reader.get_current_sentence_id()
    if all(e is None for e in self.vector_configuration.values()):
        raise exp.noneValueError(
            'All values of vector configuration is "None"')
    if any(e is not None and not isinstance(e, base.vectorReader)
           for e in self.vector_configuration.values()):
        raise TypeError(
            'Invalid vector configuration value found.\nFound: {}'.format(
                self.vector_configuration))
    # keys with an actual reader, sorted so the concatenation order of
    # the sub-vectors is deterministic
    vectorKeys = sorted(
        k for k in self.vector_configuration.keys()
        if self.vector_configuration.get(k) is not None)
    self.file_reader.set_current_sentence(sentence_id)
    curSentence = self.file_reader.get_current_sentence()
    if curSentence is None:
        raise exp.noneValueError('Sentence cannot be "None"')
    if not isinstance(curSentence, list):
        raise TypeError('Sentence must be a list.\nFound: {}'.format(
            type(curSentence)))
    if not all(isinstance(e, base.annotatedString) for e in curSentence):
        # BUG FIX: the type list used to be passed as a second argument
        # to TypeError instead of being formatted into the message.
        raise TypeError(
            'Sentence must be a list of "annotatedString".\nFound: {}'
            .format([type(e) for e in curSentence]))
    curSentenceMap = {}
    for tok in curSentence:
        # one sub-vector per configured key, concatenated per token
        vector_list = [
            self.vector_configuration.get(key).get_vector(
                tok.getValue(key)) for key in vectorKeys
        ]
        curSentenceMap[tok.getValue(utils.TID)] = npcat(vector_list)
    if self.vector_dimension is None:
        # NOTE(review): assumes the sentence contains a token with
        # ID 1 — confirm token IDs always start at 1.
        self.vector_dimension = len(curSentenceMap[1])
    # restore the saved cursor position
    self.file_reader.set_current_sentence(curSentID)
    return curSentenceMap
def vectorize(self):
    """Pre-compute vectors for every sentence in the input file.

    Iterates the file reader from the first sentence to the last and
    fills self.sentence_map with {sentence-id: {token-id: vector}}.

    :raise noneValueError: if no vector reader is configured or a
        sentence is None.
    :raise TypeError: if the configuration or a sentence is malformed.
    """
    # rewind the reader to the first sentence
    self.file_reader.reset()
    # at least one configured reader is required ...
    if all([e == None for e in self.vector_configuration.values()]):
        raise exp.noneValueError(
            'All values of vector configuration is "None"')
    elif any([
            e != None and not isinstance(e, base.vectorReader)
            for e in self.vector_configuration.values()
    ]):
        # ... and every configured value must be a vectorReader
        raise TypeError(
            'Invalid vector configuration value found.\nFound: {}'.format(
                self.vector_configuration))
    # keys with an actual reader, sorted so the concatenation order of
    # the sub-vectors is deterministic
    vectorKeys = sorted([
        k for k in self.vector_configuration.keys()
        if self.vector_configuration.get(k) != None
    ])
    curSentence = self.file_reader.get_current_sentence()
    # loop until the reader signals the last sentence via a warning
    while True:
        if curSentence == None:
            raise exp.noneValueError('Sentence cannot be "None"')
        elif not isinstance(curSentence, list):
            raise TypeError('Sentence must be a list')
        elif not all(
            [isinstance(e, base.annotatedString) for e in curSentence]):
            raise TypeError('Sentence must be a list of "annotatedString"')
        curSentenceMap = {}
        for tok in curSentence:
            # one sub-vector per configured key, concatenated per token
            vector_list = []
            for key in vectorKeys:
                vector_list.append(
                    self.vector_configuration.get(key).get_vector(
                        tok.getValue(key)))
            curSentenceMap[tok.getValue(utils.TID)] = npcat(vector_list)
        self.sentence_map[
            self.file_reader.get_current_sentence_id()] = curSentenceMap
        try:
            curSentence = self.file_reader.get_next_sentence()
        except exp.lastElementWarning:
            # end of file reached: every sentence has been vectorized
            break
def __init__(self, token=None, token_def=utils.CONLL_TOKEN_DEFINITION):
    """Build the annotation map for one CoNLL token line.

    :param list token: raw annotation values of one token.
    :param list token_def: annotation key for each position of *token*.
    :raise noneValueError: if the token or the definition is None.
    :raise TypeError: if either argument is not a list.
    :raise unequalValueError: if the two lists differ in length.
    """
    if token == None:
        raise exp.noneValueError('Token cannot be "None"')
    if not isinstance(token, list):
        raise TypeError('Token must be a list.\nFound: {}'.format(
            type(token)))
    if token_def == None:
        raise exp.noneValueError('Token Definition cannot be "None"')
    if not isinstance(token_def, list):
        raise TypeError(
            'Token Definition must be a list.\nFound: {}'.format(
                type(token_def)))
    if len(token) != len(token_def):
        raise exp.unequalValueError(
            'Token list size doesnot match the definition list size.\n{}(Token):{}(Definition)'
            .format(len(token), len(token_def)))
    # keep every field that is in use; ID-like fields become ints, the
    # rest is stored lower-cased
    self.annotation_map = {}
    for pos, raw_value in enumerate(token):
        field_key = token_def[pos]
        if field_key == utils.NOT_IN_USE:
            continue
        if field_key in [utils.TID, utils.RELATION_HEAD]:
            self.annotation_map[field_key] = int(raw_value)
        else:
            self.annotation_map[field_key] = raw_value.lower()
def get_dataset(self, **kwargs):
    """Split the generated data points into named datasets.

    Keyword arguments name the splits and give each one a float
    fraction in [0, 1]; the fractions must sum to 1.0 (within a small
    tolerance). With no arguments a single split named 'all' holding
    everything is returned. Data points are drawn randomly without
    replacement; rounding leftovers go to one randomly chosen split.

    :return: dict mapping split name to [input-list, output-list].
    :raise noneValueError: if a split value is None.
    :raise TypeError: if a split value is not a float.
    :raise smallerValueError: if a split value is below 0.
    :raise greaterValueError: if a split value is above 1.0.
    :raise ValueError: if the split values do not sum to 1.0.
    """
    retMap = {}
    if len(kwargs) == 0:
        kwargs['all'] = 1.0
        retMap['all'] = [[], []]
    else:
        for key, value in kwargs.items():
            if value is None:
                raise exp.noneValueError(
                    'Dataset split value cannot be "None"')
            elif not isinstance(value, float):
                # BUG FIX: message used '/n' instead of '\n' and the
                # 'numaric' typo
                raise TypeError(
                    'Dataset split value must be of numeric type.\nFound: {}'
                    .format(type(value)))
            elif value < 0:
                raise exp.smallerValueError(
                    'Dataset split value cannot be less than 0.\nFound: {}'
                    .format(value))
            elif value > 1.0:
                raise exp.greaterValueError(
                    'Dataset split value cannot be greater than 1.\nFound: {}'
                    .format(value))
            else:
                retMap[key] = [[], []]
    # BUG FIX: exact float equality rejected valid splits such as
    # 0.1 + 0.2 + 0.7 due to rounding; compare within a tolerance.
    if abs(sum(kwargs.values()) - 1.0) > 1e-9:
        raise ValueError(
            'The sum of dataset split values must be equal to 1.0.\nFound: {}'
            .format(sum(kwargs.values())))
    elements = self.__get_datapoint_index_list()
    splitCountMap = {k: int(len(elements) * v) for k, v in kwargs.items()}
    # rounding down may leave a few data points unassigned; report it
    if sum(splitCountMap.values()) < len(elements):
        print('>>>DEBUG: Real Total ({}) :: Split Total ({})'.format(
            len(elements), sum(splitCountMap.values())))
    for key, value in splitCountMap.items():
        if len(elements) < value:
            value = len(elements)
        for i in range(value):
            # draw a random remaining data point for this split
            cIndex = elements.pop(randint(0, len(elements) - 1))
            retMap[key][0].append(self.input_data_matrix.get(cIndex))
            retMap[key][1].append(self.output_data_matrix.get(cIndex))
    if len(elements):
        # hand any rounding leftovers to one randomly chosen split
        # (BUG FIX: keys() view is not subscriptable on Python 3)
        key = list(splitCountMap)[randint(0, len(splitCountMap) - 1)]
        for e in elements:
            retMap[key][0].append(self.input_data_matrix.get(e))
            retMap[key][1].append(self.output_data_matrix.get(e))
    return retMap
def updateReader(self, update_elements=None):
    """Extend the element vocabulary with previously unseen entries.

    :param list update_elements: candidate elements to add.
    :raise noneValueError: if the list is None.
    :raise TypeError: if the argument is not a list.
    :raise zeroLengthValueError: if the list is empty.
    :raise greaterValueError: if the vocabulary outgrows the dimension.
    """
    if update_elements == None:
        raise exp.noneValueError(
            'Additional element list cannot be "None"')
    if not isinstance(update_elements, list):
        raise TypeError(
            'Additional element list must be a list object.\nFound: <{}>'.
            format(type(update_elements)))
    if not len(update_elements):
        raise exp.zeroLengthValueError(
            'Additional element list cannot be empty.')
    # collect the candidates not yet known (checked against the
    # pre-update vocabulary), then append them in one go
    fresh = [candidate for candidate in update_elements
             if candidate not in self.elements]
    self.elements.extend(fresh)
    if len(self.elements) > self.dimension:
        raise exp.greaterValueError(
            'Exceeding vector dimension limit.\n{}(old):{}(new)'.
            format(self.dimension, len(self.elements)))
def __init__(self, element_list=None, dimension_multiplier=1.3):
    """Build a one-hot reader over a flat element list.

    :param list element_list: vocabulary of the reader.
    :param float dimension_multiplier: over-allocation factor for the
        vector dimension (None falls back to 1.3).
    :raise noneValueError: if the list is None.
    :raise TypeError: if the argument is not a list.
    :raise zeroLengthValueError: if the list is empty.
    :raise smallerValueError: if the multiplier is below 1.0.
    """
    # validate the element list -------------------------------------------
    if element_list == None:
        raise exp.noneValueError('Element list cannot be "None"')
    if not isinstance(element_list, list):
        raise TypeError(
            'Element list must be a list object.\nFound: <{}>'.format(
                type(element_list)))
    if not len(element_list):
        raise exp.zeroLengthValueError('Element list cannot be empty.')
    self.elements = element_list
    # validate the multiplier ---------------------------------------------
    if dimension_multiplier == None:
        dimension_multiplier = 1.3
    elif dimension_multiplier < 1.0:
        raise exp.smallerValueError(
            'Dimension multiplier cannot be smaller than 1.\nFound: {}'.
            format(dimension_multiplier))
    # over-allocate the dimension so the vocabulary can still grow
    self.dimension = int(round(len(self.elements) * dimension_multiplier))
def get_key_elements(self, key=None):
    """Return the metadata element collection for one configuration key.

    :param int key: one of the utils.* configuration constants.
    :return: the matching element list (or the class/value map for
        utils.MORPH) taken from the file metadata.
    :raise noneValueError: if the key is None.
    :raise TypeError: if the key is not an int.
    :raise KeyError: if the key matches no known constant.
    """
    if key == None:
        raise exp.noneValueError('Element key cannot be "None"')
    if not isinstance(key, int):
        raise TypeError('Configuration key must be int.\nFound: {}'.format(
            type(key)))
    # flat guard returns, one per known configuration constant
    if key == utils.TOKEN:
        return self.metadata.get_token_list()
    if key == utils.LEMMA:
        return self.metadata.get_lemma_list()
    if key == utils.GPOS:
        return self.metadata.get_generic_pos_list()
    if key == utils.POS:
        return self.metadata.get_pos_list()
    if key == utils.MORPH:
        return self.metadata.get_morphological_class_value_map()
    if key == utils.RELATION:
        return self.metadata.get_relation_list()
    raise KeyError('Unidentified key detected.\nFound: {}'.format(key))
def __init__(self, file_reader=None):
    """Wrap a file reader and prepare an empty vector configuration.

    :param fileReader file_reader: reader for the annotated data file.
    :raise noneValueError: if the reader is None.
    :raise TypeError: if the reader is not a fileReader.
    """
    # validate the reader -------------------------------------------------
    if file_reader == None:
        raise exp.noneValueError('Data file reader cannot be "None"')
    if not isinstance(file_reader, base.fileReader):
        raise TypeError(
            'Reader must be a fileReader object.\nFound: <{}>'.format(
                type(file_reader)))
    self.file_reader = file_reader
    # initiate base configuration: every configurable vector part starts
    # unset — by default no embeddings shall be used
    self.vector_configuration = {}
    for part in (utils.TOKEN, utils.LEMMA, utils.GPOS, utils.POS,
                 utils.MORPH):
        self.vector_configuration[part] = None
    # sentence-id -> {token-id -> vector} cache and its dimension ---------
    self.sentence_map = {}
    self.vector_dimension = None
def get_vector(self, key=None):
    """Build the concatenated per-class one-hot vector for *key*.

    Each class in the vocabulary contributes one sub-vector; classes
    present in *key* get their member's position set to 1.0, absent
    classes contribute all zeros.

    :param dict key: map of class name to the member to encode.
    :return: numpy array — concatenation of all per-class sub-vectors.
    :raise noneValueError: if the key is None.
    :raise TypeError: if the key is not a dict.
    :raise KeyError: if a class or a member is unknown.
    """
    if key == None:
        raise exp.noneValueError('Vector search key cannot be "None"')
    if not isinstance(key, dict):
        raise TypeError(
            'Vector search key must be a dict object.\nFound: <{}>'.format(
                type(key)))
    # validate every requested class/member pair up front
    for cls in key.keys():
        if cls not in self.classes:
            raise KeyError(
                'Class doesnot exist in the vocabulary.\nFound: {}'.format(
                    cls))
        if key.get(cls) not in self.elements.get(cls):
            raise KeyError(
                'A value for the class::{} doesnot exist.\nFound: {}'.
                format(cls, key.get(cls)))
    parts = []
    for cls in self.classes:
        sub_vector = npzeros(self.dimension.get(cls))
        if cls in key.keys():
            sub_vector[self.elements.get(cls).index(key.get(cls))] = 1.0
        parts.append(sub_vector)
    return npcat(parts)
def doesTheFileExist(
        file_path=None):  # - DEF::START --------------------------
    """ *Method to check if file path (*file_path*) is correct and the file
    actually exists.*

    :param str file_path: The full path of the file to be tested.
    :return: True.
    :rtype: bool
    :raise noneValueError: If the file path is None.
    :raise TypeError: If the file path is not a String.
    :raise zeroLengthValueError: If the file path is a zero length
        String.
    :raise invalidPathIOError: If the file path does not exist.
    :raise newFileIOError: The directory exists but the file does not.
    :raise pathTypeIOError: If the path is a directory.

    >>> doesTheFileExist(file_path='/my/own/path/fakedata.conll')
    True
    """
    if file_path is None:
        raise exp.noneValueError('None was passed as file path')
    if not isinstance(file_path, basestring):
        raise TypeError('File path is not a string.\nFound: <{}>'.format(
            type(file_path)))
    if not len(file_path):
        raise exp.zeroLengthValueError('Empty string was passes as file path')
    # BUG FIX: the directory test used to sit inside the "path does not
    # exist" branch, where os.path.isdir() is necessarily False — so a
    # directory path silently returned True. Check for a directory first.
    if os.path.isdir(file_path):
        raise exp.pathTypeIOError(
            'Path is not a file\nFound: <{}>'.format(file_path))
    if not os.path.exists(file_path):
        if os.path.isdir(os.path.dirname(file_path)):
            raise exp.newFileIOError(
                'Found parent directory but the file deos not exist')
        raise exp.invalidPathIOError(
            'Path does not exist.\nFound: <{}>'.format(file_path))
    return True
def __init__(self, input_file=None, meta_file=None, save_meta=True):  # DEF:: START --------
    """Open an annotated input file and load or build its metadata.

    :param str input_file: path of the annotated input data file.
    :param str meta_file: optional path of a saved metadata file; when
        missing or unusable a default path derived from the input file
        is used instead.
    :param bool save_meta: whether to persist freshly built metadata.
    :raise IOError: if the input file path is unusable.
    :raise noneValueError: if save_meta is None.
    :raise TypeError: if save_meta is not a bool.
    """
    # check input file path -----------------------------------------------
    # NOTE(review): the bare 'except' converts every validation error
    # into a generic IOError, hiding the specific cause.
    try:
        utils.doesTheFileExist(file_path=input_file)
    except:
        raise IOError('Input File I/O error.')
    # split the file path and the file name -------------------------------
    head, tail = os.path.split(input_file)
    # class variables -----------------------------------------------------
    self.file_location = head  #-------------------------------------------- input file path
    self.file_name, self.file_extension = os.path.splitext(
        tail)  #--------- input file name and extension
    self.file_hash_value = None  #------------------------------------------ hash value of the input file (for change in file monitoring)
    self.sentence_configuration = {
    }  #------------------------------------- sentence number to file pointer offset map (starting with sentence 0)
    # per-annotation distribution maps, filled during analysis ------------
    self.token_distribution_map = {}
    self.lemma_distribution_map = {}
    self.gpos_distribution_map = {}
    self.pos_distribution_map = {}
    self.morphology_distribution_map = {}
    self.relation_distribution_map = {}
    # select the metadata file ... either the provided path or local path -
    skip_loading = False
    try:
        utils.doesTheFileExist(file_path=meta_file)
    except exp.newFileIOError:
        # parent directory exists but no metadata file yet: analyze fresh
        skip_loading = True
    except (TypeError, ValueError, exp.invalidPathIOError):
        # unusable path given: fall back to a default path next to input
        meta_file = self.file_location + '/' + self.file_name + utils.META_EXTENSION
    except exp.pathTypeIOError:
        # a directory was given: place the metadata file inside it
        meta_file = meta_file + '/' + self.file_name + utils.META_EXTENSION
    # analyze or load metadata --------------------------------------------
    try:
        if skip_loading:
            raise exp.skipStepWarning
        # attempting to load metadata -------------------------------------
        self.load_metadate(
            meta_file=meta_file,
            current_hash=utils.generate_hash(source_file=input_file))
    except Warning:
        # loading skipped (warning-class signal): run a full analysis
        print >> sys.stderr, 'WARNING: running analysis ...'
        print >> sys.stderr, 'Metadata: {}'.format(meta_file)
    except StandardError as e:
        # loading failed (corrupt/stale metadata): re-analyze from scratch
        print >> sys.stderr, 'WARNING: Failed loading metadata file ... running analysis.'
        print >> sys.stderr, e
    # running analysis ----------------------------------------------------
    # NOTE(review): analyze() runs even when the metadata loaded
    # successfully — confirm load_metadate() makes this step cheap.
    self.analyze(in_file=input_file)
    # saving metadata -----------------------------------------------------
    # NOTE(review): 'cnnot' typo in the message below — left unchanged
    # in this documentation-only edit.
    if save_meta == None:
        raise exp.noneValueError('save_meta option flag cnnot be "None"')
    elif not isinstance(save_meta, bool):
        raise TypeError(
            'save_meta option flag must be bool type.\nFound: <{}>'.format(
                type(save_meta)))
    elif save_meta:
        try:
            self.save_metadata(meta_file=meta_file)
        except Exception as e:
            # best-effort save: a failure only costs a future re-analysis
            print >> sys.stderr, 'WARNING: Metadata file not saved... re-analysis will be needed next time.'
            print >> sys.stderr, e