def train_network(self):
    """Train the neural network configured on this instance.

    When ``self.retrain`` is set, reads the training and validation corpora
    from ``self.train_path`` and ``self.validation``; otherwise the model is
    constructed with ``None`` documents (presumably loading saved weights —
    TODO confirm against the model classes). Builds a globally-normalised
    model when ``self.global_norm`` is set, else a plain ``NNActions`` model.

    :return: the constructed (and possibly trained) model object
    :raises ValueError: if no training-corpus path was configured
    """
    logger = logging.getLogger('progress_logger')
    logger.info("Training neural network")
    if not self.train_path:
        # Guard clause with a specific exception type instead of bare
        # Exception; ValueError subclasses Exception, so existing callers
        # catching Exception are unaffected.
        raise ValueError("No path to training corpus provided")
    train_documents = None
    validation_documents = None
    if self.retrain:
        logger.info("Reading documents")
        train_documents = data.read_all(self.train_path)
        validation_documents = data.read_all(self.validation)
    logger.info("Started training")
    if self.global_norm:
        model = global_norm_nn.GlobalNormNN(
            train_documents, validation_documents, self.retrain,
            self.model_name, pretrained_base=self.pretrained_base)
    else:
        model = classification.NNActions(
            train_documents, validation_documents, self.retrain,
            self.model_name)
    return model
def complete_base():
    """Run the full baseline pipeline: read the test corpus, build the
    procedure from the module-level configuration, then predict and
    evaluate on the test set."""
    data.read_all(test_path)
    pipeline = BaseProcedure(
        train_path=train_path,
        token_window=token_window,
        retrain_rel=retrain_REL,
        retrain_dct=retrain_DCT,
        doc_time_path=DCT_model_name,
        rel_classifier_path=relation_model_name,
        greedy=greedy,
        transitive=transitive,
        linear=linear,
    )
    # Where the magic happens
    pipeline.predict(test_path)
    pipeline.evaluate(test_path)
def __init__(self, train_path="", validation_path="", retrain_rel=False,
             retrain_dct=False, rel_classifier_path="", doc_time_path="",
             token_window=30, greedy=False, transitive=False, linear=True):
    """Configure the procedure: build/load the doctime and relation models.

    :param train_path: Path to training corpus (not required if models
        don't need to be retrained)
    :param validation_path: If given, both models are evaluated on this
        corpus and the results are written next to the saved models
    :param retrain_rel: Retrain the relation classifier instead of loading it
    :param retrain_dct: Retrain the doctime classifier instead of loading it
    :param rel_classifier_path: Path to Binary relation classification (YES/NO)
    :param doc_time_path: Path to Doctime classifier
    :param token_window: Window in which candidates need to be generated
    :param greedy: TRUE: use greedy decision making on binary classifications.
        FALSE: use ILP inference
    :param transitive: Close data transitively before training and evaluation
    :param linear: Forwarded to the doctime trainer when retraining
        (presumably selects a linear model — TODO confirm in classification)
    """
    self.train_path = train_path
    self.transitive = transitive
    self.token_window = token_window
    self.greedy = greedy
    self.doctimepath = doc_time_path
    self.relpath = rel_classifier_path
    # Greedy and ILP-based annotators share the same interface; only the
    # decision strategy differs.
    if greedy:
        self.annotator = GreedyAnnotator(token_window=token_window)
    else:
        self.annotator = InferenceAnnotator(token_window=token_window,
                                            transitive=transitive)
    # Either retrain from the corpus or load a previously saved model.
    if retrain_dct:
        self.doc_time_model = self.train_doctime(doc_time_path, linear)
    else:
        self.doc_time_model = utils.load_model(doc_time_path)
    if retrain_rel:
        self.annotator.model = self.train_rel_classifier(
            rel_classifier_path)
    else:
        self.annotator.model = utils.load_model(rel_classifier_path)
    # evaluation: write per-model evaluation reports beside the saved models
    if validation_path:
        docs = data.read_all(validation_path, transitive=transitive)
        dct = os.path.join(utils.model_path, doc_time_path + "_eval.txt")
        rel = os.path.join(utils.model_path,
                           rel_classifier_path + "_eval.txt")
        # NOTE: 'file' shadows the Python 2 builtin of the same name.
        with open(dct, 'w+') as file:
            eval_str = self.doc_time_model.evaluate(docs)
            print(eval_str)
            file.write(eval_str)
        with open(rel, 'w+') as file:
            eval_str = self.annotator.model.evaluate(docs)
            print(eval_str)
            file.write(eval_str)
def train_rel_classifier(self, save_path, validation=""):
    """Train and persist the binary (YES/NO) relation classifier.

    :param save_path: Name/path under which the trained model is saved
    :param validation: Unused; kept for interface compatibility
    :return: the trained relation-classifier model
    :raises ValueError: if no training-corpus path was configured
    """
    logger = logging.getLogger('progress_logger')
    logger.info("Training relation classifier")
    if not self.train_path:
        # Specific exception type instead of bare Exception (still caught
        # by callers handling Exception).
        raise ValueError("No path to training corpus provided")
    logger.info("Reading documents")
    # Bug fix: the original sliced the corpus with [:50] — an apparent
    # debugging leftover; the sibling train_doctime() trains on the full
    # corpus, so this now does too.
    train_documents = data.read_all(self.train_path,
                                    transitive=self.transitive)
    logger.info("Started training")
    model = classification.train_relation_classifier(
        train_documents, self.token_window)
    utils.save_model(model, save_path)
    return model
def train_doctime(self, save_path, linear):
    """Train and persist the doctime classifier.

    :param save_path: Name under which the trained model is saved
    :param linear: Forwarded to the trainer (presumably selects a linear
        model — TODO confirm in classification)
    :return: the trained doctime model
    :raises ValueError: if no training-corpus path was configured
    """
    logger = logging.getLogger('progress_logger')
    logger.info("Training doctime classifier")
    if not self.train_path:
        # Specific exception type instead of bare Exception (still caught
        # by callers handling Exception).
        raise ValueError("No path to training corpus provided")
    logger.info("Reading documents")
    train_documents = data.read_all(self.train_path,
                                    transitive=self.transitive)
    logger.info("Started training")
    model = classification.train_doctime_classifier(train_documents,
                                                    linear=linear)
    utils.save_model(model, name=save_path)
    return model
data_path = os.path.abspath( ARGS.path ) # преобразование в абсолютный путь # os.path.join() - нужно использовать при назначении относительного пути. print('DATA PATH: ' + str(data_path)) logging.debug('First: {0.first} Last: {0.last}'.format( ARGS)) # дополнительные возможности форматирования строки. logging.debug('Data path: {0.path} ({1})'.format(ARGS, data_path)) logging.debug('Nodes: {0.nodes} Links: {0.links}'.format(ARGS)) logging.info('Test') logging.warning('Test') logging.error('Test') logging.critical('Test') Graph, Names = data.read_all(data_path, ARGS.nodes, ARGS.links) First = Names[ARGS.first]['number'] Last = Names[ARGS.last]['number'] for path in data.in_depth(Graph, First, Last): print(path) """ # Чтение данных в формате csv из менеджера контекста with open(nodes_path, "rt", encoding="utf-8") as src: rdr = csv.reader(src) for number, name in rdr: #Можно использовать data вместо number, name чтобы получить последовательность списков. print(number, name) """ # Присваиваю переменной значение открытия файла.
from __future__ import division
import math
import data
import oracle
import utils
"""
Script for testing attributes of the datasets
"""
# Corpora are loaded once at import time; paths come from the utils module.
dev = data.read_all(utils.dev)
train = data.read_all(utils.train)


def treeless(documents):
    """Print relation-structure statistics for *documents*.

    For every document, counts relations whose child has multiple parents
    ("treefull") and relations whose parent has multiple children
    ("treeless"), then prints both as fractions of the total relation
    count, followed by the total itself. Returns None.
    """
    treefull = 0
    # NOTE(review): 'all' shadows the builtin all(); 'treeless' shadows
    # this function's own name. Left unchanged to keep the code identical.
    all = 0
    treeless = 0
    for document in documents:
        relations = document.relation_mapping
        parents = [x for (x, _) in relations.keys()]
        children = [x for (_, x) in relations.keys()]
        # how many relations in total are there?
        all += len(children)
        # how many children are not unique = how many children have multiple parents
        treefull += len(children) - len(set(children))
        # how many parents are not unique = how many parents have multiple children
        treeless += len(parents) - len(set(parents))
    # True division is guaranteed by the __future__ import above.
    print(treeless / all, treefull / all, all)


def samepar_relations(documents):
    # Tail of a function whose definition starts before this chunk:
    # removes the entry 'ind' from the parallel lists 'indices' and
    # 'distribution' (same position in both), then returns None.
    x = indices.index(ind)
    del indices[x]
    del distribution[x]
    return None


def get_training_sequence(entities, arcs, doc):
    # Given entities and arcs, yield the sequence of configuration and actions needed to get from the intitial
    # configuration to the terminal one
    # Is used to determine the training sequence of a document
    configuration = Configuration(entities, doc)
    oracle = KnowingOracle(arcs)
    while not configuration.empty_buffer():
        function_string = oracle.next_step(configuration)
        # Deep-copy via pickle round-trip so the yielded snapshot is not
        # mutated by the getattr call below (protocol -1 = highest).
        conf_copy = cPickle.loads(cPickle.dumps(configuration, -1))
        yield (conf_copy, function_string)
        # applies function to configuration
        getattr(configuration, function_string)()


if __name__ == '__main__':
    # Test methods: the number of left/right arc actions in the training
    # sequence should equal the number of relations in the document.
    documents = read_all(utils.dev, transitive=False)
    for doc in documents:
        sequence = get_training_sequence(doc.get_entities(),
                                         doc.get_relations(), doc)
        # Should print equal amounts
        print(len(doc.get_relations()),
              len([x for x in sequence if x[1] in ["left_arc", "right_arc"]]))