Example #1
File: dl-conv.py Project: cerisara/deepnl
def create_trainer(args, converter, labels):
    """
    Creates or loads a neural network according to the specified args.
    :param labels: list of labels.
    """

    logger = logging.getLogger("Logger")

    if args.load:
        logger.info("Loading provided network...")
        trainer = ConvTrainer.load(args.load)
        # change learning rate
        trainer.learning_rate = args.learning_rate
        trainer.threads = args.threads
    else:
        logger.info("Creating new network...")
        # sum the number of features in all extractors' tables
        feat_size = converter.size()
        pool_size = args.window
        nn = ConvolutionalNetwork(feat_size * pool_size, args.hidden,
                                  args.hidden2, len(labels), pool_size)
        options = {
            "learning_rate": args.learning_rate,
            "verbose": args.verbose,
            "left_context": args.window / 2,
            "right_context": args.window / 2,
        }
        trainer = ConvTrainer(nn, converter, labels, options)

    trainer.saver = saver(args.model, args.vectors, args.variant)

    logger.info("... with the following parameters:")
    logger.info(trainer.nn.description())

    return trainer
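
A note on the sizing in this example: the first argument to ConvolutionalNetwork
is the per-token feature count times the pooled window, and the left/right
context passed in the options is half the window on each side. A self-contained
sketch of that arithmetic (feat_size is a made-up stand-in for converter.size(),
not a value from the project):

# Illustrative only: feat_size is a placeholder assumption.
window = 5                            # args.window
feat_size = 55                        # hypothetical converter.size()
pool_size = window                    # this example pools over the whole window
input_size = feat_size * pool_size    # 275 inputs to ConvolutionalNetwork
left = right = window // 2            # context of 2 tokens on each side
print(input_size, left, right)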
Example #2
def create_trainer(args, converter, labels):
    """
    Creates or loads a neural network according to the specified args.
    :param labels: dict of labels.
    """

    logger = logging.getLogger("Logger")

    if args.load:
        logger.info("Loading provided network...")
        trainer = ConvTrainer.load(args.load)
        trainer.learning_rate = args.learning_rate
        trainer.threads = args.threads
    else:
        logger.info('Creating new network...')
        trainer = ConvTrainer(converter, args.learning_rate, args.window // 2,
                              args.window // 2, args.hidden, labels,
                              args.verbose)

    trainer.saver = saver(args.model, args.output)

    logger.info("... with the following parameters:")
    logger.info(trainer.nn.description())

    return trainer
Example #3
File: dl-conv.py Project: Raysor/deepnl
def create_trainer(args, converter, labels):
    """
    Creates or loads a neural network according to the specified args.
    :param labels: dict of labels.
    """

    logger = logging.getLogger("Logger")

    if args.load:
        logger.info("Loading provided network...")
        trainer = ConvTrainer.load(args.load)
        trainer.learning_rate = args.learning_rate
        trainer.threads = args.threads
    else:
        logger.info('Creating new network...')
        trainer = ConvTrainer(converter, args.learning_rate,
                              args.window // 2, args.window // 2,
                              args.hidden, labels, args.verbose)

    trainer.saver = saver(args.model, args.output)

    logger.info("... with the following parameters:")
    logger.info(trainer.nn.description())
    
    return trainer
Example #4
def create_trainer(args, converter, labels):
    """
    Creates or loads a neural network according to the specified args.
    :param labels: list of labels.
    """

    logger = logging.getLogger("Logger")

    if args.load:
        logger.info("Loading provided network...")
        trainer = ConvTrainer.load(args.load)
        # change learning rate
        trainer.learning_rate = args.learning_rate
        trainer.threads = args.threads
    else:
        logger.info('Creating new network...')
        # sum the number of features in all extractors' tables
        feat_size = converter.size()
        pool_size = args.window * 2 + 1
        nn = ConvolutionalNetwork(feat_size * pool_size, args.hidden,
                                  args.hidden2, len(labels), pool_size)
        options = {
            'learning_rate': args.learning_rate,
            'eps': args.eps,
            'verbose': args.verbose,
            'left_context': args.window,
            'right_context': args.window
        }
        trainer = ConvTrainer(nn, converter, labels, options)

    trainer.saver = saver(args.model, args.vectors, args.variant)

    logger.info("... with the following parameters:")
    logger.info(trainer.nn.description())

    return trainer
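
Example #4 interprets args.window differently from Example #1: here it counts
tokens on each side of the target, so the pooled span is 2 * window + 1 tokens
and left/right context are both the full window. A runnable comparison sketch
(feat_size is again a placeholder, not a project value):

window = 5
feat_size = 55                        # placeholder for converter.size()
pool_ex1 = window                     # Example #1: 5 tokens pooled
pool_ex4 = window * 2 + 1             # Example #4: 11 tokens pooled
print(feat_size * pool_ex1, feat_size * pool_ex4)   # 275 vs. 605 network inputs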
Example #5
File: dl-conv.py Project: ravi-ojus/deepnl
# Imports needed by this excerpt; ClassifyReader, Converter, Embeddings, the
# extractor classes, saver and ConvTrainer come from the deepnl package.
import os
import logging
import argparse
import numpy as np
from configparser import ConfigParser

def main():

    # set the seed for replicability
    np.random.seed(42)  # DEBUG

    defaults = {}

    parser = argparse.ArgumentParser(description="Convolutional network classifier.")

    parser.add_argument("-c", "--config", dest="config_file", help="Specify config file", metavar="FILE")

    # args, remaining_argv = parser.parse_known_args()

    # if args.config_file:
    #     config = ConfigParser.SafeConfigParser()
    #     config.read([args.config_file])
    #     defaults = dict(config.items('Defaults'))

    # parser.set_defaults(**defaults)

    parser.add_argument("model", type=str, help="Model file to train/use.")

    # training options
    train = parser.add_argument_group("Train")

    train.add_argument("-t", "--train", type=str, default=None, help="File with annotated data for training.")

    train.add_argument("-w", "--window", type=int, default=5, help="Size of the word window (default 5)")
    train.add_argument(
        "-s",
        "--embeddings-size",
        type=int,
        default=50,
        help="Number of features per word (default 50)",
        dest="embeddings_size",
    )
    train.add_argument(
        "-e", "--epochs", type=int, default=100, help="Number of training epochs (default 100)", dest="iterations"
    )
    train.add_argument(
        "-l",
        "--learning_rate",
        type=float,
        default=0.001,
        help="Learning rate for network weights (default 0.001)",
        dest="learning_rate",
    )
    train.add_argument("-n", "--hidden", type=int, default=200, help="Number of hidden neurons (default 200)")
    train.add_argument("-n2", "--hidden2", type=int, default=200, help="Number of hidden neurons (default 200)")

    # Extractors:
    extractors = parser.add_argument_group("Extractors")
    extractors.add_argument(
        "--caps",
        const=5,
        nargs="?",
        type=int,
        default=None,
        help="Include capitalization features. Optionally, supply the number of features (default 5)",
    )
    extractors.add_argument(
        "--suffix",
        const=5,
        nargs="?",
        type=int,
        default=None,
        help="Include suffix features. Optionally, supply the number of features (default 5)",
    )
    extractors.add_argument("--suffixes", type=str, default="", help="Load suffixes from this file")
    extractors.add_argument(
        "--prefix",
        const=0,
        nargs="?",
        type=int,
        default=None,
        help="Include prefix features. Optionally, " "supply the number of features (default 0)",
    )
    extractors.add_argument("--prefixes", type=str, default="", help="Load prefixes from this file")
    # Embeddings
    embeddings = parser.add_argument_group("Embeddings")
    embeddings.add_argument("--vocab", type=str, default=None, help="Vocabulary file, either read or created")
    embeddings.add_argument("--vectors", type=str, default=None, help="Embeddings file, either read or created")
    embeddings.add_argument(
        "--min-occurr", type=int, default=3, help="Minimum occurrences for inclusion in vocabulary", dest="minOccurr"
    )
    embeddings.add_argument("--load", type=str, default=None, help="Load previously saved model")
    embeddings.add_argument(
        "--variant", type=str, default=None, help='Either "senna" (default), "polyglot" or "word2vec".'
    )

    # common
    parser.add_argument("--threads", type=int, default=1, help="Number of threads (default 1)")
    parser.add_argument("-v", "--verbose", help="Verbose mode", action="store_true")

    # Use this for obtaining defaults from config file:
    # args = arguments.get_args()
    args = parser.parse_args()

    log_format = "%(message)s"
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(format=log_format, level=log_level)
    logger = logging.getLogger("Logger")

    config = ConfigParser()
    if args.config_file:
        config.read(args.config_file)

    # merge args with config

    if args.train:
        reader = ClassifyReader()
        # a generator (can be iterated several times)
        sentences = reader.read(args.train)

        if args.vocab and os.path.exists(args.vocab):
            # start with the given vocabulary
            base_vocab = reader.load_vocabulary(args.vocab)
            if args.vectors and os.path.exists(args.vectors):
                # load vectors
                embeddings = Embeddings(vectors=args.vectors, vocab=base_vocab, variant=args.variant)
            else:
                # create vectors
                embeddings = Embeddings(args.embeddings_size, vocab=base_vocab, variant=args.variant)
            # collect words from the corpus
            # build vocabulary
            vocab, bigrams, trigrams = reader.create_vocabulary(
                sentences,
                # size=args.vocab_size,
                min_occurrences=args.minOccurr,
            )
            # add them to the given vocabulary
            embeddings.merge(vocab)
            logger.info("Overriding vocabulary in %s" % args.vocab)
            embeddings.save_vocabulary(args.vocab)

        elif args.variant == "word2vec":
            if args.vectors and os.path.exists(args.vectors):
                embeddings = Embeddings(vectors=args.vectors, variant=args.variant)
                vocab, bigrams, trigrams = reader.create_vocabulary(
                    sentences,
                    # args.vocab_size,
                    min_occurrences=args.minOccurr,
                )
                embeddings.merge(vocab)
            else:
                embeddings = Embeddings(vectors=args.vectors, variant=args.variant)
            if args.vocab:
                logger.info("Creating vocabulary in %s" % args.vocab)
                embeddings.save_vocabulary(args.vocab)

        elif args.vocab:
            if not args.vectors:
                logger.error("No --vectors specified")
                return
            embeddings = Embeddings(args.embeddings_size, args.vocab, args.vectors, variant=args.variant)
            logger.info("Creating vocabulary in %s" % args.vocab)
            embeddings.save_vocabulary(args.vocab)

        else:
            # build vocabulary and tag set
            vocab, bigrams, trigrams = reader.create_vocabulary(
                sentences,
                # args.vocab_size,
                min_occurrences=args.minOccurr,
            )
            logger.info("Creating vocabulary in %s" % args.vocab)
            embeddings.save_vocabulary(args.vocab)
            logger.info("Creating word embeddings")
            embeddings = Embeddings(args.embeddings_size, vocab=vocab, variant=args.variant)

        converter = Converter()
        converter.add(embeddings)

        if args.caps:
            logger.info("Creating capitalization features...")
            converter.add(CapsExtractor(args.caps))

        if os.path.exists(args.suffixes):
            logger.info("Loading suffix list...")
            extractor = SuffixExtractor.create(args.suffix, args.suffixes)
            converter.add(extractor)
        elif args.suffix:
            logger.info("Creating suffix list...")
            # collect the forms
            words = (tok[0] for sent in sentences for tok in sent)
            extractor = SuffixExtractor(args.suffix, args.suffixes, words)
            converter.add(extractor)
            if args.suffixes:
                logger.info("Saving suffix list to: %s", args.suffixes)
                extractor.write(args.suffixes)
        if os.path.exists(args.prefixes):
            logger.info("Loading prefix list...")
            extractor = PrefixExtractor.create(args.prefix, args.prefixes)
            converter.add(extractor)
        elif args.prefix:
            logger.info("Creating prefix list...")
            # collect the forms
            words = (tok[0] for sent in sentences for tok in sent)
            extractor = PrefixExtractor(args.prefix, args.prefixes, words)
            converter.add(extractor)
            if args.prefixes:
                logger.info("Saving prefix list to: %s", args.prefixes)
                extractor.write(args.prefixes)

        # labels from all examples
        examples = []
        for example in sentences:
            examples.append(converter.convert(example))
        # assign index to labels
        labels = reader.polarities
        labels_index = {c: i for i, c in enumerate(set(labels))}
        labels_ids = [labels_index[i] for i in labels]

        trainer = create_trainer(args, converter, labels_index)
        logger.info("Starting training with %d examples" % len(examples))

        report_frequency = max(args.iterations / 200, 1)
        report_frequency = 1  # DEBUG
        trainer.train(examples, labels_ids, args.iterations, report_frequency, args.threads)

        logger.info("Saving trained model ...")
        trainer.saver(trainer)
        logger.info("... to %s" % args.model)

    else:
        # predict
        with open(args.model) as model_file:
            classifier = ConvTrainer.load(model_file)
        # NOTE: args.test assumes a --test option defined elsewhere in the
        # original file; it is not among the flags shown above.
        reader = ClassifyReader(args.test)

        for example in reader:
            # the original read a bare `text_field`; a reader attribute is
            # assumed here, matching reader.label_field below
            text = example[reader.text_field]
            input = classifier.converter.convert(text)
            example[reader.label_field] = classifier.nn.forward(input).argmax()
            print(example)
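
Going by the options defined above, a training invocation of this script could
look like the following; all file names are placeholders:

python dl-conv.py model.dnn --train train.tsv \
    --vocab vocab.txt --vectors vectors.txt \
    -w 5 -e 100 -l 0.001 -n 200 -n2 200 --caps -v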