Example #1
import logging
import os
from configparser import ConfigParser
from itertools import chain

import numpy as np

# TweetReader, Embeddings, Converter and create_trainer are assumed to come
# from the surrounding project; their module paths are not shown in this excerpt.


def sswe_trainer(model_parameters):
    # set the seed for replicability
    np.random.seed(42)
    # args = parser.parse_args()
    args = model_parameters
    log_format = '%(message)s'
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(format=log_format, level=log_level)
    logger = logging.getLogger("Logger")

    config = ConfigParser()
    if args.config_file:
        config.read(args.config_file)
    # merge args with config
    reader = TweetReader(text_field=args.textField,
                         label_field=args.tagField,
                         ngrams=args.ngrams)
    reader.read(args.train)
    vocab, bigrams, trigrams = reader.create_vocabulary(
        reader.sentences, args.vocab_size, min_occurrences=args.minOccurr)
    #print("length vocab")
    #print(len(vocab))
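    # Three initialization paths for the embeddings, depending on what is
    # already on disk (summary of the branches below):
    #   1. word2vec vectors file present: load it and merge the corpus vocabulary;
    #   2. a saved vocabulary present: reuse its base words, loading or creating vectors;
    #   3. neither present: create fresh embeddings from the corpus vocabulary.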
    if args.variant == 'word2vec' and os.path.exists(args.vectors):
        embeddings = Embeddings(vectors=args.vectors, variant=args.variant)
        embeddings.merge(vocab)
        logger.info("Saving vocabulary in %s" % args.vocab)
        embeddings.save_vocabulary(args.vocab)
    elif os.path.exists(args.vocab):
        # start with the given vocabulary
        b_vocab = reader.load_vocabulary(args.vocab)
        # keep only the base words, excluding the trailing bigram/trigram entries
        bound = len(b_vocab) - len(bigrams) - len(trigrams)
        base_vocab = b_vocab[:bound]
        if os.path.exists(args.vectors):
            # load embeddings
            embeddings = Embeddings(vectors=args.vectors,
                                    vocab=base_vocab,
                                    variant=args.variant)
        else:
            # create embeddings
            embeddings = Embeddings(args.embeddings_size,
                                    vocab=base_vocab,
                                    variant=args.variant)
            # add the ngrams from the corpus
            embeddings.merge(vocab)
            logger.info("Overriding vocabulary in %s" % args.vocab)
            embeddings.save_vocabulary(args.vocab)
    else:
        embeddings = Embeddings(args.embeddings_size,
                                vocab=vocab,
                                variant=args.variant)
        logger.info("Saving vocabulary in %s" % args.vocab)
        embeddings.save_vocabulary(args.vocab)

    # Assumes every bigram is a prefix of some trigram; otherwise a terminator
    # marker would be needed in the trie.
    trie = {}
    for ngram in chain(bigrams, trigrams):
        tmp = trie
        for w in ngram:
            tmp = tmp.setdefault(embeddings.dict[w], {})
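    # For illustration (token ids are assumed values): a bigram mapped to ids
    # (12, 7) and a trigram mapped to (12, 7, 30) build the nested dict
    # {12: {7: {30: {}}}}, so an ngram is matched by walking one dict level
    # per token id.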

    converter = Converter()
    converter.add(embeddings)

    trainer = create_trainer(args, converter)

    report_intervals = max(args.iterations // 200, 1)

    logger.info("Starting training")

    # a generator over converted sentences; with caching it can be iterated
    # several times without repeating the conversion
    converted_sentences = converter.generator(reader.sentences, cache=True)
    trainer.train(converted_sentences, reader.polarities, trie,
                  args.iterations, report_intervals)

    logger.info("Overriding vectors to %s" % args.vectors)
    embeddings.save_vectors(args.vectors, args.variant)
    if args.model:
        logger.info("Saving trained model to %s" % args.model)
        trainer.save(args.model)
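
A minimal invocation sketch, assuming sswe_trainer accepts an argparse-style namespace carrying the fields read above; every name and value here is a placeholder, not a documented default:

from argparse import Namespace

# Hypothetical settings; adjust paths, field names and sizes to your data.
params = Namespace(
    train='tweets.tsv', textField='text', tagField='label', ngrams=2,
    vocab='vocab.txt', vectors='vectors.txt', variant=None,
    vocab_size=100000, minOccurr=3, embeddings_size=50,
    iterations=1000, model='sswe.model', verbose=False, config_file=None)

sswe_trainer(params)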
Example #2
    log_format = '%(message)s'
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(format=log_format, level=log_level)
    logger = logging.getLogger("Logger")

    config = ConfigParser()
    if args.config_file:
        config.read(args.config_file)

    # merge args with config

    reader = TweetReader(text_field=args.textField, label_field=args.tagField, ngrams=args.ngrams)
    reader.read(args.train)
    vocab, bigrams, trigrams = reader.create_vocabulary(reader.sentences,
                                                        args.vocab_size,
                                                        min_occurrences=args.minOccurr)
    if args.variant == 'word2vec' and os.path.exists(args.vectors):
        embeddings = Embeddings(vectors=args.vectors, variant=args.variant)
        embeddings.merge(vocab)
        logger.info("Saving vocabulary in %s" % args.vocab)
        embeddings.save_vocabulary(args.vocab)
    elif os.path.exists(args.vocab):
        # start with the given vocabulary
        base_vocab = reader.load_vocabulary(args.vocab)
        if os.path.exists(args.vectors):
            # load embeddings
            embeddings = Embeddings(vectors=args.vectors, vocab=base_vocab,
                                    variant=args.variant)
        else:
            # create embeddings
            embeddings = Embeddings(args.embeddings_size, vocab=base_vocab,
                                    variant=args.variant)
Example #3
    logger = logging.getLogger("Logger")

    config = ConfigParser()
    if args.config_file:
        config.read(args.config_file)

    # merge args with config

    reader = TweetReader(args.ngrams)
    reader.read(args.train)
    loaded_vocab = False
    if args.vocab and os.path.exists(args.vocab):
        loaded_vocab = True
        vocab = reader.load_vocabulary(args.vocab)
    else:
        vocab = reader.create_vocabulary(reader.sentences)
    tokens = []
    # flatten the per-ngram vocabularies into a single token list
    for ngram_vocab in vocab:
        tokens.extend(ngram_vocab)
    embeddings = Embeddings(args.embeddings_size, vocab=tokens,
                            variant=args.variant)

    converter = Converter()
    converter.add_extractor(embeddings)

    trainer = create_trainer(args, converter)

    report_intervals = max(args.iterations // 200, 1)

    logger.info("Starting training")
Example #4
    args = parser.parse_args()

    log_format = '%(message)s'
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(format=log_format, level=log_level)
    logger = logging.getLogger("Logger")

    config = ConfigParser()
    if args.config_file:
        config.read(args.config_file)

    # merge args with config

    reader = TweetReader(args.ngrams)
    reader.read(args.train)
    vocab, bigrams, trigrams = reader.create_vocabulary(reader.sentences,
                                                        min_occurrences=2)
    if os.path.exists(args.vocab):
        # start with the given vocabulary
        base_vocab = reader.load_vocabulary(args.vocab)
        if os.path.exists(args.vectors):
            embeddings = Embeddings(vectors=args.vectors, vocab=base_vocab,
                                    variant=args.variant)
        else:
            embeddings = Embeddings(args.embeddings_size, vocab=base_vocab,
                                    variant=args.variant)
        # add the ngrams from the corpus
        embeddings.merge(vocab)
        logger.info("Overriding vocabulary in %s" % args.vocab)
        embeddings.save_vocabulary(args.vocab)
    elif args.variant == 'word2vec' and os.path.exists(args.vectors):
        embeddings = Embeddings(vectors=args.vectors,
                                variant=args.variant)
Example #5
    args = parser.parse_args()

    log_format = '%(message)s'
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(format=log_format, level=log_level)
    logger = logging.getLogger("Logger")

    config = ConfigParser()
    if args.config_file:
        config.read(args.config_file)

    # merge args with config

    reader = TweetReader(args.ngrams)
    reader.read(args.train)
    vocab, bigrams, trigrams = reader.create_vocabulary(reader.sentences,
                                                        min_occurrences=2)
    if os.path.exists(args.vocab):
        # start with the given vocabulary
        base_vocab = reader.load_vocabulary(args.vocab)
        if os.path.exists(args.vectors):
            embeddings = Embeddings(vectors=args.vectors,
                                    vocab=base_vocab,
                                    variant=args.variant)
        else:
            embeddings = Embeddings(args.embeddings_size,
                                    vocab=base_vocab,
                                    variant=args.variant)
        # add the ngrams from the corpus
        embeddings.merge(vocab)
        logger.info("Overriding vocabulary in %s" % args.vocab)
        embeddings.save_vocabulary(args.vocab)