def punctuation_training(args, loss_fn):
    """Train the model for the punctuation task."""
    text_encoder_type = _text_encoder_type(args.text_encoder)

    train_dl = dataloader.AlignedDataloader(
        file_name_input=args.src_train,
        file_name_target=args.target_train,
        vocab_size=args.vocab_size,
        text_encoder_type=text_encoder_type,
        max_seq_length=args.max_seq_length,
        cache_dir=_cache_dir(args),
    )
    valid_dl = dataloader.AlignedDataloader(
        file_name_input=args.src_valid,
        file_name_target=args.target_valid,
        vocab_size=args.vocab_size,
        text_encoder_type=text_encoder_type,
        encoder_input=train_dl.encoder_input,
        encoder_target=train_dl.encoder_target,
        max_seq_length=args.max_seq_length,
        cache_dir=_cache_dir(args),
    )
    model = models.find(args, train_dl.encoder_input.vocab_size,
                        train_dl.encoder_target.vocab_size)
    optim = _create_optimizer(model.embedding_size, args)
    training = Training(model, train_dl, valid_dl, [base.Metrics.BLEU])
    training.run(
        loss_fn,
        optim,
        batch_size=args.batch_size,
        num_epoch=args.epochs,
        checkpoint=args.checkpoint,
    )
def translate(args):
    """Translate user's input."""
    # Rebuild the training dataloader only to recover the text encoders fit on the training data.
    text_encoder_type = TextEncoderType(args.text_encoder)
    train_dl = dataloader.AlignedDataloader(
        file_name_input="data/splitted_data/sorted_train_token.en",
        file_name_target=
        "data/splitted_data/sorted_nopunctuation_lowercase_train_token.fr",
        vocab_size=args.vocab_size,
        text_encoder_type=text_encoder_type,
    )
    encoder_input = train_dl.encoder_input
    encoder_target = train_dl.encoder_target

    # Load the model.
    model = models.find(args, encoder_input.vocab_size,
                        encoder_target.vocab_size)
    model.load(str(args.checkpoint))

    # Create the message to translate.
    message = preprocessing.add_start_end_token([args.message])[0]
    x = tf.convert_to_tensor([train_dl.encoder_input.encode(message)])

    # Translate the message.
    translated = model.translate(x, encoder_target, args.max_seq_length)
    translated_message = model.predictions(translated,
                                           encoder_target,
                                           logit=False)
    logger.info(f"Translation is {translated_message}")
def pretraining(args, loss_fn):
    """Pretraining the model."""
    text_encoder_type = _text_encoder_type(args.text_encoder)

    train_dl = dataloader.UnalignedDataloader(
        file_name=args.src_train,
        vocab_size=args.vocab_size,
        text_encoder_type=text_encoder_type,
        max_seq_length=args.max_seq_length,
        cache_dir=_cache_dir(args),
    )
    valid_dl = dataloader.UnalignedDataloader(
        file_name=args.src_valid,
        vocab_size=args.vocab_size,
        text_encoder_type=text_encoder_type,
        encoder=train_dl.encoder,
        max_seq_length=args.max_seq_length,
        cache_dir=_cache_dir(args),
    )
    model = models.find(args, train_dl.encoder.vocab_size,
                        train_dl.encoder.vocab_size)
    optim = _create_optimizer(model.embedding_size, args)
    pretraining = Pretraining(model, train_dl, valid_dl)
    pretraining.run(
        loss_fn,
        optim,
        batch_size=args.batch_size,
        num_epoch=args.epochs,
        checkpoint=args.checkpoint,
    )
Example #4
def build(term, focusA):
    """Render the class-search page for one term/focus area and write it to disk."""
    data = find(term, focusA, True)
    template = PageTemplateFile("template.txt", "")
    host = "http://classsearch.bme.jhu.edu/"
    page = template(Term=term, FocusArea=focusA, all=data, host=host,
                    time=time.strftime("%c"))
    # Write the rendered page to ./public_html/<term>/<focusA>.html.
    with open("./public_html/{0}/{1}.html".format(term, focusA), 'w') as out:
        out.writelines(page)
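# Hypothetical usage (the term and focus-area values are made up, not taken from the
# original source); this call would write ./public_html/Fall2019/Imaging.html:
# build("Fall2019", "Imaging")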
def default_training(args, loss_fn):
    """Train the model."""
    text_encoder_type = _text_encoder_type(args.text_encoder)

    if args.pretrained is not None:
        pretrained_dl = dataloader.UnalignedDataloader(
            file_name=args.pretrained,
            vocab_size=args.vocab_size,
            text_encoder_type=text_encoder_type,
            max_seq_length=args.max_seq_length,
            cache_dir=_cache_dir(args),
        )
        train_dl = dataloader.AlignedDataloader(
            file_name_input=args.src_train,
            file_name_target=args.target_train,
            text_encoder_type=text_encoder_type,
            vocab_size=args.vocab_size,
            encoder_input=pretrained_dl.encoder,
            max_seq_length=args.max_seq_length,
            cache_dir=_cache_dir(args),
        )
    else:
        train_dl = dataloader.AlignedDataloader(
            file_name_input=args.src_train,
            file_name_target=args.target_train,
            vocab_size=args.vocab_size,
            text_encoder_type=text_encoder_type,
            max_seq_length=args.max_seq_length,
            cache_dir=_cache_dir(args),
        )
    valid_dl = dataloader.AlignedDataloader(
        file_name_input=args.src_valid,
        file_name_target=args.target_valid,
        vocab_size=args.vocab_size,
        text_encoder_type=text_encoder_type,
        encoder_input=train_dl.encoder_input,
        encoder_target=train_dl.encoder_target,
        max_seq_length=args.max_seq_length,
        cache_dir=_cache_dir(args),
    )
    logger.debug(f"valid target vocab size: {valid_dl.encoder_target.vocab_size}")
    logger.debug(f"valid input vocab size: {valid_dl.encoder_input.vocab_size}")
    logger.debug(f"train target vocab size: {train_dl.encoder_target.vocab_size}")
    logger.debug(f"train input vocab size: {train_dl.encoder_input.vocab_size}")
    model = models.find(args, train_dl.encoder_input.vocab_size,
                        train_dl.encoder_target.vocab_size)
    optim = _create_optimizer(model.embedding_size, args)
    training = Training(model, train_dl, valid_dl, [base.Metrics.BLEU])
    training.run(
        loss_fn,
        optim,
        batch_size=args.batch_size,
        num_epoch=args.epochs,
        checkpoint=args.checkpoint,
    )
def test(args, loss_fn):
    """Test the model."""
    text_encoder_type = _text_encoder_type(args.text_encoder)
    # Rebuild the training dataloader only to recover the text encoders fit on the training data.
    if args.pretrained is not None:
        pretrained_dl = dataloader.UnalignedDataloader(
            file_name=args.pretrained,
            vocab_size=args.vocab_size,
            text_encoder_type=text_encoder_type,
            max_seq_length=args.max_seq_length,
            cache_dir=_cache_dir(args),
        )
        train_dl = dataloader.AlignedDataloader(
            file_name_input=args.src_train,
            file_name_target=args.target_train,
            text_encoder_type=text_encoder_type,
            vocab_size=args.vocab_size,
            encoder_input=pretrained_dl.encoder,
            max_seq_length=args.max_seq_length,
            cache_dir=_cache_dir(args),
        )
    else:
        train_dl = dataloader.AlignedDataloader(
            file_name_input=args.src_train,
            file_name_target=args.target_train,
            vocab_size=args.vocab_size,
            text_encoder_type=text_encoder_type,
            max_seq_length=args.max_seq_length,
            cache_dir=_cache_dir(args),
        )
    test_dl = dataloader.AlignedDataloader(
        file_name_input="data/splitted_data/test/test_token10000.en",
        file_name_target="data/splitted_data/test/test_token10000.fr",
        vocab_size=args.vocab_size,
        encoder_input=train_dl.encoder_input,
        encoder_target=train_dl.encoder_target,
        text_encoder_type=text_encoder_type,
        max_seq_length=args.max_seq_length,
        cache_dir=_cache_dir(args),
    )
    model = models.find(args, train_dl.encoder_input.vocab_size,
                        train_dl.encoder_target.vocab_size)
    base.test(model, loss_fn, test_dl, args.batch_size, args.checkpoint)
Example #7
def predict(args):
    """Translate user's input."""
    # Rebuild the training dataloader only to recover the text encoder fit on the training data.
    print("Instanciating dataloader...")
    train_dl = UnalignedDataloader(
        file_name="data/splitted_english_data/sorted_clean_train.en",
        cache_dir=".cache/data/splitted_english_data/sorted_clean_train.en",
        text_encoder_type=TextEncoderType("subword"),
        vocab_size=8192,
    )
    encoder = train_dl.encoder
    print("Creating model...")

    # Load the model.
    model = models.find(args, encoder.vocab_size, encoder.vocab_size)
    model.load(str(args.checkpoint))

    # Create the message to translate.
    message = preprocessing.add_start_end_token([args.message])[0]
    x = tf.convert_to_tensor([encoder.encode(message)])
    pretraining.test(x, model, encoder)
Example #8
def addpost(username, title, tags, text, about):
    user = find(username)
    post = Node('Post',
                id=(str(uuid.uuid4())) + username,
                title=title,
                text=text,
                about=about,
                timestamp=timestamp(),
                date=date())
    rel = Relationship(user, 'PUBLISHED', post)
    graph.create(rel)

    tags = [x.strip() for x in tags.lower().split(',')]
    for name in set(tags):
        tag = Node('Tag', name=name)
        # MERGE only creates the Tag node if it does not already exist in the database.
        graph.merge(tag)

        rel = Relationship(tag, 'TAGGED', post)
        graph.create(rel)
Example #9
def generate_predictions(input_file_path: str, pred_file_path: str):
    """Generates predictions for the machine translation task (EN->FR).

    You are allowed to modify this function as needed, but once again, you cannot
    modify any other part of this file. We will be importing only this function
    in our final evaluation script. Since you will most definitely need to import
    modules for your code, you must import them inside the function itself.

    Args:
        input_file_path: the file path that contains the input data.
        pred_file_path: the file path where to store the predictions.

    Returns: None

    """
    logger.info(
        f"Generating predictions from {input_file_path} into {pred_file_path}")
    settings = BackTranslationPretrainedDemiBertTransformer()
    encoder_input, encoder_target = _load_encoders(settings)

    # Load the model.
    model = models.find(settings, encoder_input.vocab_size,
                        encoder_target.vocab_size)
    model.load(str(settings.checkpoint))

    dl = dataloader.UnalignedDataloader(
        file_name=input_file_path,
        vocab_size=settings.vocab_size,
        text_encoder_type=settings.text_encoder,
        max_seq_length=settings.max_seq_length,
        cache_dir=None,
        encoder=encoder_input,
    )

    predictions = _generate_predictions(model, dl, encoder_input,
                                        encoder_target, settings.batch_size)
    base.write_text(predictions, pred_file_path)
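# A minimal usage sketch for local checking; the input path appears elsewhere in this
# code, while the output path is illustrative only:
# generate_predictions("data/splitted_data/test/test_token10000.en", "predictions.fr")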
Example #10
def test_read(api):
    import codecs
    writer = codecs.open('cybozulive.txt', 'w+', encoding='utf-8')

    def printer(item):
        print("-" * 79, file=writer)
        print("%s (%s) %s - %s" % (item.title, item.id,
                                   item.updated, item.author), file=writer)
        print('    <%s>' % (item.link,), file=writer)
        if hasattr(item, 'when') and item.when:
            print("\n".join(["%s to %s" % w for w in item.when]), file=writer)
        if hasattr(item, 'who') and item.who:
            print("%s" % (item.who,), file=writer)
        if hasattr(item, 'group') and item.group:
            print("GROUP: %s, %s" % item.group[0], file=writer)
        if item.summary:
            print(item.summary, file=writer)

    #headlines = api.notification()
    #for item in headlines:
    #    printer(item)

    headlines = api.notification({'unconfirmed': True})
    for item in headlines:
        printer(item)
        ret = models.find(Notification, id=item.id)
        if ret is None:
            item.put()

    #headlines = api.notification({'category': 'MEMBER_LIST'})
    #for item in headlines:
    #    printer(item)

    tasks = api.task()
    for item in tasks:
        printer(item)

    tasks = api.task({'embed-comment': True})
    for item in tasks:
        printer(item)

    tasks = api.task({'embed-comment': True, 'group': '1:1'})
    for item in tasks:
        printer(item)

    events = api.schedule()
    for item in events:
        printer(item)

    events = api.schedule({'embed-comment': True})
    for item in events:
        printer(item)

    topics = api.board('1:1')
    assert topics is None

    topics = api.board('1:1', {'embed-comment': True})
    assert topics is None

    #groups = api.group()
    #for item in groups:
    #    printer(item)

    # under construction.
    #comments = api.comment()
    #for item in comments:
    #    printer(item)

    #comments = api.comment({'item': 'GROUP,1:1,BOARD,1:1'})
    #for item in comments:
    #    printer(item)

    writer.close()
def back_translation_training(args, loss_fn):
    """Train the model with back translation."""
    text_encoder_type = _text_encoder_type(args.text_encoder)

    logger.info("Creating training unaligned dataloader ...")
    train_dl = dataloader.UnalignedDataloader(
        "data/unaligned.en",
        args.vocab_size,
        text_encoder_type=text_encoder_type,
        max_seq_length=args.max_seq_length,
    )
    logger.info(f"English vocab size: {train_dl.encoder.vocab_size}")

    logger.info("Creating reversed training unaligned dataloader ...")
    train_dl_reverse = dataloader.UnalignedDataloader(
        "data/unaligned.fr",
        args.vocab_size,
        text_encoder_type=text_encoder_type,
        max_seq_length=args.max_seq_length,
    )
    logger.info(f"French vocab size: {train_dl_reverse.encoder.vocab_size}")

    logger.info("Creating training aligned dataloader ...")
    aligned_train_dl = dataloader.AlignedDataloader(
        file_name_input="data/splitted_data/sorted_train_token.en",
        file_name_target=
        "data/splitted_data/sorted_nopunctuation_lowercase_val_token.fr",
        vocab_size=args.vocab_size,
        encoder_input=train_dl.encoder,
        encoder_target=train_dl_reverse.encoder,
        text_encoder_type=text_encoder_type,
        max_seq_length=args.max_seq_length,
        cache_dir=_cache_dir(args),
    )

    logger.info("Creating reversed training aligned dataloader ...")
    aligned_train_dl_reverse = dataloader.AlignedDataloader(
        file_name_input=
        "data/splitted_data/sorted_nopunctuation_lowercase_val_token.fr",
        file_name_target="data/splitted_data/sorted_train_token.en",
        vocab_size=args.vocab_size,
        encoder_input=aligned_train_dl.encoder_target,
        encoder_target=aligned_train_dl.encoder_input,
        text_encoder_type=text_encoder_type,
        max_seq_length=args.max_seq_length,
        cache_dir=_cache_dir(args),
    )

    logger.info("Creating valid aligned dataloader ...")
    aligned_valid_dl = dataloader.AlignedDataloader(
        file_name_input="data/splitted_data/sorted_val_token.en",
        file_name_target=
        "data/splitted_data/sorted_nopunctuation_lowercase_val_token.fr",
        vocab_size=args.vocab_size,
        encoder_input=aligned_train_dl.encoder_input,
        encoder_target=aligned_train_dl.encoder_target,
        text_encoder_type=text_encoder_type,
        max_seq_length=args.max_seq_length,
        cache_dir=_cache_dir(args),
    )

    logger.info("Creating reversed valid aligned dataloader ...")
    aligned_valid_dl_reverse = dataloader.AlignedDataloader(
        file_name_input=
        "data/splitted_data/sorted_nopunctuation_lowercase_val_token.frs",
        file_name_target="data/splitted_data/sorted_val_token.en",
        vocab_size=args.vocab_size,
        encoder_input=aligned_train_dl_reverse.encoder_input,
        encoder_target=aligned_train_dl_reverse.encoder_target,
        text_encoder_type=text_encoder_type,
        max_seq_length=args.max_seq_length,
        cache_dir=_cache_dir(args),
    )

    model = models.find(
        args,
        aligned_train_dl.encoder_input.vocab_size,
        aligned_train_dl.encoder_target.vocab_size,
    )

    optim = _create_optimizer(model.embedding_size, args)
    model_reverse = models.find(
        args,
        aligned_train_dl_reverse.encoder_input.vocab_size,
        aligned_train_dl_reverse.encoder_target.vocab_size,
    )

    training = BackTranslationTraining(
        model,
        model_reverse,
        train_dl,
        train_dl_reverse,
        aligned_train_dl,
        aligned_train_dl_reverse,
        aligned_valid_dl,
        aligned_valid_dl_reverse,
    )

    training.run(
        loss_fn,
        optim,
        batch_size=args.batch_size,
        num_epoch=args.epochs,
        checkpoint=args.checkpoint,
    )
Example #12
def reset(self):
    self.model = models.find(self.model_cfg.name)(self.model_cfg, self.dataset.cfg,
                                                  self.run_cfg)
    self.start_epoch = self.model.load(self.args.test_epoch)
    self.logger = self._get_logger()
    self.show_cfgs()
Example #13
def like_post(username, post_id):
    user = find(username)
    post = graph.find_one('Post', 'id', post_id)
    graph.merge(Relationship(user, 'LIKED', post))
Example #14
# predict: greedily schedule the task graph on `core` identical processors
core = 1
queue = []                                  # tasks whose dependencies are all satisfied
running = [-1 for i in range(core)]         # task assigned to each core (-1 means idle)
runtime = [0 for i in range(core)]          # remaining runtime of the task on each core
incnt = [len(nodein[i]) for i in range(n)]  # number of unresolved dependencies per task
for i in range(n):
    if incnt[i] == 0:
        queue.append(i)
time = 0
while (len(queue) > 0):
    # add task: assign ready tasks to idle cores
    for i in range(core):
        if running[i] == -1 and len(queue) > 0:
            node = queue[0]
            t = models.find(nodename[node])
            queue.pop(0)
            running[i] = node
            runtime[i] = models.get(t, parameter[node], device)
    # run task: advance time by the shortest remaining runtime
    tmp = []
    for i in range(core):
        if running[i] != -1:
            tmp.append(runtime[i])
    t = min(tmp)
    time += t
    for i in range(core):
        if running[i] != -1:
            runtime[i] -= t
    for i in range(core):
        if running[i] != -1 and runtime[i] == 0:
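            # The original snippet is truncated here. A minimal sketch of the step that
            # usually follows, assuming a `nodeout` successor list (the counterpart of
            # `nodein`, not shown above): free the core and enqueue every successor whose
            # unresolved-dependency count drops to zero.
            done = running[i]
            running[i] = -1
            for succ in nodeout[done]:  # `nodeout` is an assumption, not in the original
                incnt[succ] -= 1
                if incnt[succ] == 0:
                    queue.append(succ)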
Example #15
    if not args:
        raise SystemExit("No target was given.")
    target = args[0]
    try:
        ck = parser.get(target, "consumer_key")
        secret = parser.get(target, "consumer_secret")
        if not (ck and secret):
            msg = ("No consumer token was found.", "Check 'consumer_key' and 'consumer_secret' on %s." % (target))
            raise SystemExit("\n".join(msg))
    except ConfigParser.NoOptionError as e:
        raise SystemExit(str(e))
    consumer_token = (ck, secret)

    sample_user = "******"
    API = load_object(SERVICE[target])
    ret = models.find(models.AccessToken, service_provider_name=target, user_name=sample_user)
    if ret:
        access_token = (ret.oauth_token_key, ret.oauth_token_secret)
        client = create_client(consumer_token, access_token)
        api = API(client)
        if target in SERVICE_SAMPLES:
            run = load_object(SERVICE_SAMPLES[target])
            run(api)
        else:
            logging.warning("No sample was found for %s." % (target,))
    else:
        client = create_client(consumer_token)
        api = API(client)
        access_token = api.initialize()
        if access_token:
            assert len(access_token) == 2