Example #1
def main(file, interactive=False, debug=False):
    if file is not None:
        env = Environment()

        with open(file, "r") as f:
            program_str = f.read()

        l = Lexer(program_str)
        p = Parser(l)

        program = p.parse_program()

        if debug:
            print("PARSED PROGRAM")
            print("-" * 50)
            print(program)
            print("-" * 50)

        if len(p.errors) != 0:
            repl.print_parser_errors(p.errors)
            exit(0)

        evaluated = evaluator.eval(program, env)
        if evaluated is not None and (
                (not interactive and evaluated != NULL)
                or evaluated.typ == mobject.ERROR_OBJ):
            print(evaluated.inspect)
        if interactive:
            repl.start(env=env)
    else:
        repl.start()
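A minimal sketch of how such an entry point could be wired to a command line; the argparse flags below are illustrative assumptions, not part of the original project:

import argparse

# Hypothetical CLI wrapper around the main() defined above.
if __name__ == "__main__":
    ap = argparse.ArgumentParser(description="Run a Monkey source file or start a REPL")
    ap.add_argument("file", nargs="?", default=None, help="path to a source file")
    ap.add_argument("-i", "--interactive", action="store_true",
                    help="drop into the REPL after evaluating the file")
    ap.add_argument("-d", "--debug", action="store_true",
                    help="print the parsed program before evaluation")
    args = ap.parse_args()
    main(args.file, interactive=args.interactive, debug=args.debug)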
Example #2
def eval_test(input: str) -> MonkeyObject:
    l = lexer.Lexer(input)
    p = parser.Parser(l)
    program = p.parse_program()
    env = mobject.Environment()

    return evaluator.eval(program, env)
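A hedged usage sketch for this helper: a test that evaluates a snippet and checks the returned object, assuming integer results expose a value attribute as in the neighbouring examples:

def test_eval_integer_literal():
    # Evaluate a trivial program and inspect the resulting object.
    result = eval_test("5 + 5;")
    assert result.value == 10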
Example #3
def test_bang_operator(input_data, expected_val):
    lexer = Lexer(input_data)
    parser = Parser.new(lexer)
    program = parser.parse()
    check_parse_errors(parser)

    output = eval(program)
    assert output.value == expected_val
Example #4
def test_eval_integer_expression(input_data, expected_val):
    lexer = Lexer(input_data)
    parser = Parser.new(lexer)
    program = parser.parse()
    check_parse_errors(parser)

    output = eval(program)
    assert output.value == expected_val
Example #5
def main(input):
	tokens = lexer.get_tokens(input)

	parser_tree = parser.parse(tokens)

	if type(parser_tree) == str:
		return parser_tree
	else:
		return evaluator.eval(parser_tree)
Example #6
def test_if_else_expression(input_data, expected_output):
    lexer = Lexer(input_data)
    parser = Parser.new(lexer)
    program = parser.parse()
    check_parse_errors(parser)

    output = eval(program)
    if hasattr(output, "value"):
        assert output.value == expected_output
    else:
        assert str(output) == expected_output
Example #7
def load(filename):
    """Load scheme source file `filename`
    """
    # The parameter filename is of the form '"path"'
    filename = filename[1:-1]
    with open(filename) as file:
        input = file.read()
    result = []
    parseInput(input, 0, result)
    for exp in result:
        output = eval(exp, the_global_environment)
    return output
Example #8
def excute(self, env, name, value):
    if isinstance(name, t.List):  # Function definition
        fname = name[0]
        args = name[1:]
        func = t.Function(args, value, env, fname)
        env[fname] = func
        return t.Null()
    elif isinstance(name, t.Symbol):  # Value definition
        env[name] = ev.eval(value, env)
        return t.Null()
    else:
        raise SyntaxError('define')
Example #9
def __main__():
    sys.stdout.write("-> ")
    sys.stdout.flush()
    for line in sys.stdin:
        try:
            ast = parser.parse(line)
            result = evaluator.eval(ast, global_env)
            print("# " + str(result))
            sys.stdout.write("-> ")
            sys.stdout.flush()
        except Exception as e:
            print(e)
            sys.stdout.write("-> ")
            sys.stdout.flush()
Example #10
def repl(prompt='lispy> ', inport=p.InPort(sys.stdin), out=sys.stdout):
    """A prompt-read-eval-print loop.
    """
    sys.stderr.write("Lispy version 2.0\n")
    while True:
        try:
            if prompt:
                sys.stderr.write(prompt)
            x = p.parse(inport)
            if x is p.eof_object:
                return
            val = e.eval(x)
            if val is not None and out:
                print(to_string(val), file=out)
        except Exception as ex:
            print('{0}: {1}'.format(type(ex).__name__, ex))
            break
Example #11
def repl(prompt="> "):
    while True:
        program = ''
        try:
            program = input(prompt)
        except (KeyboardInterrupt, EOFError):
            print()
            return
        if len(program) > 0:
            val = None
            try:
                val = eval(parse(program))
            except NameError as e:
                print(e)
                continue
            if val is not None:
                print(sexpr(val))
Example #12
def start(env=None):
    if env is None:
        env = mobject.Environment()
    while True:
        print(PROMPT, end=" ")
        line = input()
        if not line:
            return

        l = lexer.Lexer(line)
        p = parser.Parser(l)

        program = p.parse_program()
        if len(p.errors) != 0:
            print_parser_errors(p.errors)
            continue

        evaluated = evaluator.eval(program, env)
        if evaluated is not None:
            print(evaluated.inspect)
Example #13
def build_cnn(lang, odir):
    '''Train, valid, test CNN
    
        lang: The language name
        odir: output directory of prediction results
    '''
    doc_idx = 2
    max_len = 40  # sequence length
    epochs = 10

    encode_dir = './data/encode/' + lang + '/'
    indices_dir = './data/indices/' + lang + '/'
    wt_dir = './resources/weight/'
    res_dir = './resources/classifier/cnn/'

    clf_path = res_dir + lang + '.clf'
    # don't reload classifier for debug usage

    # load embedding weights
    weights = np.load(wt_dir + lang + '.npy')

    # build model architecture
    text_input = Input(shape=(max_len, ), dtype='int32', name='input')
    embeds = Embedding(weights.shape[0],
                       weights.shape[1],
                       weights=[weights],
                       input_length=max_len,
                       trainable=True,
                       name='embedding')(text_input)
    # convolution
    conv3 = Conv1D(kernel_size=3, filters=100, padding='same',
                   name='conv3')(embeds)
    maxp3 = MaxPool1D()(conv3)
    conv4 = Conv1D(kernel_size=4, filters=100, padding='same',
                   name='conv4')(embeds)
    maxp4 = MaxPool1D()(conv4)
    conv5 = Conv1D(kernel_size=5, filters=100, padding='same',
                   name='conv5')(embeds)
    maxp5 = MaxPool1D()(conv5)
    # merge
    merge_convs = keras.layers.concatenate([maxp3, maxp4, maxp5], axis=-1)
    # flatten
    flat_l = Flatten()(merge_convs)
    # dense, according to kim'14 paper,
    # regularizer applies to the both kernel and bias
    dense_l = Dense(
        100,
        activation='softplus',
        name='dense',
        kernel_regularizer=keras.regularizers.l1_l2(0, 0.03),
        bias_regularizer=keras.regularizers.l1_l2(0, 0.03),
    )(flat_l)
    dp_l = Dropout(0.3, name='dropout')(dense_l)
    # predict, binary prediction
    predicts = Dense(1, activation='sigmoid', name='predict')(dp_l)
    # model
    model = Model(inputs=text_input, outputs=predicts)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    print(model.summary())

    best_valid_f1 = 0.0
    best_model = None

    for e in range(epochs):
        accuracy = 0.0
        loss = 0.0
        step = 1
        print('--------------Epoch: {}--------------'.format(e))

        # load training and batch dataset
        train_iter = evaluator.data_iter(indices_dir + 'train.tsv',
                                         batch_size=64)

        # train model
        for class_wt, x_train, y_train in train_iter:
            if len(np.unique(y_train)) == 1:
                continue

            tmp = model.train_on_batch([x_train],
                                       y_train,
                                       class_weight=class_wt)

            loss += tmp[0]
            loss_avg = loss / step
            accuracy += tmp[1]
            accuracy_avg = accuracy / step
            if step % 30 == 0:
                print('Step: {}'.format(step))
                print('\tLoss: {}. Accuracy: {}'.format(
                    loss_avg, accuracy_avg))
                print('--------------------------------------')
            step += 1

        # valid model to find the best model
        print('---------------Validation------------')
        valid_iter = evaluator.data_iter(indices_dir + 'valid.tsv',
                                         batch_size=64,
                                         if_shuffle=False)
        y_preds = []
        y_valids = []

        for _, x_valid, y_valid in valid_iter:
            tmp_preds = model.predict([x_valid])

            for item_tmp in tmp_preds:
                y_preds.append(round(item_tmp[0]))
            y_valids.extend(y_valid)

        valid_f1 = f1_score(
            y_true=y_valids,
            y_pred=y_preds,
            average='weighted',
        )
        print('Validation weighted f1 score: ' + str(valid_f1))

        if best_valid_f1 < valid_f1:
            best_valid_f1 = valid_f1
            best_model = model

            pickle.dump(best_model, open(clf_path, 'wb'))

            # test model
            print('--------------Test--------------------')
            y_preds = []
            y_probs = []

            test_iter = evaluator.data_iter(indices_dir + 'test.tsv',
                                            batch_size=64,
                                            if_shuffle=False)

            for _, x_test, y_test in test_iter:
                tmp_preds = best_model.predict([x_test])
                for item_tmp in tmp_preds:
                    y_probs.append(item_tmp[0])
                    y_preds.append(int(round(item_tmp[0])))

            with open(odir + lang + '.tsv', 'w') as wfile:
                with open(indices_dir + 'test.tsv') as dfile:
                    wfile.write(dfile.readline().strip() +
                                '\tpred\tpred_prob\n')
                    for idx, line in enumerate(dfile):
                        wfile.write(line.strip() + '\t' + str(y_preds[idx]) +
                                    '\t' + str(y_probs[idx]) + '\n')

            # save the predicted results
            evaluator.eval(odir + lang + '.tsv', odir + lang + '.score')
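A hedged usage sketch; the language name and output directory below are illustrative assumptions, and the function itself relies on the ./data and ./resources layout hard-coded above:

import os

odir = './results/cnn/'      # hypothetical output directory
os.makedirs(odir, exist_ok=True)
build_cnn('English', odir)   # writes English.tsv and English.score into odir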
Example #14
def build_bert(lang, odir, params=None):
    '''Google Bert Classifier
        lang: The language name
        odir: output directory of prediction results
    '''
    if not params:
        params = dict()
        params['balance_ratio'] = 0.9
        params['freeze'] = False
        params['decay_rate'] = .001
        params['lr'] = 2e-5
        params['warm_steps'] = 100
        params['train_steps'] = 1000
        params['batch_size'] = 16
        params['balance'] = True

    split_dir = './data/split/' + lang + '/'

    if torch.cuda.is_available():
        device = str(get_freer_gpu())
        torch.cuda.set_device(device)
    else:
        device = torch.device('cpu')
    print(device)

    n_gpu = torch.cuda.device_count()
    print(torch.cuda.get_device_name())
    print('Number of GPUs: ', n_gpu)

    print('Loading Datasets and oversample training data...')
    train_df = pd.read_csv(split_dir + 'train.tsv', sep='\t', na_values='x')

    # oversample the minority class
    if params['balance']:
        label_count = Counter(train_df.label)
        for label_tmp in label_count:
            sample_num = label_count.most_common(
                1)[0][1] - label_count[label_tmp]
            if sample_num == 0:
                continue
            train_df = pd.concat([
                train_df, train_df[train_df.label == label_tmp].sample(
                    int(sample_num * params['balance_ratio']), replace=True)
            ])
        train_df = train_df.reset_index()  # to prevent index key error

    valid_df = pd.read_csv(split_dir + 'valid.tsv',
                           sep='\t',
                           na_values='x')
    test_df = pd.read_csv(split_dir + 'test.tsv', sep='\t', na_values='x')
    data_df = [train_df, valid_df, test_df]

    # We need to add special tokens at the beginning and end of each sentence for BERT to work properly
    for doc_df in data_df:
        doc_df.text = doc_df.text.apply(lambda x: '[CLS] ' + x + ' [SEP]')

    if lang == 'English':
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                  do_lower_case=True)
    elif lang == 'Chinese':
        tokenizer = BertTokenizer.from_pretrained('bert-base-chinese',
                                                  do_lower_case=True)
    else:
        tokenizer = BertTokenizer.from_pretrained(
            'bert-base-multilingual-uncased', do_lower_case=True)

    print('Padding Datasets...')
    for doc_df in data_df:
        doc_df.text = doc_df.text.apply(lambda x: tokenizer.tokenize(x))

    # convert to indices and pad the sequences
    max_len = 25
    for doc_df in data_df:
        doc_df.text = doc_df.text.apply(lambda x: pad_sequences(
            [tokenizer.convert_tokens_to_ids(x)], maxlen=max_len, dtype="long")
                                        [0])

    # create attention masks
    for doc_df in data_df:
        attention_masks = []
        for seq in doc_df.text:
            seq_mask = [float(idx > 0) for idx in seq]
            attention_masks.append(seq_mask)
        doc_df['masks'] = attention_masks

    # format train, valid, test
    train_inputs = torch.tensor(data_df[0].text)
    train_labels = torch.tensor(data_df[0].label)
    train_masks = torch.tensor(data_df[0].masks)
    valid_inputs = torch.tensor(data_df[1].text)
    valid_labels = torch.tensor(data_df[1].label)
    valid_masks = torch.tensor(data_df[1].masks)
    test_inputs = torch.tensor(data_df[2].text)
    test_labels = torch.tensor(data_df[2].label)
    test_masks = torch.tensor(data_df[2].masks)

    batch_size = params['batch_size']

    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=batch_size)
    valid_data = TensorDataset(valid_inputs, valid_masks, valid_labels)
    valid_sampler = SequentialSampler(valid_data)
    valid_dataloader = DataLoader(valid_data,
                                  sampler=valid_sampler,
                                  batch_size=batch_size)
    test_data = TensorDataset(test_inputs, test_masks, test_labels)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data,
                                 sampler=test_sampler,
                                 batch_size=batch_size)

    # load the pretrained model
    print('Loading Pretrained Model...')
    if lang == 'English':
        model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased', num_labels=2)
    elif lang == 'Chinese':
        model = BertForSequenceClassification.from_pretrained(
            'bert-base-chinese', num_labels=2)
    else:  # for Spanish, Italian, Portuguese and Polish
        model = BertForSequenceClassification.from_pretrained(
            'bert-base-multilingual-uncased', num_labels=2)
    model.to(device)

    # organize parameters
    param_optimizer = list(model.named_parameters())
    if params['freeze']:
        no_decay = ['bias', 'bert']  # , 'bert' freeze all bert parameters
    else:
        no_decay = ['bias']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        params['decay_rate']
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=params['lr'])
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=params['warm_steps'],
        num_training_steps=params['train_steps'])

    # Number of training epochs (authors recommend between 2 and 4)
    epochs = 10

    # Training
    print('Training the model...')
    best_valid_f1 = 0.0  # track the best validation f1 across epochs
    for _ in trange(epochs, desc='Epoch'):
        model.train()
        # Tracking variables
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0

        # train batch
        for step, batch in enumerate(train_dataloader):
            # Add batch to GPU
            batch = tuple(t.to(device) for t in batch)
            # Unpack the inputs from our dataloader
            b_input_ids, b_input_mask, b_labels = batch
            # Clear out the gradients (by default they accumulate)
            optimizer.zero_grad()
            # Forward pass
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)
            # backward pass
            outputs[0].backward()
            # Update parameters and take a step using the computed gradient
            optimizer.step()

            # Update tracking variables
            tr_loss += outputs[0].item()
            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1

        print("Train loss: {}".format(tr_loss / nb_tr_steps))
        '''Validation'''
        # Put model in evaluation mode to evaluate loss on the validation set
        model.eval()
        # tracking variables
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        # batch eval
        y_preds = []
        for batch in valid_dataloader:
            # Add batch to GPU
            batch = tuple(t.to(device) for t in batch)
            # Unpack the inputs from our dataloader
            b_input_ids, b_input_mask, b_labels = batch
            # Telling the model not to compute or store gradients, saving memory and speeding up validation
            with torch.no_grad():
                # Forward pass, calculate logit predictions
                outputs = model(b_input_ids,
                                token_type_ids=None,
                                attention_mask=b_input_mask)
            # Move logits and labels to CPU
            logits = outputs[0].detach().cpu().numpy()
            # record the prediction
            pred_flat = np.argmax(logits, axis=1).flatten()
            y_preds.extend(pred_flat)

            label_ids = b_labels.to('cpu').numpy()
            tmp_eval_accuracy = flat_accuracy(logits, label_ids)

            eval_accuracy += tmp_eval_accuracy
            nb_eval_steps += 1

        print("Validation Accuracy: {}".format(eval_accuracy / nb_eval_steps))

        # evaluate the validation f1 score
        f1_m_valid, f1_w_valid = flat_f1(y_preds, valid_df.label)
        if f1_m_valid > best_valid_f1:
            print('Test....')
            best_valid_f1 = f1_m_valid
            y_preds = []
            y_probs = []

            # test if valid gets better results
            for batch in test_dataloader:
                batch = tuple(t.to(device) for t in batch)
                b_input_ids, b_input_mask, b_labels = batch
                with torch.no_grad():
                    outputs = model(b_input_ids,
                                    token_type_ids=None,
                                    attention_mask=b_input_mask)
                probs = F.softmax(outputs[0], dim=1)
                probs = probs.detach().cpu().numpy()
                pred_flat = np.argmax(probs, axis=1).flatten()
                y_preds.extend(pred_flat)
                y_probs.extend([item[1] for item in probs])
            # save the predicted results
            with open(odir + lang + '.tsv', 'w') as wfile:
                with open(split_dir + 'test.tsv') as dfile:
                    wfile.write(dfile.readline().strip() +
                                '\tpred\tpred_prob\n')
                    for idx, line in enumerate(dfile):
                        wfile.write(line.strip() + '\t' + str(y_preds[idx]) +
                                    '\t' + str(y_probs[idx]) + '\n')

            # save the predicted results
            evaluator.eval(odir + lang + '.tsv', odir + lang + '.score')
Example #15
def excute(self, env, first, *args):
    last = t.Null()
    for item in itertools.chain([first], args):
        last = ev.eval(item, env)
    return last
Example #16
def excute(self, env, state, true, false):
    if ev.eval(state, env):
        return ev.eval(true, env)
    else:
        return ev.eval(false, env)
Example #17
def build_rnn(lang, odir):
    '''Train, valid, test RNN
    
        lang: The language name
        odir: output directory of prediction results
    '''
    doc_idx = 2
    rnn_size = 200
    max_len = 40  # sequence length
    epochs = 10

    encode_dir = './data/encode/' + lang + '/'
    indices_dir = './data/indices/' + lang + '/'
    wt_dir = './resources/weight/'
    res_dir = './resources/classifier/rnn/'

    clf_path = res_dir + lang + '.clf'
    # don't reload classifier for debug usage

    # load embedding weights
    weights = np.load(wt_dir + lang + '.npy')

    # build model architecture
    text_input = Input(shape=(max_len, ), dtype='int32', name='input')
    embeds = Embedding(weights.shape[0],
                       weights.shape[1],
                       weights=[weights],
                       input_length=max_len,
                       trainable=True,
                       name='embedding')(text_input)
    bigru = Bidirectional(GRU(rnn_size,
                              kernel_initializer="glorot_uniform"))(embeds)
    dp = Dropout(rate=.2)(bigru)
    predicts = Dense(1, activation='sigmoid',
                     name='predict')(dp)  # binary prediction

    model = Model(inputs=text_input, outputs=predicts)
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    print(model.summary())

    best_valid_f1 = 0.0
    best_model = None

    for e in range(epochs):
        accuracy = 0.0
        loss = 0.0
        step = 1
        print('--------------Epoch: {}--------------'.format(e))

        # load training and batch dataset
        train_iter = evaluator.data_iter(indices_dir + 'train.tsv',
                                         batch_size=64)

        # train model
        for class_wt, x_train, y_train in train_iter:
            if len(np.unique(y_train)) == 1:
                continue

            tmp = model.train_on_batch([x_train],
                                       y_train,
                                       class_weight=class_wt)

            loss += tmp[0]
            loss_avg = loss / step
            accuracy += tmp[1]
            accuracy_avg = accuracy / step
            if step % 30 == 0:
                print('Step: {}'.format(step))
                print('\tLoss: {}. Accuracy: {}'.format(
                    loss_avg, accuracy_avg))
                print('--------------------------------------')
            step += 1

        # valid model to find the best model
        print('---------------Validation------------')
        valid_iter = evaluator.data_iter(indices_dir + 'valid.tsv',
                                         batch_size=64,
                                         if_shuffle=False)
        y_preds = []
        y_valids = []

        for _, x_valid, y_valid in valid_iter:
            tmp_preds = model.predict([x_valid])

            for item_tmp in tmp_preds:
                y_preds.append(round(item_tmp[0]))
            y_valids.extend(y_valid)

        valid_f1 = f1_score(
            y_true=y_valids,
            y_pred=y_preds,
            average='weighted',
        )
        print('Validation weighted f1 score: ' + str(valid_f1))

        if best_valid_f1 < valid_f1:
            best_valid_f1 = valid_f1
            best_model = model

            pickle.dump(best_model, open(clf_path, 'wb'))

            # test model
            print('--------------Test--------------------')
            y_preds = []
            y_probs = []

            test_iter = evaluator.data_iter(indices_dir + 'test.tsv',
                                            batch_size=64,
                                            if_shuffle=False)

            for _, x_test, y_test in test_iter:
                tmp_preds = model.predict([x_test])
                for item_tmp in tmp_preds:
                    y_probs.append(item_tmp[0])
                    y_preds.append(int(round(item_tmp[0])))

            with open(odir + lang + '.tsv', 'w') as wfile:
                with open(indices_dir + 'test.tsv') as dfile:
                    wfile.write(dfile.readline().strip() +
                                '\tpred\tpred_prob\n')
                    for idx, line in enumerate(dfile):
                        wfile.write(line.strip() + '\t' + str(y_preds[idx]) +
                                    '\t' + str(y_probs[idx]) + '\n')

            # save the predicted results
            evaluator.eval(odir + lang + '.tsv', odir + lang + '.score')
Example #18
def load(filename):
    """Load scheme source file `filename`."""
    # The parameter filename is of the form '"path"'
    filename = filename[1:-1]
    with open(filename) as file:
        input = file.read()
    result = []
    parseInput(input, 0, result)
    for exp in result:
        output = eval(exp, the_global_environment)
    return output


primitive_procedure_names.append('load')
primitive_procedure_objects.append(['primitive', load])
the_global_environment = Environment(primitive_procedure_names, primitive_procedure_objects)
load('"./init.scm"')

if __name__ == "__main__":
    if len(sys.argv) == 1:
        while True:
            source = input("Input: ")
            result = []
            parseInput(source, 0, result)
            for exp in result:
                print(formatOutput(eval(exp, the_global_environment)))
    else:
        with open(sys.argv[1]) as file:
            input = file.read()
        result = []
        parseInput(input, 0, result)
        for exp in result:
            print(formatOutput(eval(exp, the_global_environment)))
Example #19
def build_lr(lang, odir):
    '''Train, valid, test lr
    
        lang: The language name
        odir: output directory of prediction results
    '''
    doc_idx = 2
    encode_dir = './data/encode/'+lang+'/'
    split_dir = './data/split/'+lang+'/'
    res_dir = './resources/classifier/lr/'
    
    vec_path = res_dir + lang + '.vect'
    clf_path = res_dir + lang + '.clf'
    # don't load classifier for debug usage

    print('Building vectorizer...')
    if os.path.exists(vec_path):
        vect = pickle.load(open(vec_path, 'rb'))
    else:
        corpus = []
        with open(encode_dir + 'corpus.tsv') as dfile:
            dfile.readline() # skip column names
            for line in dfile:
                line = line.strip().split('\t')
                corpus.append(line[doc_idx])

        vect = TfidfVectorizer(
            ngram_range=(1, 3), max_features=15000)
        vect.fit(corpus)
        pickle.dump(vect, open(vec_path, 'wb'))

    print('Building classifier...')
    # load training data
    data = {'x':[], 'y':[]}
    with open(split_dir+'train.tsv') as dfile:
        dfile.readline()
        for line in dfile:
            line = line.strip().split('\t')
            data['x'].append(line[doc_idx])
            data['y'].append(int(line[-1]))

    # calculate the weight of labels
    weights = dict(zip(
        np.unique(data['y']),
        compute_class_weight(class_weight='balanced',
                             classes=np.unique(data['y']),
                             y=data['y'])
    ))
    # shuffle the data before training
    data['x'], data['y'] = shuffle(data['x'], data['y'])

    # build classifier
    clf = LogisticRegression(
        class_weight=weights, solver='liblinear')
    clf.fit(vect.transform(data['x']), data['y'])
    # save the classifier
    pickle.dump(clf, open(clf_path, 'wb'))
    
    # test the classifier
    data = []
    with open(split_dir+'test.tsv') as dfile:
        dfile.readline()
        for line in dfile:
            line = line.strip().split('\t')
            data.append(line[doc_idx])

    data = vect.transform(data)
    y_preds = clf.predict(data)
    y_probs = clf.predict_proba(data)

    # save the test results
    with open(odir+lang+'.tsv', 'w') as wfile:
        with open(split_dir+'test.tsv') as dfile:
            wfile.write(
                dfile.readline().strip()+'\tpred\tpred_prob\n'
            )
            for idx, line in enumerate(dfile):
                # 1 is the hate speech label
                wfile.write(line.strip()+'\t'+str(y_preds[idx])+'\t'+str(y_probs[idx][1])+'\n')

    # save the predicted results
    evaluator.eval(
        odir+lang+'.tsv', 
        odir+lang+'.score'
    )
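A hedged usage sketch, analogous to the CNN and RNN examples above; the output directory is an assumption:

import os

odir = './results/lr/'       # hypothetical output directory
os.makedirs(odir, exist_ok=True)
build_lr('English', odir)    # writes English.tsv and English.score into odir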
Example #20
import preprocess_clueweb as p
import single_model_handler as mh
import evaluator as e
import params
import sys
if __name__ == "__main__":
    preprocess = p.preprocess()
    X, y, queries = preprocess.retrieve_data_from_file(params.data_set_file,
                                                       params.normalized)
    sys.stdout.flush()
    number_of_queries = len(set(queries))
    evaluator = e.eval()
    evaluator.create_index_to_doc_name_dict()
    evaluator.remove_score_file_from_last_run()
    sys.stdout.flush()
    train, validation = preprocess.create_test_train_split_cluweb(queries)
    sys.stdout.flush()
    X_i, y_i = preprocess.create_data_set(X[train], y[train], queries[train])
    sys.stdout.flush()
    C_array = [0.1, 0.01, 0.001]
    single_model_handler = mh.single_model_handler(C_array)

    single_model_handler.fit_model_on_train_set_and_choose_best_for_competition(
        X, y, X_i, y_i, validation, queries, evaluator, preprocess)
    print("learning is finished")
Example #21
def __call__(self, *args, **kwargs):
    import evaluator
    new_env = Environment(self.env)
    for s, v in zip(self.args, args):
        new_env[s] = v
    return evaluator.eval(self.body, new_env)
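The __call__ above binds the formal parameters to the call arguments in a fresh child Environment and evaluates the stored body there. A self-contained sketch of the same closure pattern, using a flat dict copy instead of the project's linked Environment class:

def make_closure(params, body_fn, outer_env):
    # body_fn stands in for "evaluate the stored body in the given scope".
    def call(*args):
        local_env = dict(outer_env)          # child scope (copied here for brevity)
        local_env.update(zip(params, args))  # bind formal parameters to arguments
        return body_fn(local_env)
    return call

square = make_closure(['x'], lambda env: env['x'] * env['x'], {})
print(square(4))  # prints 16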