Example #1
def get_blank_answer(self, key):
    # Look up the stored answer for this key and one-hot encode its
    # first cleaned token against the vocabulary.
    answer = self.data_df.loc[key, 'answer']
    answer_idx = self.word2idx[data_util.clean_str(answer).split()[0]]
    voc_size = self.word_matrix.shape[0]
    onehot_answer = np.zeros(voc_size)
    onehot_answer[answer_idx] = 1
    return onehot_answer
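
This method leans on class state (`self.word2idx`, `self.word_matrix`) and the project's `data_util.clean_str`. A minimal standalone sketch of the same one-hot step, with a toy vocabulary and a lower-casing stand-in for `clean_str` (both are assumptions, not from the original project):

import numpy as np

# Toy stand-ins for the class attributes used above (assumptions).
word2idx = {"cat": 0, "sat": 1, "mat": 2}
vocab_size = len(word2idx)

def one_hot_first_token(answer):
    # lower-casing stands in for data_util.clean_str here
    idx = word2idx[answer.lower().split()[0]]
    vec = np.zeros(vocab_size)
    vec[idx] = 1
    return vec

print(one_hot_first_token("Cat sat"))  # [1. 0. 0.]
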
Example #2
def _classify(name, classifier, verbose=args.verbose):
    # Extract features from the cleaned name and ask the classifier
    # for a probability distribution over the two genders.
    _name = _gender_features(clean_str(name))
    dist = classifier.prob_classify(_name)
    m, f = dist.prob("male"), dist.prob("female")
    # Pick the more probable label directly; the original dict lookup
    # ({m: "male", f: "female"}) silently collapsed when m == f.
    guess, prob = ("male", m) if m >= f else ("female", f)
    if verbose:
        print("%s -> %s (%.2f%%)" % (name, guess, prob * 100))
    return guess, prob
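
A minimal sketch of how this helper might be driven, assuming an NLTK-style classifier whose `prob_classify` returns a probability distribution (the `clean_str` stand-in, feature function, and training pairs below are illustrative, not from the original project):

import nltk

def clean_str(s):            # stand-in for the project's helper
    return s.strip().lower()

def _gender_features(name):  # illustrative feature extractor
    return {"last_letter": name[-1]}

train = [(_gender_features("anna"), "female"),
         (_gender_features("john"), "male")]
classifier = nltk.NaiveBayesClassifier.train(train)
dist = classifier.prob_classify(_gender_features(clean_str("Anna")))
print(dist.prob("female"))   # probability that "Anna" is female
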
Example #3
def split_sentence_into_words(self, sentence):
    '''
    Split the given sentence (str) and yield its words one at a time.
    Each word is normalized: lower-cased, with non-alphabetic
    characters such as periods (.) and commas (,) stripped.
    Tokenization uses ``data_util.clean_str``.
    '''
    try:
        words = data_util.clean_str(sentence).split()
    except Exception:
        print(sentence)
        sys.exit()
    for w in words:
        if not w:
            continue
        yield w
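
Since the method is a generator, callers either iterate it lazily or materialize it with list(). A self-contained sketch, with a regex stand-in for `data_util.clean_str` (an assumption):

import re

class _Tokenizer:
    @staticmethod
    def _clean_str(s):  # stand-in for data_util.clean_str
        return re.sub(r"[^a-z ]", "", s.lower())

    def split_sentence_into_words(self, sentence):
        for w in self._clean_str(sentence).split():
            if w:
                yield w

print(list(_Tokenizer().split_sentence_into_words("Hello, World.")))
# ['hello', 'world']
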
Example #4
def load_visualized_post(sequence_length, vocabulary):
    """
    Load the example post in several formats.
    :param sequence_length: sequence length to pad to
    :param vocabulary: vocabulary dictionary mapping word -> index
    :return: x is the padded post mapped through the vocabulary,
            sentence is the original post as a string,
            new_sentence is the tokenized post
    """
    with open('visual_example/image.txt') as f:
        sentence = f.read()
    new_sentence = clean_str(sentence.replace('\n', '').lower()).split(' ')
    # If the post is longer than sequence_length, num_padding goes
    # negative and no padding (or truncation) is applied.
    num_padding = sequence_length - len(new_sentence)
    padding_word = "<PAD/>"
    padded_sentence = new_sentence + [padding_word] * num_padding
    x = [vocabulary[word] for word in padded_sentence]
    x = [np.array([x, ]), 1.]       # pack as a single-example batch plus a scalar feed value
    return x, sentence, new_sentence
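
The pad-and-map step is the heart of this loader. A self-contained sketch of just that step, with the same "<PAD/>" convention (the toy vocabulary here is illustrative):

import numpy as np

vocabulary = {"<PAD/>": 0, "hello": 1, "world": 2}  # toy vocabulary

def pad_and_map(tokens, sequence_length, vocabulary, padding_word="<PAD/>"):
    padded = tokens + [padding_word] * (sequence_length - len(tokens))
    return np.array([vocabulary[w] for w in padded])

print(pad_and_map(["hello", "world"], 5, vocabulary))  # [1 2 0 0 0]
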
Example #5
def read_names(filename=args.infile):
    with open(filename, 'r') as f:
        names = [clean_str(line.rstrip('\n')) for line in f]
    print("Loaded %d names from %s" % (len(names), filename))
    return names
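
All of these excerpts delegate normalization to a project-local `clean_str` that is never shown. A representative regex-based implementation in the style these text-classification projects commonly use (an assumption, not the verbatim helper from any of them):

import re

def clean_str(string):
    # Assumed implementation: keep letters, digits, and basic
    # punctuation, collapse runs of whitespace, then lower-case.
    string = re.sub(r"[^A-Za-z0-9(),!?'`]", " ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

print(clean_str("Hello,   World!!"))  # hello, world!!
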
def train_model(pattern, func):
    """
    Train and evaluate the specified linear model on one edit type of data.
    :param pattern: edit type
    :param func: loss function name passed to SGDClassifier
    :return: None
    """
    # Load data from files
    positive_examples = []
    negative_examples = []
    with open('../data/' + pattern + "pos.txt", "r", encoding='latin-1') as f:
        for line in f:
            positive_examples.append(line.strip())
    with open('../data/' + pattern + "neg.txt", "r", encoding='latin-1') as f:
        for line in f:
            negative_examples.append(line.strip())
    # Split by words
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent) for sent in x_text]
    # Generate labels (1 for positive, 0 for negative)
    y = np.concatenate([[1 for _ in positive_examples], [0 for _ in negative_examples]], 0)

    # data preprocessing
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(x_text))

    # training
    batch_size = 200
    epoch = 10

    D = SGDClassifier(loss=func, warm_start=True)
    n_batches = len(y) // batch_size
    for ep in range(epoch):
        for batch in range(n_batches):
            # Each mini-batch pairs a slice from the front of the data
            # (positives) with the mirrored slice from the back (negatives).
            if batch == 0:
                tfidf1 = np.concatenate(
                    [tfidf[batch * batch_size:(batch + 1) * batch_size].toarray(),
                     tfidf[-(batch + 1) * batch_size:].toarray()], 0)
                X_train1, X_test, y_train1, y_test = train_test_split(
                    tfidf1,
                    np.concatenate([y[batch * batch_size:(batch + 1) * batch_size],
                                    y[-(batch + 1) * batch_size:]], 0),
                    test_size=0.1, random_state=42)
                D.partial_fit(X_train1, y_train1, classes=np.unique(y_train1))
            else:
                tfidf1 = np.concatenate(
                    [tfidf[batch * batch_size:(batch + 1) * batch_size].toarray(),
                     tfidf[-(batch + 1) * batch_size:-batch * batch_size].toarray()], 0)
                X_train, X_test, y_train, y_test = train_test_split(
                    tfidf1,
                    np.concatenate([y[batch * batch_size:(batch + 1) * batch_size],
                                    y[-(batch + 1) * batch_size:-batch * batch_size]], 0),
                    test_size=0.1, random_state=42)
                D.partial_fit(X_train, y_train)
            # Training accuracy is always measured on the first batch's
            # training split (X_train1), which serves as a fixed reference set.
            Dpred = D.predict(X_train1)
            k = 0
            for i in range(len(Dpred)):
                # predict() returns 0/1 labels, so >= 0.5 amounts to == 1
                if (Dpred[i] >= 0.5 and y_train1[i] == 1) or (Dpred[i] < 0.5 and y_train1[i] == 0):
                    k += 1
            print('epoch ' + str(ep + 1) + ': '
                  + str(int((float(batch) / n_batches) * 100)) + '% accuracy: '
                  + format(float(k) / len(X_train1), '.4f'))

            # every 10th batch (from batch 20 on): accuracy, precision,
            # and recall on the held-out split
            if batch % 10 == 0 and batch // 10 > 1:
                k = tp = fp = fn = 0
                Dpred = D.predict(X_test)
                test_size = len(X_test)
                for i in range(len(Dpred)):
                    if (Dpred[i] >= 0.5 and y_test[i] == 1) or (Dpred[i] < 0.5 and y_test[i] == 0):
                        k += 1
                    if Dpred[i] >= 0.5 and y_test[i] == 1: tp += 1
                    if Dpred[i] >= 0.5 and y_test[i] == 0: fp += 1
                    if Dpred[i] < 0.5 and y_test[i] == 1: fn += 1
                # Note: divides by zero if no positives are predicted
                # (tp + fp == 0) or none are present (tp + fn == 0).
                precision = format(float(tp) / (tp + fp), '.4f')
                recall = format(float(tp) / (tp + fn), '.4f')
                print('test accuracy:' + format(float(k) / test_size, '.4f'))
                print('test precision:' + precision)
                print('test recall:' + recall)

        # end-of-epoch evaluation on the most recent held-out split
        k = tp = fp = fn = 0
        Dpred = D.predict(X_test)
        test_size = len(X_test)
        for i in range(len(Dpred)):
            if (Dpred[i] >= 0.5 and y_test[i] == 1) or (Dpred[i] < 0.5 and y_test[i] == 0):
                k += 1
            if Dpred[i] >= 0.5 and y_test[i] == 1: tp += 1
            if Dpred[i] >= 0.5 and y_test[i] == 0: fp += 1
            if Dpred[i] < 0.5 and y_test[i] == 1: fn += 1
        precision = format(float(tp) / (tp + fp), '.4f')
        recall = format(float(tp) / (tp + fn), '.4f')
        print('test accuracy:' + format(float(k) / test_size, '.4f'))
        print('test precision:' + precision)
        print('test recall:' + recall)
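
The hand-rolled counting loops above recompute standard metrics. An equivalent, shorter formulation with sklearn.metrics (a drop-in simplification under the same 0/1 labels, not the original author's code):

import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score

y_test = np.array([1, 0, 1, 1, 0])   # illustrative labels
Dpred = np.array([1, 0, 0, 1, 1])    # illustrative predictions

print('test accuracy: %.4f' % accuracy_score(y_test, Dpred))
print('test precision: %.4f' % precision_score(y_test, Dpred))
print('test recall: %.4f' % recall_score(y_test, Dpred))
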