def convert_to_trainable_format(title, title_transform_func, feature_extractor, **kwargs):
    """Given some title(before capitalization), return the trainable(for CRF-suite) format

    >>> from cap_transform import make_capitalized_title
    >>> from capitalization_restoration.feature_extractor import FeatureExtractor
    >>> extractor = FeatureExtractor()
    >>> sent = convert_to_trainable_format(u"Why oil prices will be 'robust' long-term: Shell CEO", make_capitalized_title, extractor, docpath="test_data/oil-price")
    >>> sent[2]["word"]
    u'Prices'
    >>> sent[5]["lower-in-dict"]
    False
    >>> sent[1]["y"]
    'AL'
    """
    # Accept either a pre-tokenized list or a raw string title.
    words = title if isinstance(title, list) else nltk.word_tokenize(title)
    transformed = title_transform_func(title_words=words)
    tokens = feature_extractor.extract(transformed, **kwargs)
    # Derive the gold label for each token from the ORIGINAL (pre-transform) casing.
    for raw_word, token in zip(words, tokens):
        token["y"] = get_label(raw_word)
    return tokens
def convert_to_trainable_format(title, title_transform_func, feature_extractor, **kwargs):
    """Given some title(before capitalization), return the trainable(for CRF-suite) format

    >>> from cap_transform import make_capitalized_title
    >>> from capitalization_restoration.feature_extractor import FeatureExtractor
    >>> extractor = FeatureExtractor()
    >>> sent = convert_to_trainable_format(u"Why oil prices will be 'robust' long-term: Shell CEO", make_capitalized_title, extractor, docpath="test_data/oil-price")
    >>> sent[2]["word"]
    u'Prices'
    >>> sent[5]["lower-in-dict"]
    False
    >>> sent[1]["y"]
    'AL'
    """
    if isinstance(title, list):
        raw_words = title
    else:
        raw_words = nltk.word_tokenize(title)
    recased = title_transform_func(title_words=raw_words)
    featured = feature_extractor.extract(recased, **kwargs)
    # Attach supervision: the label comes from the word's original casing.
    for source_word, entry in zip(raw_words, featured):
        entry["y"] = get_label(source_word)
    return featured
def get_lebel():
    """Handle a label-lookup request (Flask-style view).

    Reads userId/taskId/taskType/labelCount from the JSON request body,
    fetches the labels via ``Label.get_label`` and returns a success
    response shaped per task type, or a generic failure response.

    NOTE(review): the name ``get_lebel`` looks like a typo for
    ``get_label`` -- kept unchanged so existing route registrations
    still resolve.
    """
    try:
        data = request.json
        userId = data.get(USERID)
        taskId = data.get(TASKID)
        taskType = data.get(TASKTYPE)
        labelCount = data.get("labelCount")
        # `is None` instead of `== None`: identity check is the correct idiom.
        if taskId is None:
            # NOTE(review): unlike the handler below, this calls the bare
            # helper rather than Respond.return_failed_with_msg -- confirm
            # it is imported directly, otherwise this raises NameError.
            return return_failed_with_msg("No taskId")
        result_label = Label.get_label(userId, taskId, taskType, labelCount)
        if taskType == CLASSIFICATION:
            result = {
                TASKID: taskId,
                TASKTYPE: taskType,
                "labelList": result_label,
            }
            print(result)  # debug output -- consider a logger instead
            return Respond.return_success_with_data(result)
        if taskType == NER:
            result = {
                TASKID: taskId,
                TASKTYPE: taskType,
                "label": result_label,
            }
            return Respond.return_success_with_data(result)
    except Exception:
        # Was a bare `except:`, which also swallows SystemExit and
        # KeyboardInterrupt; narrow to Exception, same failure response.
        return Respond.return_failed()
def make_capitalized_title(title=None, title_words=None):
    """Return the title's words re-cased headline-style.

    Accepts either a pre-tokenized ``title_words`` list or a raw
    ``title`` string (tokenized with nltk); raises ValueError when
    neither is given.  Words labeled 'MX' by get_label are left as-is;
    the first word and ordinary words are capitalized (unless already
    starting upper-case); articles, prepositions and conjunctions are
    lower-cased except in first position.
    """
    if title_words:
        words = title_words
    elif title:
        words = nltk.word_tokenize(title)
    else:
        raise ValueError("Receive nothing..")

    result = []
    for position, token in enumerate(words):
        lowered = token.lower()
        if get_label(token) == 'MX':
            result.append(token)
        elif position == 0:
            # Keep an already-capitalized first word untouched.
            result.append(token if token[0] == token[0].upper() else token.capitalize())
        elif lowered in ARTICLES or lowered in PREPOSITIONS or lowered in CONJUNCTIONS:
            result.append(lowered)
        elif token[0] == token[0].upper():
            # already capitalized
            result.append(token)
        else:
            result.append(token.capitalize())
    return result
def load_labeled_data(path):
    """Yield, for each line of the JSON-lines file at *path*, a list of
    (word, label) pairs for the tokenized title.

    Each line is expected to be a JSON array whose second element is the
    title string (the first element is ignored).

    >>> d = load_labeled_data(path = "fnames_and_titles.txt")
    >>> d.next()[:8]
    [(u'The', 'IC'), (u'Sun', 'IC'), (u'Life', 'IC'), (u'Building', 'IC'), (u'receives', 'AL'), (u'LEED', 'AU'), (u'Silver', 'IC'), (u'Certification', 'IC')]
    """
    # BUG FIX: the builtin open()'s third positional parameter is the
    # (integer) buffering size, so open(path, "r", "utf8") raises
    # TypeError.  io.open takes an explicit encoding keyword and behaves
    # the same on Python 2 and 3.
    import io
    with io.open(path, "r", encoding="utf8") as f:
        for line in f:
            _, title = json.loads(line)
            words = nltk.word_tokenize(title)
            yield [(w, get_label(w)) for w in words]
else:
    # Default date windows (the matching `if` branch is outside this view --
    # presumably it handles explicitly supplied dates; confirm upstream).
    # Test labels start May 2017, training labels April 2017; get_dates
    # derives the corresponding end/feature-window dates.
    test_label_begin_date = datetime(2017, 5, 1)
    test_label_end_date, test_feature_begin_dates, test_feature_end_date = get_dates(test_label_begin_date)
    train_label_begin_date = datetime(2017, 4, 1)
    train_label_end_date, train_feature_begin_dates, train_feature_end_date = get_dates(train_label_begin_date)

########## FEATURE EXTRACTION ##########
# get training feature and label
train_feature = get_feature(data, train_feature_begin_dates, train_feature_end_date, featured_month_periods)
train_label = get_label(data, train_label_begin_date, train_label_end_date)
# get test feature
test_feature = get_feature(data, test_feature_begin_dates, test_feature_end_date, featured_month_periods)

########## MODEL TRAINING ##########
# Drop the id column so only feature/label columns feed the model
# (assumes get_feature/get_label return DataFrames with a 'user_id' column).
x_train = train_feature.drop('user_id', axis=1)
y_train = train_label.drop('user_id', axis=1)
x_test = test_feature.drop('user_id', axis=1)
# Gradient-boosting parameter dict ('gbdt' objective=regression); the
# literal is truncated at the edge of this view and continues below.
model_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
# Fold the accumulated labels/predictions into accuracy and
# precision/recall counts (0.5 is presumably the decision threshold --
# confirm against utils.calc_acc_f1).
acc, true_positives, real_positives, predicted_positives = utils.calc_acc_f1(
    label_all, pred_all, 0.5)
# Append this run's metrics to a shared log file.
# NOTE(review): prefer `with open(...)` so the handle closes on error.
fout = open('log.txt', 'a+', encoding='utf-8')
fout.write('\n' + '*' * 20 + '\n')
fout.write('acc:' + str(acc) + '\n')
fout.write('true_positives:' + str(true_positives) + '\n')
fout.write('real_positives:' + str(real_positives) + '\n')
fout.write('predicted_positives:' + str(predicted_positives) + '\n')
fout.close()
# acc, true_positives, real_positives, predicted_positives = utils.calc_acc_f1(target, output, 0.5)
print('f1:%.4f' % (f1))
if (args.command == "check"):
    # 5-fold ensemble check: accumulate rounded per-file predictions
    # from each fold's model into `pred`.
    filenameslist, filelabelslist = get_label()
    pred = np.zeros(filelabelslist.shape)
    # Keep running label/prediction tensors on GPU when available.
    if torch.cuda.is_available():
        label_all = torch.Tensor().cuda()
        pred_all = torch.Tensor().cuda()
    else:
        label_all = torch.Tensor()
        pred_all = torch.Tensor()
    for i in range(5):
        config.train_data = 'path/train'
        args.fold = i
        target, output, filename = check(args)
        idx = []
        # Map each returned filename back to its index in the master list
        # so fold outputs land on the right rows.
        for tmp_name in filename:
            idx.append(filenameslist.index(tmp_name))
        # Loop body may continue past the edge of this view.
        pred[idx] += np.round(output.cpu().detach().numpy())