def _validate(self, criteria, toValidate):
    if criteria is None:
        PresentValidator._LOGGER.debug("None criteria, so returning True")
        return True
    elif toValidate is None:
        PresentValidator._LOGGER.debug("None toValidate, so returning False")
        return False
    elif 'name' not in criteria:
        PresentValidator._LOGGER.debug("'name' not in criteria, so returning False")
        return False
    elif DictUtils.defaultIfNone(toValidate, None, criteria['name']) is None:
        PresentValidator._LOGGER.debug("criteria['name']: " + criteria['name'] + " not in toValidate, so returning False")
        return False
    elif 'expected' not in criteria:
        PresentValidator._LOGGER.debug("no expected found in criteria, so returning True")
        return True
    else:
        toValidateVal = DictUtils.defaultIfNone(toValidate, None, criteria['name'])
        if list == type(criteria['expected']):
            PresentValidator._LOGGER.debug("will match nested criteria later")
            return True
        elif str == type(criteria['expected']):
            PresentValidator._LOGGER.debug("matching string")
            return criteria['expected'] == toValidateVal
        elif type(criteria['expected']) != type(toValidateVal):
            PresentValidator._LOGGER.debug("type mismatch criteria[expected]:" + str(type(criteria['expected'])) + ", toValidate['" + criteria['name'] + "']: " + str(type(toValidateVal)) + ", so returning False")
            return False
        else:
            PresentValidator._LOGGER.debug("oops! validation failed")
            return False
def create_dicts(sentences):
    dict_q = dict()
    dict_e = dict()
    for sentence in sentences:
        DictUtils.insert_sentence_to_dicts(dict_q, dict_e, sentence)
    dict_e = DictUtils.convert_rare_words(dict_e)
    DictUtils.update_dict(dict_q, 'ALL', sum(dict_e.values()))
    return dict_q, dict_e
def main(input_file_name, q_mle, e_mle, greedy_hmm_output, extra_file_name):
    start = datetime.now()
    sentences = FileUtils.read_lines(input_file_name)
    dict_q = DictUtils.convert_line_to_dict(FileUtils.read_lines(q_mle))
    dict_e = DictUtils.convert_line_to_dict(FileUtils.read_lines(e_mle))
    unk_tag_list = DictUtils.possible_tags(UNK, dict_e)
    tagged_text = greedy(sentences, dict_q, dict_e, unk_tag_list)
    FileUtils.write_tagged_text(greedy_hmm_output, tagged_text)
    end = datetime.now()
    print('Running Time: {0}'.format(end - start))
def main(input_file_name, q_mle, e_mle, hmm_viterbi_predictions, extra_file_name):
    start = datetime.now()
    sentences = FileUtils.read_lines(input_file_name)
    dict_q = DictUtils.convert_line_to_dict(FileUtils.read_lines(q_mle))
    dict_e = DictUtils.convert_line_to_dict(FileUtils.read_lines(e_mle))
    unk_tag_list = DictUtils.possible_tags('*UNK*', dict_e)
    tagged_text = viterbi(sentences, dict_q, dict_e, unk_tag_list)
    FileUtils.write_tagged_text(hmm_viterbi_predictions, tagged_text)
    end = datetime.now()
    print('Running Time: {0}'.format(end - start))
def memm_greedy(sentences, max_sentence_len, features_map, counters_dict, clf):
    all_words_features = list()
    sentences_predictions = list()
    for j in range(len(sentences)):
        sentences_predictions.append(list())
    for i in range(max_sentence_len):
        start = datetime.now()
        for j in range(len(sentences)):
            sentence = sentences[j]
            if i < len(sentence):
                word = sentence[i]
                word_features = FeaturesUtils.get_word_features(
                    i, sentence, sentences_predictions[j],
                    DictUtils.is_rare(counters_dict, word))
                all_words_features.append(word_features)
            else:
                all_words_features.append(dict())
        all_prediction_word_i = get_prediction_of_all_words(
            all_words_features, clf, features_map)
        end = datetime.now()
        print('word i={1} Running Time: {0}'.format(end - start, i))
        for j in range(len(all_prediction_word_i)):
            sentences_predictions[j].append(all_prediction_word_i[j])
        all_words_features = list()
    return sentences_predictions
def validate(criteria, toValidate):
    ValidatorFactory._LOGGER.debug("criteria: " + str(criteria))
    ValidatorFactory._LOGGER.debug("toValidate: " + str(toValidate))
    if criteria is None:
        ValidatorFactory._LOGGER.debug("None criteria, so returning True")
        return True
    if 'check' not in criteria:
        ValidatorFactory._LOGGER.debug("'check' not in criteria, so returning True, criteria: " + str(criteria))
        return True
    validator = ValidatorFactory.getValidator(criteria['check'])
    if validator is None:
        ValidatorFactory._LOGGER.error("no validator found, so returning False, check: " + criteria['check'])
        return False
    isValid = validator.validate(criteria, toValidate)
    ValidatorFactory._LOGGER.debug("isValid: " + str(isValid))
    if not isValid:
        return False
    if isinstance(validator, PresentValidator) and 'expected' in criteria and list == type(criteria['expected']):
        ValidatorFactory._LOGGER.debug("will match nested criteria now ...")
        for nextCriteria in criteria['expected']:
            ValidatorFactory._LOGGER.debug("nextCriteria:" + str(nextCriteria))
            if 'check' not in nextCriteria:
                continue
            if 'name' not in nextCriteria:
                ValidatorFactory._LOGGER.error("no name property found for check: " + str(nextCriteria))
                return False
            nextToValidate = DictUtils.defaultIfNone(toValidate, None, criteria['name'])
            ValidatorFactory._LOGGER.debug("nextToValidate:" + str(nextToValidate))
            isCheckValid = ValidatorFactory.validate(nextCriteria, nextToValidate)
            if not isCheckValid:
                return False
    return True
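# Illustration only (not part of the original code): a hypothetical criteria document
# of the shape this factory walks. The top-level check carries a list under 'expected',
# so each nested entry is validated recursively against toValidate[criteria['name']].
# The check names 'present' and 'exact' are assumptions based on the PresentValidator
# and ExactValidator classes; use whatever names getValidator actually registers.
example_criteria = {
    'check': 'present',
    'name': 'data',
    'expected': [
        {'check': 'exact', 'name': 'status', 'expected': 'ok'},
        {'check': 'present', 'name': 'id'},
    ],
}
example_toValidate = {'data': {'status': 'ok', 'id': 42}}
# Under these assumptions, ValidatorFactory.validate(example_criteria, example_toValidate)
# would return True: the 'present' check passes, then both nested checks pass on
# example_toValidate['data'].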
def create_features(words_features_list, words, tags, dict_e):
    for i in range(len(words)):
        is_rare = DictUtils.is_rare(dict_e, (words[i], tags[i]))
        word_feature_dict = FeaturesUtils.get_word_features(
            i, words, tags, is_rare)
        word_feature_dict['tag'] = tags[i]
        words_features_list.append(word_feature_dict)
        print(word_feature_dict)
def get_word_signatures_tag(word, dict_e, unk_tag_list):
    signatures = WordSignatures.get_word_signatures(word)
    if signatures == [word.lower()]:
        return {UNK: unk_tag_list}
    else:
        signatures_tags = dict()
        for signature in signatures:
            signatures_tags[signature] = DictUtils.possible_tags(signature, dict_e)
        return signatures_tags
def convert_line_to_lists(line):
    words = list()
    tags = list()
    for pair in line.split(' '):  # renamed from 'tuple' to avoid shadowing the built-in
        word, tag = DictUtils.split_tuple(pair)
        words.append(word)
        tags.append(tag)
    return words, tags
def main(features_file, model_file, feature_map_file):
    start = datetime.now()
    all_features, labels = FileUtils.read_features(features_file)
    counters_dict, word_tag_dict, unk_tag_dict = DictUtils.extract_features(
        all_features, labels)
    transform_of_features, features_map, model = create_features_format(
        all_features, labels)
    FileUtils.write_feature_map(feature_map_file, features_map, counters_dict)
    FileUtils.write_logistic_regression_model(model_file, model)
    end = datetime.now()
    print('Running Time: {0}'.format(end - start))
def get_word_features_list(i, words, prev_predictions, prev_prev_predictions, counters_dict):
    all_word_features = list()
    prev_list = list()
    for prev_prediction in prev_predictions:
        for prev_prev_prediction in prev_prev_predictions:
            word_features = FeaturesUtils.get_word_features(
                i, words,
                get_tag_list(i, prev_prediction, prev_prev_prediction),
                DictUtils.is_rare(counters_dict, words[i]))
            all_word_features.append(word_features)
            prev_list.append((prev_prediction, prev_prev_prediction))
    return all_word_features, prev_list
def run(self):
    Tester.__LOGGER.info("in run")
    if 'steps' not in self.__config:
        Tester.__LOGGER.info("no test steps to execute")
        return
    default = DictUtils.defaultIfNone(self.__config, None, 'default')
    control = {
        'loop': {'running': False, 'count': 0, 'steps': []},
        'session': {'running': False, 'steps': {}},
        'result': {
            'total': {'count': 0, 'time': 0},
            'passed': {'count': 0, 'time': 0},
            'failed': {'count': 0, 'time': 0},
            'steps': {}
        }
    }
    for step in self.__config['steps']:
        if False == self.__isValidStep(step):
            continue
        executor = ExecutorFactory.getExecutor(step['construct'])
        if executor is None:
            Tester.__LOGGER.error("no executor found for construct: " + step['construct'])
            continue
        executor.execute(default, step, control)
        if isinstance(executor, EndLoopExecutor):
            while control['loop']['running']:
                for tStep in control['loop']['steps']:
                    tStep['executor'].execute(default, tStep['step'], control)
                executor.execute(default, step, control)
    Tester.__LOGGER.info("================================")
    Tester.__LOGGER.info("[SUMMARY JSON]")
    Tester.__LOGGER.info(str(control['result']))
    Tester.__LOGGER.info("================================")
    Tester.__LOGGER.info("================================")
    Tester.__LOGGER.info("[SUMMARY]")
    Tester.__LOGGER.info(self.__formatResultSeparator())
    Tester.__LOGGER.info(self.__formatResultHead1())
    Tester.__LOGGER.info(self.__formatResultSeparator())
    Tester.__LOGGER.info(self.__formatResultHead2())
    Tester.__LOGGER.info(self.__formatResultSeparator())
    for step in self.__config['steps']:
        if 'sid' not in step:
            continue
        sid = step['sid']
        sidData = control['result']['steps'][sid]
        Tester.__LOGGER.info(self.__formatResultStr(sid, sidData))
        Tester.__LOGGER.info(self.__formatResultSeparator())
    Tester.__LOGGER.info(self.__formatResultStr('OVERALL', control['result']))
    Tester.__LOGGER.info(self.__formatResultSeparator())
    Tester.__LOGGER.info("================================")
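# Illustration only: a hypothetical config of the shape run() and the executors expect.
# 'default' supplies fallback values via DictUtils.defaultIfNone, and each step names a
# 'construct' that ExecutorFactory maps to an executor plus an 'sid' used in the summary.
# The construct name 'test' and the field values below are assumptions for illustration,
# not the project's actual registered names.
example_config = {
    'default': {'host': 'http://localhost:8080', 'method': 'GET'},
    'steps': [
        {'sid': 'ping', 'construct': 'test', 'path': '/ping'},
        {'sid': 'create', 'construct': 'test', 'path': '/items',
         'method': 'POST', 'input': {'name': 'sample'}},
    ],
}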
def main(input_file_name, model_file_name, feature_map_file, output_file_name):
    start = datetime.now()
    clf, vec = FileUtils.read_logistic_regression_model(model_file_name)
    classes = clf.classes_.tolist()
    sentences = FileUtils.read_lines(input_file_name)
    feature_map_lines = FileUtils.read_lines(feature_map_file)
    features_map, counters_dict = DictUtils.create_features_dicts(
        feature_map_lines)
    tagged_text = viterbi(sentences, features_map, counters_dict, clf, classes)
    FileUtils.write_tagged_text(output_file_name, tagged_text)
    end = datetime.now()
    print('Running Time: {0}'.format(end - start))
def _validate(self, criteria, toValidate):
    if criteria is None:
        ExactValidator._LOGGER.debug("None criteria, so returning True")
        return True
    elif toValidate is None:
        ExactValidator._LOGGER.debug("None toValidate, so returning False")
        return False
    elif 'name' not in criteria:
        ExactValidator._LOGGER.debug("'name' not in criteria, so returning False")
        return False
    elif DictUtils.defaultIfNone(toValidate, None, criteria['name']) is None:
        ExactValidator._LOGGER.debug("criteria['name']: " + criteria['name'] + " not in toValidate, so returning False")
        return False
    elif 'expected' not in criteria:
        ExactValidator._LOGGER.debug("no expected found in criteria, so returning True")
        return True
    else:
        toValidateVal = DictUtils.defaultIfNone(toValidate, None, criteria['name'])
        if type(criteria['expected']) != type(toValidateVal):
            ExactValidator._LOGGER.debug("type mismatch, so returning False - type(criteria['expected']): " + str(type(criteria['expected'])) + ", type(toValidate['" + criteria['name'] + "']): " + str(type(toValidateVal)))
            return False
        else:
            ExactValidator._LOGGER.debug("matching criteria['expected']: " + str(criteria['expected']) + ", toValidate['" + criteria['name'] + "']: " + str(toValidateVal))
            return criteria['expected'] == toValidateVal
def main(input_file_name, model_file_name, feature_map_file, output_file_name):
    start = datetime.now()
    clf, vec = FileUtils.read_logistic_regression_model(model_file_name)
    sentences, max_sentence_len = FileUtils.read_sentences(input_file_name)
    feature_map_lines = FileUtils.read_lines(feature_map_file)
    features_map, counters_dict = DictUtils.create_features_dicts(
        feature_map_lines)
    sentences_predictions = memm_greedy(sentences, max_sentence_len, features_map, counters_dict, clf)
    FileUtils.write_prediction(output_file_name, sentences, sentences_predictions)
    end = datetime.now()
    print('Running Time: {0}'.format(end - start))
def compute_q(dict_q, tag, prev_tag, prev_prev_tag, lr1=0.90, lr2=0.09, lr3=0.01):
    # Compare against 1.0 with a tolerance: exact float equality can reject valid weights.
    if abs(lr1 + lr2 + lr3 - 1.0) > 1e-9:
        raise Exception('summing factors should be 1 !!!')
    prob1 = MathUtils.calc_fraction(
        DictUtils.get_value(dict_q, (tag, prev_tag, prev_prev_tag)),
        DictUtils.get_value(dict_q, (prev_tag, prev_prev_tag)))
    prob2 = MathUtils.calc_fraction(
        DictUtils.get_value(dict_q, (tag, prev_tag)),
        DictUtils.get_value(dict_q, prev_tag))
    prob3 = MathUtils.calc_fraction(
        DictUtils.get_value(dict_q, tag),
        DictUtils.get_value(dict_q, 'ALL'))
    return lr1 * prob1 + lr2 * prob2 + lr3 * prob3
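# Illustration only: compute_q implements linear interpolation of trigram, bigram,
# and unigram MLE estimates,
#   q(tag | prev_tag, prev_prev_tag) = lr1*c(tag,prev,prev_prev)/c(prev,prev_prev)
#                                    + lr2*c(tag,prev)/c(prev) + lr3*c(tag)/N.
# The sketch below reproduces that arithmetic with plain dict lookups and hypothetical
# counts, mirroring the key layout compute_q uses, without the project helpers.
def interpolated_q_example():
    counts = {('NN', 'DT', '*'): 2, ('DT', '*'): 3,   # trigram count and its context count
              ('NN', 'DT'): 4, 'DT': 5,               # bigram count and its context count
              'NN': 7, 'ALL': 20}                     # unigram count and total tokens
    lr1, lr2, lr3 = 0.90, 0.09, 0.01
    q = (lr1 * counts[('NN', 'DT', '*')] / float(counts[('DT', '*')]) +
         lr2 * counts[('NN', 'DT')] / float(counts['DT']) +
         lr3 * counts['NN'] / float(counts['ALL']))
    return q  # 0.90*2/3 + 0.09*4/5 + 0.01*7/20 ~= 0.6755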
response = "" else: isSuccess = True statusCode = res.getcode() response = res.read() endTime = time.time() timeTaken = (endTime - startTime) TestExecutor._LOGGER.info("statusCode: " + str(statusCode)) TestExecutor._LOGGER.info("response: " + str(response)) try: jsonRes = json.loads(response) TestExecutor._LOGGER.debug("jsonRes: " + str(jsonRes)) responseDict = DictUtils.convert(jsonRes) TestExecutor._LOGGER.debug("responseDict: " + str(responseDict)) except ValueError, e: TestExecutor._LOGGER.debug("caught exception e:" + str(e)) responseDict = None except TypeError, e: TestExecutor._LOGGER.debug("caught exception e:" + str(e)) responseDict = None if control['session']['running']: control['session']['steps'].update({sid: {'IN':inputData, 'SC':statusCode, 'OUT':responseDict}}) TestExecutor._LOGGER.debug("updated session: " + str(control['session'])) if not isSuccess: self.__recordHit(control, sid, timeTaken, False) return
def possible_tags(word, dict_e, unk_tag_list):
    words_tags = DictUtils.possible_tags(word, dict_e)
    if len(words_tags) == 0:
        return get_word_signatures_tag(word, dict_e, unk_tag_list)
    else:
        return {word: words_tags}
def compute_e(word, tag, dict_q, dict_e):
    counter = DictUtils.get_value(dict_e, (word, tag))
    denominator = DictUtils.get_value(dict_q, tag)
    return MathUtils.calc_fraction(counter, denominator)
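# Illustration only: compute_e is the MLE emission estimate
#   e(word | tag) = count(word, tag) / count(tag).
# The hypothetical counts below show the same arithmetic with plain division,
# independent of the DictUtils/MathUtils helpers.
def emission_example():
    dict_e = {('dog', 'NN'): 3}   # hypothetical (word, tag) count
    dict_q = {'NN': 12}           # hypothetical tag count
    return dict_e[('dog', 'NN')] / float(dict_q['NN'])  # 3/12 = 0.25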
def __init__(self, configFilePath):
    Tester.__LOGGER.debug("created Tester")
    with open(configFilePath, 'r') as configFile:
        self.__config = DictUtils.convert(json.load(configFile))
    Tester.__LOGGER.debug("loaded config from: " + configFilePath)
def _execute(self, default, step, control):
    if control['loop']['running'] and 0 == control['loop']['count']:
        control['loop']['steps'].append({'step': step, 'executor': self})
    sid = DictUtils.defaultIfNone(step, None, 'sid')
    if sid is None:
        TestExecutor._LOGGER.error("missing id for step: " + str(step))
        sys.exit(1)
    host = DictUtils.defaultIfNone(step, default, 'host')
    path = DictUtils.defaultIfNone(step, default, 'path')
    method = DictUtils.defaultIfNone(step, default, 'method')
    commonInputData = DictUtils.defaultIfNone(None, default, 'input')
    inputData = DictUtils.defaultIfNone(step, default, 'input')
    if path is not None:
        if path.startswith('/'):
            url = host + path
        else:
            url = host + '/' + path
    else:
        url = host
    url = self.__detemplatizeStr(url, control)
    TestExecutor._LOGGER.debug("url: " + url)
    if inputData is None:
        inputData = commonInputData
    elif commonInputData is not None:
        inputData.update(commonInputData)
    if inputData is not None:
        inputData = self.__detemplatize(inputData, control, boolToStr=True)
        # data = DictUtils.recursiveUrlencode(inputData)
    else:
        inputData = dict()
    TestExecutor._LOGGER.debug("request inputData: " + str(inputData))
    startTime = time.time()
    try:
        if 'POST' == method:
            # res = urllib2.urlopen(url, data)
            files = self.__extractFiles(inputData)
            data, headers = DictUtils.encode_multipart(inputData, files)
            if files is None or 0 == len(files):
                TestExecutor._LOGGER.debug("request data: " + str(data))
            else:
                TestExecutor._LOGGER.debug("request data: SOME POST DATA with files (won't log)")
            req = urllib2.Request(url, data=data, headers=headers)
            res = urllib2.urlopen(req)
        else:
            data = DictUtils.recursiveUrlencode(inputData)
            TestExecutor._LOGGER.debug("request data: " + data)
            url += "?" + data
            res = urllib2.urlopen(url)
    except IOError as e:
        TestExecutor._LOGGER.debug("caught exception e:" + str(e))
        isSuccess = False
        if hasattr(e, 'code'):
            statusCode = e.code
        else:
            statusCode = 500
        if hasattr(e, 'reason'):
            response = e.reason
        else:
            response = ""