def get_concepts(self):
    """
    Return all concepts (unpublished blogs).

    Returns:
        list: List of Concept objects.

    Raises:
        ValueError: If the user has no blog.
    """
    if not self.has_blog:
        raise ValueError("User doesn't have blog!")

    self.login()

    # Links to the concepts live in the untagged part of the page,
    # right after this (localized) heading.
    marker = '<div class="s_nadpis">Rozepsané zápisy</div>'
    page = self._get(self.blog_url)
    if marker not in page:
        return []

    dom = dhtmlparser.parseString(page.split(marker)[1])
    section = dom.find("div", {"class": "s_sekce"})[0]

    # each <li> holds exactly one <a> pointing to a concept
    found = []
    for item in section.find("li"):
        anchor = item.find("a")[0]
        found.append(
            Concept(title=anchor.getContent().strip(),
                    link=anchor.params["href"],
                    session=self.session))
    return found
def _tagging(self, sentence, region_start, region_end, premodifier: 'e.g. a number, like 至少 of "至少22k"', posttoken: 'e.g. a , like 22k of "至少22k"'):
    """Wrap the [premodifier + token] region into a Concept and add it
    to the sentence, firing the create/add hooks around it."""
    text = "".join(sentence[region_start:region_end])

    # the concept is typed with the concrete derived tagger class
    values = {
        self.get_premodifier_label(): premodifier,
        self.get_posttoken_label(): posttoken,
    }
    wrapped = Concept(region_start, region_end, text,
                      self.__class__, values)
    wrapped.sign(_Premodifier)

    self._on_create_concept(sentence, wrapped)
    sentence.add_concept(wrapped)
    self._on_add_concept(sentence, wrapped)
def integrate(self, concept: Concept) -> Concept:
    """Recursively merge *concept* (and its parents) into this store.

    Returns the store-resident equivalent of *concept*: on first
    sight a clone is created (with its parents integrated first),
    registered, and indexed; on later sights the new probability is
    merged into the existing concept.
    """
    integrated = self.get_concept(concept)
    if integrated is None:
        # integrate parents first so the clone only references
        # store-resident concepts
        ip = []  # type: list[Concept]
        for p in concept.parents:
            ip.append(self.integrate(p))
        integrated = Concept(concept.name, concept.relation, ip,
                             concept.probability)
        integrated.store = self
        integrated.register_with_parents()
        self.add_concept(integrated)
        if integrated.relation == Relation.Word:
            # index word concepts by surface form for lookup
            word = integrated.name
            if word not in self._words:
                w = Word(word)
                w.add_meaning(integrated)
                self._words[word] = w
        elif integrated.relation == Relation.Implication:
            # implication probability derives from its first parent
            integrated.propagate_probability_from_parent(
                integrated.parents[0])
    else:
        # already known: fold in the new evidence
        integrated.merge_probability(concept.probability)
    return integrated
def _tagging_number_and_unit(self, sentence, region_start, region_end, number_concept: 'a number, like 22 of 22k', synonym: 'a unit, like k of 22k'):
    """Tag the combined [number + metric-prefix unit] region (e.g. '22k')
    as a single real-number concept."""
    text = "".join(sentence[region_start:region_end])

    # 22k = 22 x 1000 = 22000
    base = number_concept.concept_values['value']
    scaled = base * self.get_unit_size()
    values = {
        'number': base,
        'metric_prefix': synonym,
        'value': scaled,
    }

    # the resulting concept is always a RealNumber, not the tagger class
    wrapped = Concept(region_start, region_end, text, RealNumber, values)
    wrapped.sign(_MetricPrefixUnit)

    self._on_create_concept(sentence, wrapped)
    sentence.add_concept(wrapped)
    self._on_add_concept(sentence, wrapped)
def parse_compound_concept(expression: str, store: Store, label: str=None, probability: float=1.0):
    """Parse 'Relation(arg1, arg2, ...) label' into a Concept tree.

    Arguments are split on top-level commas only; nested parentheses
    stay intact. Text after the closing ')' becomes the label when
    none was given.
    """
    open_at = expression.find('(')
    close_at = expression.rfind(')')
    rel = Relation.find(expression[:open_at])

    # split on commas that sit at parenthesis depth 0
    args = []  # type: list[str]
    depth = 0
    start = open_at + 1
    for pos in range(open_at + 1, close_at):
        token = expression[pos]
        if token == '(':
            depth += 1
        elif token == ')':
            depth -= 1
        elif token == ',' and depth == 0:
            args.append(expression[start:pos])
            start = pos + 1
    args.append(expression[start:close_at])

    # implication children start at an even prior of .5
    child_probability = .5 if rel == Relation.Implication else probability
    arg_concepts = [parse(a.strip(), store, probability=child_probability)
                    for a in args]  # type: list[Concept]

    if label is None and close_at < len(expression) - 1:
        label = expression[close_at + 1:].strip()
    return Concept(label, rel, arg_concepts, probability=probability)
def _create_concept(self, region_start, region_end, entity, concept_type, concept_values) -> Concept:
    '''
    Build the Concept wrapping the tagged region.

    Subclasses may override this and return None to reject the
    concept.
    '''
    return Concept(
        region_start, region_end, entity, concept_type, concept_values)
def add_document(self, doc):
    """
    Converts document to concept.

    @param doc: should have doc_id, rev_id, title and clean_text
    """
    # lazy %-style args: the message is only built when DEBUG is on
    # (also fixes the "addind" typo in the original message)
    _log.debug("adding document %s", doc.title)
    cid = self._generate_concept_id(doc)
    word_list = self.stemmer.process_text(doc.clean_text)
    new_concept = Concept(cid, doc.title, word_list, doc.rev_id)
    self.concepts_list.append(new_concept)
    # NOTE(review): the docstring says the document carries `doc_id`,
    # but this reads `doc.id` — confirm which attribute is intended.
    self.ids.add(doc.id)
def parse_sentence(self, root) -> Iterable:
    """Yield concepts for one sentence.

    An 'if ...' adverbial clause turns the sentence into implications
    pairing the condition with each main-clause concept; otherwise the
    simple-clause concepts are yielded unchanged.
    """
    advcl = self.dep(root, 'advcl')
    mark = self.dep(advcl, 'mark')
    if not (advcl and mark and mark.lemma_ == 'if'):
        yield from self.parse_simple(root)
        return
    # conditional: condition comes from the adverbial clause
    condition = next(self.parse_simple(advcl))
    for action in self.parse_simple(root):
        yield Concept(None, Relation.Implication, [condition, action])
def test_uses_concepts_in_parent(self):
    """A child store must reuse a concept its parent already integrated."""
    parent = Store()
    child = Store(parent)
    raw = Concept(None, Relation.Class,
                  [Concept.word('c1'), Concept.word('c2')])
    from_parent = parent.integrate(raw)
    from_child = child.integrate(raw)
    # identity, not mere equality: the very same object must be shared
    self.assertEqual(id(from_parent), id(from_child))
def create_virtual_concept(self):
    '''
    Build a placeholder concept (positions -1/-1) for some adjectives,
    using the tagger's first synonym as the entity/unit.
    '''
    name = self.get_synonym_list()[0]
    return Concept(start=-1,
                   end=-1,
                   entity=name,
                   concept_type=self.__class__,
                   concept_values={'unit': name})
def _tag(self, sentence, index=0):
    """Scan *sentence* for metric-prefix units and tag them.

    Tags each unit itself; then, if a number precedes the unit
    (possibly across whitespace), tags the combined [number unit]
    region. Without a preceding number ('萬一', '千二', '百九') a
    dummy number 1 is assumed.
    """
    for i in range(index, sentence.length()):
        # finds the metric-prefix unit
        synonym = self.match_prefix_synonym_at(sentence, i)
        # all of synonyms are concrete concepts, not abstract
        if synonym is None:  # was `== None`
            continue

        # tags the unit itself
        self._tagging_unit(sentence,
                           region_start=i,
                           region_end=i + len(synonym),
                           synonym=synonym)

        # parses the number part if it exists
        at = self._skip_whitespaces_reversely(sentence, i - 1)
        if at < 0:
            # a metric-prefix unit without the number part, such as
            # '萬一', '千二', '百九': the default number is 1
            number_concept = Concept(
                start=i, end=i,
                entity='1',  # a dummy number
                concept_type=IntegerNumber,
                concept_values={'value': 1})
        else:
            # a metric-prefix unit with the number part, such as
            # '一萬一', '一千二', '一百九'
            number_concept = sentence.get_suffix_dominated_concept(
                at, IntegerNumber, RealNumber)
            if number_concept is None:  # was `== None`
                continue

        # tags the [number unit]
        self._tagging_number_and_unit(
            sentence,
            region_start=number_concept.start,
            region_end=i + len(synonym),
            number_concept=number_concept,
            synonym=synonym)
def subj(self, sentence) -> Iterable[Concept]:
    """Yield subject concepts of *sentence*.

    Possessive subjects ("X's Y") become Part relations; conjoined
    subjects are followed via 'conj' links and all yielded.
    """
    node = self.dep(sentence, 'nsubj')
    while node is not None:
        owner = self.dep(node, 'poss')
        if owner is None:
            yield Concept.word(self.name(node))
        else:
            # "X's Y" -> Part(X, Y)
            yield Concept(None, Relation.Part,
                          [Concept.word(self.name(owner)),
                           Concept.word(self.name(node))])
        node = self.dep(node, 'conj')
def __create_lattice__(self):
    """Build the concept lattice: one Concept per (extent, intent) pair.

    Returns None (after printing a usage hint) when no dataframe has
    been loaded yet.
    """
    if self.dataframe is None:
        print(
            "Load the dataframe first\n__add_dataframe__(<type pandas.DataFrame>)"
        )
        return None
    # pair every extent with its derived intent
    return [Concept(set(extent), self.__get_intent__(extent))
            for extent in self.__get_extent_list__()]
def __init__(self, sentence):
    '''
    Sample:
        sentence: "我要17.3吋筆電"
        sequence: '17.3吋'
        - start: 2
        - end: 7 (excluded)
        - entity: '17.3吋'
        - concept_type: NumberInchUnit
        - concept_values: {'value': '17.3', 'unit': '吋'}
    '''
    super(Sentence, self).__init__(sentence)

    # cache length and the raw sentence
    self.__length = len(sentence)
    self.__sentence = sentence

    # seed the prefix and suffix concept lists with one
    # single-character, type-less Concept per position
    # (two distinct instances per index — one for each list)
    self.__prefix_concept_list = [None] * self.__length
    self.__suffix_concept_list = [None] * self.__length
    for idx in range(self.__length):
        char = self[idx]  # the char itself; no concept type yet
        # end index is idx + 1 (exclusive); suffix list is keyed by end - 1
        self.__prefix_concept_list[idx] = [Concept(idx, idx + 1, char, None)]
        self.__suffix_concept_list[idx] = [Concept(idx, idx + 1, char, None)]

    # remembers taggers which have already been executed
    self.__tagger_set = set()
def _tag(self, sentence, index=0):
    """Tag a maximal run of ASCII digits starting at/after *index*.

    Full-width digits are first normalized to ASCII in place; the run
    is then wrapped in a concept whose value is ``int(entity)``, and
    tagging continues recursively after the run.

    NOTE(review): the two digit-range checks below use visually
    identical quotes — the first pair is presumably the full-width
    characters '０'..'９' (per the 65296/65305 comments) and the
    second the ASCII '0'..'9'; confirm against the original encoding.
    """
    region_start = index
    region_end = None
    ch = None
    for i in range(index, sentence.length()):
        ch = sentence[i]

        # normalize full-width digits to ASCII:
        # '0'(=65296) -> '0'
        # ...
        # '9'(=65305) -> '9'
        if '0' <= ch and ch <= '9':
            ch = chr(ord(ch) - ord('0') + ord('0'))
            # update back to the source
            sentence[i] = ch
        # end-of-if

        if '0' <= ch and ch <= '9':
            continue
        else:
            # first non-digit character ends the region
            region_end = i
            break
        # end-of-if
    # end-of-for

    if region_end == None:
        # ran to the end of the sentence: region extends to the end
        region_end = sentence.length()
    # end-of-if

    if region_end > region_start:
        entity = sentence[region_start:region_end]
        entity = "".join(entity)
        concept_values = {'value': int(entity)}
        concept = Concept(region_start, region_end, entity, self.__CLASS,
                          concept_values)
        sentence.add_concept(concept)
    # end-of-if

    # not meet the ending, continue to tag?
    if region_end < sentence.length():
        if region_start == region_end:
            # not found the entity: skip the non-digit character
            self._tag(sentence, region_end + 1)
        else:
            # found the entity: resume right after it
            self._tag(sentence, region_end)
def _tagging1(self, sentence, region_start, region_end, synonym):
    """Tag an operator token: wrap the region into an operator concept
    and add it to the sentence, firing the create/add hooks."""
    text = "".join(sentence[region_start:region_end])
    values = {'operator': self.get_formal_operator()}

    # the concept carries the concrete tagger class as its type
    wrapped = Concept(region_start, region_end, text,
                      self.__class__, values)
    self._on_create_concept(sentence, wrapped)
    sentence.add_concept(wrapped)
    self._on_add_concept(sentence, wrapped)
def _tag(self, sentence, index=0):
    """Merge integer concepts around a '.' into one float concept.

    For every IntegerNumber concept ending right before a '.', when
    another IntegerNumber concept (the decimals) starts right after
    the dot, both integer concepts are removed and replaced with a
    single concept whose value is the float of the combined text.
    """
    region_start = None
    region_end = None
    ch = None
    for i in range(index, sentence.length()):
        concept_list = sentence.get_prefix_concept_list(i, IntegerNumber)
        if len(concept_list) == 0:
            continue
        # end-of-if
        for integer_concept in concept_list:
            # the character right after the integer part must be '.'
            at = integer_concept.end
            if at >= sentence.length():
                break
            # end-of-if
            ch = sentence[at]
            if ch != '.':
                continue
            else:
                at += 1
            # end-of-if

            # found a floating point: look for the decimal digits
            decimal_concept_list = sentence.get_prefix_concept_list(
                at, IntegerNumber)
            if len(decimal_concept_list) == 0:
                continue
            else:
                decimal_concept = decimal_concept_list[0]
            # end-of-if

            region_start = integer_concept.start
            region_end = decimal_concept.end
            entity = sentence[region_start:region_end]
            entity = "".join(entity)
            concept_values = {'value': float(entity)}

            # the float concept replaces both integer concepts
            sentence.remove_concept(integer_concept)
            sentence.remove_concept(decimal_concept)
            concept = Concept(region_start, region_end, entity,
                              self.__CLASS, concept_values)
            sentence.add_concept(concept)
def create_answer(question: Concept, mapping: Mapping[Concept, Concept]) -> Concept:
    """Substitute mapped concepts into *question*, rebuilding compounds.

    Returns None when no mapping is available. Simple concepts are
    replaced directly (or kept when unmapped); compound concepts are
    rebuilt bottom-up with answered parents.
    """
    if mapping is None:
        return None
    # simple concept: replace it if the mapping knows it
    if question.is_simple():
        return mapping.get(question, question)
    # compound: answer each parent, then rebuild the same relation
    answered = [NLU.create_answer(p, mapping)
                for p in question.parents]  # type: list[Concept]
    return Concept(question.name, question.relation, answered)
def _tagging_unit(self, sentence, region_start, region_end, synonym: 'just the unit itself'):
    """Tag the bare metric-prefix unit (without any number part)."""
    text = "".join(sentence[region_start:region_end])
    values = {'unit': synonym, 'value': self.get_unit_size()}

    # the concept is typed with the concrete tagger class
    wrapped = Concept(region_start, region_end, text,
                      self.__class__, values)
    wrapped.sign(_MetricPrefixUnit)
    # the _on_create_concept hook is deliberately not fired here
    sentence.add_concept(wrapped)
def _materialize_node(self, concept_name):
    """
    Materialize a node, returns the witness constant.

    Reuses an existing witness when one is known; otherwise invents a
    fresh constant of the concept's sort, splits the concept on
    equality with it, and supposes the concept holds for the constant.
    Requires a unary concept over a non-top sort.
    """
    concept = self.domain.concepts[concept_name]
    assert concept.arity == 1
    sort = concept.variables[0].sort
    assert sort != TopSort()

    witnesses = self._get_witnesses(concept_name)
    if len(witnesses) > 0:
        c = witnesses[0]
    else:
        c = Const(self._fresh_const_name(), sort)
        # TODO: maybe we shouldn't split here, and create the concepts
        # explicitly
        X = Var('X', c.sort)
        name = '={}'.format(c.name)
        # singleton concept "equals c", used to split the node
        self.domain.concepts[name] = Concept(name, [X], Eq(X, c))
        self.domain.split(concept_name, name)

    self.suppose(concept(c))

    return c
def get_projections(self, node):
    """
    Return a list of (name, binary_concept) with all possible
    projections at node.

    Each ternary concept is projected by substituting the node's
    witness for every variable of a matching sort; the remaining two
    variables stay free.
    """
    witnesses = self._get_witnesses(node)
    if len(witnesses) == 0:
        return []
    w = witnesses[0]

    # keep the lookup: it validates that the node concept exists
    n_concept = self.domain.concepts[node]

    projections = []
    for ternary_name in self.domain.concepts_by_arity(3):
        ternary = self.domain.concepts[ternary_name]
        for var in ternary.variables:
            if var.sort != w.sort:
                continue
            # fix this variable to the witness; the others stay free
            remaining = [v for v in ternary.variables if v is not var]
            body = substitute(ternary.formula, {var: w})
            label = str(body)
            projections.append((label, Concept(label, remaining, body)))
    return projections
def _tagging2(self, sentence, num1_concept, num2_concept):
    """Combine two number concepts into one RealNumber concept whose
    value is self.evaluate(value1, value2)."""
    start = num1_concept.start
    end = num2_concept.end
    text = "".join(sentence[start:end])

    # fold the two operand values into the result value
    result = self.evaluate(num1_concept.concept_values['value'],
                           num2_concept.concept_values['value'])

    wrapped = Concept(start, end, text, RealNumber, {'value': result})
    self._on_create_concept(sentence, wrapped)
    sentence.add_concept(wrapped)
    self._on_add_concept(sentence, wrapped)
def clone_concept_with_replacing_parent(
        concept: Concept,
        mapping: Mapping[Concept, Concept],
        old_parent: Concept,
        new_parent: Concept) -> (Concept, bool):
    """Clone *concept*, replacing *old_parent* with *new_parent*
    throughout the parent tree.

    Returns (clone, replaced), where *replaced* tells whether the old
    parent occurred anywhere. Clones that contained a replacement are
    cached in *mapping* (updated in place) and reused on later visits.
    """
    if concept in mapping:
        # cached entries always stem from a replacement, hence True
        return mapping[concept], True
    np = []  # type: list[Concept]
    replaced = False
    for p in concept.parents:
        if p == old_parent:
            np.append(new_parent)
            replaced = True
        else:
            clone, flag = clone_concept_with_replacing_parent(
                p, mapping, old_parent, new_parent)
            np.append(clone)
            if flag:
                replaced = True
    result = Concept(concept.name, concept.relation, np,
                     concept.probability)
    if replaced:
        mapping[concept] = result
    return result, replaced
def from_wikihow(cls, article):
    """
    Extracts Concept information from a WikiHowArticle.

    For every article step, builds THING (nouns), ACTION (verbs) and
    DESCRIPTOR (adjectives/adverbs) concepts from the parse tree,
    cross-links concepts that co-occur within the step, and merges
    relations for concepts that recur across steps.
    """
    things = {}
    actions = {}
    descriptors = {}
    for i, step in enumerate(article.steps):
        parse = parsetree(step.main, relations=True)[0]
        # POS buckets: NN* -> things, VB* -> actions,
        # JJ* -> thing descriptors, RB* -> action descriptors
        new_things = set(
            Concept(w.string, Concept.THING) for w in parse
            if w.pos.startswith('NN'))
        new_actions = set(
            Concept(w.string, Concept.ACTION) for w in parse
            if w.pos.startswith('VB'))
        new_thing_descriptors = set(
            Concept(w.string, Concept.DESCRIPTOR) for w in parse
            if w.pos.startswith('JJ'))
        new_action_descriptors = set(
            Concept(w.string, Concept.DESCRIPTOR) for w in parse
            if w.pos.startswith('RB'))
        if len(step.extra) > 0:
            # the step's extra sentences only contribute descriptors
            for parse in parsetree(step.extra):
                more_thing_descriptors = set(
                    Concept(w.string, Concept.DESCRIPTOR) for w in parse
                    if w.pos.startswith('JJ'))
                new_thing_descriptors.update(more_thing_descriptors)
                more_action_descriptors = set(
                    Concept(w.string, Concept.DESCRIPTOR) for w in parse
                    if w.pos.startswith('RB'))
                new_action_descriptors.update(more_action_descriptors)
        # link each thing to this step's actions and thing-descriptors
        # (skipping self-references by lemma), then merge into the
        # accumulated dict keyed by lemma
        for thing in new_things:
            for other_stuff in set.union(new_actions, new_thing_descriptors):
                if other_stuff.lemma != thing.lemma:
                    thing.add_relation(other_stuff)
            if thing.lemma not in things:
                things[thing.lemma] = thing
            else:
                things[thing.lemma].merge_relations(thing)
        # link each action to this step's things and action-descriptors
        for action in new_actions:
            for other_stuff in set.union(new_things, new_action_descriptors):
                if other_stuff.lemma != action.lemma:
                    action.add_relation(other_stuff)
            if action.lemma not in actions:
                actions[action.lemma] = action
            else:
                actions[action.lemma].merge_relations(action)
        # descriptors point back at what they describe
        for descriptor in new_thing_descriptors:
            for thing in new_things:
                descriptor.add_relation(thing)
            if descriptor.lemma not in descriptors:
                descriptors[descriptor.lemma] = descriptor
            else:
                descriptors[descriptor.lemma].merge_relations(descriptor)
        for descriptor in new_action_descriptors:
            for action in new_actions:
                descriptor.add_relation(action)
            if descriptor.lemma not in descriptors:
                descriptors[descriptor.lemma] = descriptor
            else:
                descriptors[descriptor.lemma].merge_relations(descriptor)
    concepts = ConceptSet(things, actions, descriptors)
    return cls(concepts, article)
def __init__(self, config_filename):
    """Load a full course from the files named in the course config:
    concepts, learning materials (LOM XML), material-concept coverage,
    learners, and learner scores.

    All plain-text data files are ';'-separated, one record per line.
    """
    course_config = CourseConfig(config_filename)

    # --- concepts file: "abbreviation;name" per line ---
    self.concepts = {}
    with open(course_config.concepts_filename, 'r') as concepts_file:
        for line in concepts_file:
            ccp_info = line.split('\n')[0].split(';')
            abbreviation = ccp_info[0]
            concept_name = ccp_info[1]
            self.concepts[abbreviation] = Concept(concept_name,
                                                  abbreviation)

    # --- learning materials: one LOM XML file per material ---
    self.learning_materials = {}
    # TODO(andre:2018-05-19): Move the LOM file-reading procedure into
    # the LearningMaterial class
    for root, dirs, files in os.walk(course_config.learning_materials_lom):
        for lom_file in files:
            if lom_file.endswith('.xml'):
                tree = xml.parse(os.path.join(root, lom_file))
                xml_root = tree.getroot()
                # XML namespace prefix, e.g. '{http://...}'
                pref = xml_root.tag.split('}')[0] + '}'
                material_id = int(
                    xml_root.find('./' + pref + 'general/' + pref +
                                  'identifier/' + pref + 'entry').text)
                material_name = xml_root.find('./' + pref + 'general/' +
                                              pref + 'title/' + pref +
                                              'string').text
                material_type = xml_root.find('./' + pref + 'technical/' +
                                              pref + 'format').text
                typical_learning_time = xml_root.find(
                    './' + pref + 'educational/' + pref +
                    'typicalLearningTime/' + pref + 'duration').text
                difficulty = xml_root.find('./' + pref + 'educational/' +
                                           pref + 'difficulty/' + pref +
                                           'value').text
                interactivity_level = xml_root.find('./' + pref +
                                                    'educational/' + pref +
                                                    'interactivityLevel/' +
                                                    pref + 'value').text
                interactivity_type = xml_root.find('./' + pref +
                                                   'educational/' + pref +
                                                   'interactivityType/' +
                                                   pref + 'value').text
                learning_resource_type = []
                for i in xml_root.findall('./' + pref + 'educational/' +
                                          pref + 'learningResourceType/' +
                                          pref + 'value'):
                    learning_resource_type.append(i.text)
                learning_material = LearningMaterial(
                    material_id, material_name, material_type,
                    typical_learning_time, difficulty,
                    learning_resource_type, interactivity_level,
                    interactivity_type)
                self.learning_materials[material_id] = learning_material

    # --- material coverage: "id;name;concept1;concept2;..." ---
    # links materials and concepts both ways, lazily creating the
    # coverage dicts on first use
    with open(course_config.learning_materials_filename, 'r') as learning_materials_file:
        for line in learning_materials_file:
            ccp_info = line.split('\n')[0].split(';')
            learning_material_id = int(ccp_info[0])
            learning_material = self.learning_materials[
                learning_material_id]
            # fields 2.. are the abbreviations of covered concepts
            for i in range(2, len(ccp_info)):
                concept_abbreviation = ccp_info[i]
                concept_material = self.concepts[concept_abbreviation]
                if learning_material.covered_concepts is None:
                    # learning_material.covered_concepts = []
                    learning_material.covered_concepts = {}
                # learning_material.covered_concepts.append(concept_material)
                learning_material.covered_concepts[
                    concept_abbreviation] = concept_material
                if concept_material.learning_materials is None:
                    # concept_material.learning_materials = []
                    concept_material.learning_materials = {}
                # concept_material.learning_materials.append(learning_material)
                concept_material.learning_materials[
                    learning_material_id] = learning_material

    # --- learners: profile fields plus optional learning goals ---
    self.learners = {}
    with open(course_config.learners_filename, 'r') as learners_file:
        for line in learners_file:
            ccp_info = line.split('\n')[0].split(';')
            if len(ccp_info) > 7:
                # learning_goals = []
                learning_goals = {}
                # fields 7.. are concept abbreviations: the goals
                for i in range(7, len(ccp_info)):
                    learner_learning_goal = ccp_info[i]
                    # learning_goals.append(self.concepts[learner_learning_goal])
                    learning_goals[learner_learning_goal] = (
                        self.concepts[learner_learning_goal])
            # NOTE(review): learning_goals is unbound when a line has
            # no goal fields (len(ccp_info) <= 7) — confirm the input
            # files always carry at least one goal per learner.
            registration_code = ccp_info[0]
            learner_lower_time = float(ccp_info[1])
            learner_upper_time = float(ccp_info[2])
            active_reflexive = int(ccp_info[3])
            sensory_intuitive = int(ccp_info[4])
            visual_verbal = int(ccp_info[5])
            sequential_global = int(ccp_info[6])
            learner = Learner(registration_code, learner_lower_time,
                              learner_upper_time, active_reflexive,
                              sensory_intuitive, visual_verbal,
                              sequential_global, learning_goals)
            self.learners[registration_code] = learner

    # --- learner scores: "registration;concept;score" per line ---
    with open(course_config.learners_score_filename, 'r') as learners_score_file:
        concept = None
        for line in learners_score_file:
            ccp_info = line.split('\n')[0].split(';')
            learner_registration_code = ccp_info[0]
            concept_abbreviation = ccp_info[1]
            concept_score = float(ccp_info[2])
            learner = self.learners[learner_registration_code]
            concept = self.concepts[concept_abbreviation]
            if learner.score is None:
                # lazily create the per-learner score dict
                learner.score = {}
            learner.score[concept.abbreviation] = concept_score
def parse_simple(self, root) -> Iterable:
    """Yield concepts for a simple (non-conditional) clause.

    'X is a Y' -> Class (or Feature for "is like", Identical for
    possessive attributes); 'X is ADJ' -> Feature; other verbs ->
    Feature or Action, optionally wrapped together with time/place
    feature concepts derived from modifiers and prepositions.
    """
    if root.lemma_ == 'be':
        attr = self.dep(root, 'attr')
        if attr and (attr.pos_ == 'NOUN' or attr.pos_ == 'PROPN'):
            rel = Relation.Class
            prep = self.dep(root, 'prep')
            if prep is not None and prep.lemma_ == 'like':
                # "X is like Y" -> feature, not class membership
                rel = Relation.Feature
            attr_concept = Concept.word(self.name(attr))
            poss = self.dep(attr, "poss")
            if poss is not None:
                # "X is Z's Y" -> identity with a part-of concept
                attr_concept = Concept(
                    None, Relation.Part,
                    [Concept.word(self.name(poss)), attr_concept])
                rel = Relation.Identical
            for subj in self.subj(root):  # type: Concept
                if subj.relation == Relation.Part:
                    yield Concept(None, Relation.Identical,
                                  [attr_concept, subj])
                else:
                    yield Concept(None, rel, [subj, attr_concept])
        acomp = self.dep(root, 'acomp')
        if acomp and acomp.pos_ == 'ADJ':
            # "X is ADJ" -> adjective feature
            for subj in self.subj(root):
                yield Concept(None, Relation.Feature,
                              [subj, Concept.word(self.name(acomp))])
    elif root.pos_ == 'VERB':
        for subj in self.subj(root):
            rel = Relation.Feature
            aux = self.dep(root, 'aux')
            mark = self.dep(root, 'mark')
            if aux is not None and aux.lemma_ == 'be':
                # continuous
                rel = Relation.Action
            if aux is not None and aux.lemma_ == 'do' and aux.tag_ == 'VBD':
                # past tense with aux do
                rel = Relation.Action
            if root.tag_ == 'VBD':
                # past tense
                rel = Relation.Action
            if mark is not None and mark.lemma_ == 'if':
                # conditional
                rel = Relation.Action
            concept = Concept(None, rel,
                              [subj, Concept.word(self.name(root))])
            features = []
            npadvmod = self.dep(root, 'npadvmod')
            if npadvmod is not None:
                # noun-phrase adverbial modifier read as a time feature
                features.append(
                    Concept(None, Relation.Time,
                            [Concept.word(self.name(npadvmod))]))
            advmod = self.dep(root, 'advmod')
            if advmod is not None:
                if advmod.lemma_ == 'where':
                    # open place question -> unknown relative
                    features.append(
                        Concept(None, Relation.Relative,
                                [Concept.word('?'), Concept.word('?')]))
                if advmod.lemma_ == 'when':
                    # open time question -> unknown time
                    features.append(
                        Concept(None, Relation.Time, [Concept.word('?')]))
            for prep in self.deps(root, 'prep'):
                # follow chained prepositions ("in the box on the table")
                while prep is not None:
                    obj = self.dep(prep, 'pobj')
                    obj_name = '?'
                    if obj is not None:
                        obj_name = self.name(obj)
                    features.append(
                        Concept(None, Relation.Relative, [
                            Concept.word(self.name(prep)),
                            Concept.word(obj_name)
                        ]))
                    prep = self.dep(prep, 'prep')
            if len(features) > 0:
                # wrap the core concept together with its features
                concept = Concept(None, Relation.Feature,
                                  [concept] + features)
            yield concept
# --- sorts and variables used to build the example concepts ---
UnaryRelation = FunctionSort(S, Boolean)
BinaryRelation = FunctionSort(S, S, Boolean)

X, Y, Z = (Var(n, S) for n in ['X', 'Y', 'Z'])
U = Var('U', UnaryRelation)
U1 = Var('U1', UnaryRelation)
U2 = Var('U2', UnaryRelation)
B = Var('B', BinaryRelation)
B1 = Var('B1', BinaryRelation)
B2 = Var('B2', BinaryRelation)

# binary relation constant and two distinguished element constants
nstar = Const('nstar', BinaryRelation)
x = Const('x', S)
y = Const('y', S)

# four unary concepts partitioning the domain by equality with x / y
c11 = Concept('xy', [X], And(Eq(x, X), Eq(y, X)))
c10 = Concept('x', [X], And(Eq(x, X), Not(Eq(y, X))))
c01 = Concept('y', [X], And(Not(Eq(x, X)), Eq(y, X)))
c00 = Concept('other', [X], And(Not(Eq(x, X)), Not(Eq(y, X))))

# binary concepts: nstar itself and its irreflexive version
cnstar = Concept('nstar', [X, Y], nstar(X, Y))
cnplus = Concept('nplus', [X, Y], And(nstar(X, Y), Not(Eq(X, Y))))

# combiners: quantified templates over relation variables
notexists = ConceptCombiner([U], Not(Exists([X], U(X))))
exists = ConceptCombiner([U], Exists([X], U(X)))
singleton = ConceptCombiner(
    [U], ForAll([X, Y], Implies(And(U(X), U(Y)), Eq(X, Y))))
all_to_all = ConceptCombiner(
    [U1, U2, B],
    ForAll([X, Y], Implies(And(U1(X), U2(Y)), B(X, Y))))
def concepts_to_vars(self):
    """Convert extracted tuples into Concept objects over graph variables.

    For each sentence's tuples, every (start, end) word span (except
    the 'text'/'confidence' bookkeeping keys) becomes one Concept
    whose variable set is the connected version of the span's
    variables. Also maintains the partner / sentence-index maps.
    """
    concept_num = 0
    for sent_index, sent_tuple_list in enumerate(self.tuples):
        # concepts created for this sentence's tuples
        current_tuple_concept_list = []
        for _tuple in sent_tuple_list:
            for key in _tuple:  # was _tuple.keys()
                # skip bookkeeping entries; only span lists are concepts
                if key in ('text', 'confidence'):
                    continue
                for pair in _tuple[key]:
                    start_index = pair[0]
                    end_index = pair[1]
                    # variables covered by this span of words
                    var_set = self.get_var_set(start_index=start_index,
                                               end_index=end_index)
                    # raw words of the span (kept from original; unused)
                    relevant_text = self.text.split()[pair[0]:pair[1]]
                    # connect un-connected components of the concept
                    var_set, _ = self.graph.connect_unconnected_components(
                        nodes=var_set)
                    var_set = set(var_set)
                    # create the concept
                    self.concept_list.append(
                        Concept(name='concept' + str(concept_num),
                                var_set=var_set,
                                sent_index=sent_index,
                                lable=key))
                    current_tuple_concept_list.append(concept_num)
                    # register under its sentence index
                    # (replaces a bare `try/except:` that swallowed
                    # every exception, not just the missing key)
                    self.sent_to_concept_indices.setdefault(
                        sent_index, []).append(concept_num)
                    concept_num += 1

        # remember which concepts share a tuple (they are "partners")
        self.partners.append(current_tuple_concept_list)
        self.concept_names.extend(current_tuple_concept_list)
        self.sent_to_partner_indices.setdefault(sent_index, []).append(
            current_tuple_concept_list)

        # NOTE(review): concept.name is the string 'conceptN' while
        # current_tuple_concept_list holds the integer N, so this
        # membership test can never match — confirm whether the list
        # should hold names or the test should use the number.
        for concept in self.concept_list:
            if concept.name in current_tuple_concept_list:
                concept.add_partners(current_tuple_concept_list)
def _tag(self, sentence, index=0):
    """Tag a run of Chinese digit characters as an integer concept.

    Accepts both simple (一二三…) and formal/banker's (壹貳參…)
    digits plus zero look-alikes; the run is transliterated
    digit-by-digit into ASCII, wrapped in a concept, and tagging then
    continues recursively after the run.
    """
    region_start = index
    region_end = None
    ch = None
    # ASCII digits collected for the region, in order
    digit_buffer = []
    for i in range(index, sentence.length()):
        ch = sentence[i]
        # cht: 零、壹、貳、參、肆、伍、陸、柒、捌、玖
        # chs: 零,壹,贰,参,肆,伍,陆,柒,捌,玖
        if ch in '零00OO':
            # digit: 0,0
            # alpha: O(half-width)O(full-width)
            digit_buffer.append('0')
            continue
        elif ch in '一壹ㄧ':
            # 一: 19968, 0x4e00
            # ㄧ: 12583, 0x3127 (bopomofo symbols ㄧ ㄨ ㄩ)
            digit_buffer.append('1')
            continue
        elif ch in '二貳贰':
            digit_buffer.append('2')
            continue
        elif ch in '三參叁参':
            digit_buffer.append('3')
            continue
        elif ch in '四肆':
            digit_buffer.append('4')
            continue
        elif ch in '五伍':
            digit_buffer.append('5')
            continue
        elif ch in '六陸陆':
            digit_buffer.append('6')
            continue
        elif ch in '七柒':
            digit_buffer.append('7')
            continue
        elif ch in '八捌':
            digit_buffer.append('8')
            continue
        elif ch in '九玖':
            digit_buffer.append('9')
            continue
        else:
            # first non-digit character ends the region
            region_end = i
            break
        # end-of-if
    # end-of-for

    if region_end == None:
        # ran to the end of the sentence
        region_end = sentence.length()
    # end-of-if

    if region_end > region_start:
        entity = sentence[region_start:region_end]
        entity = "".join(entity)
        concept_values = {
            'tagger': self.__class__,
            'value': int(''.join(digit_buffer)),
        }
        concept = Concept(region_start, region_end, entity, self.__CLASS,
                          concept_values)
        sentence.add_concept(concept)
    # end-of-if

    # not meet the ending, continue to tag?
    if region_end < sentence.length():
        if region_start == region_end:
            # not found the entity: skip the non-digit character
            self._tag(sentence, region_end + 1)
        else:
            # found the entity: resume right after it
            self._tag(sentence, region_end)
def main():
    """Train the alignment model.

    Pipeline: load initial concept weights, evaluate baselines, run
    EM step-E initialization, train the embedding network, then sweep
    alignment thresholds on the test set and save the learned
    embedding weights.
    """
    # final model files (the original also assigned and immediately
    # overwrote an 'en-ro-bpe-16K-*.bin' pair — dead code, removed)
    f_fasttext_model_file = 'en_embedding_fasttext_model.bin'
    e_fasttext_model_file = 'ro_embedding_fasttext_model.bin'

    concept_count_v2 = 'weights/en-ro-bpe-count-p_concept-v2.weight'
    # concept_viterbi_v2 = 'weights/en-ro-viterbi-p_concept-v2.weight'
    # concept_activeset_v2 = 'weights/en-ro-activeset-p_concept-v2.weight'
    # BUG FIX: the original assigned concept_viterbi_v2 here, which is
    # commented out above and would raise NameError at runtime.
    concept_init_weight_file = concept_count_v2

    concept = Concept(get_pair_hashcode)
    concept.load_p_concept(concept_init_weight_file)
    emalgo = EmAlgo()
    active_set = ActiveSet(20)

    train_dataset = AlignDataset(f_train_filename, e_train_filename)
    test_dataset = AlignDataset(f_test_filename, e_test_filename)

    embedding_model = load_fasttext_embedding_model(f_fasttext_model_file,
                                                    e_fasttext_model_file)
    network = embedding.Align(embedding_model)
    sgd = SGD(network.parameters(), lr=0.0005)

    # baseline scores before EM initialization
    score_before_init = evaluate(emalgo.align_wedding, concept,
                                 test_dataset.src_sentences,
                                 test_dataset.trg_sentences,
                                 true_label_file, threshld=0.1, bpe=True,
                                 src_idx=src_bpe_idx_data,
                                 trg_idx=trg_bpe_idx_data)
    print('wedding concept score before init: ', score_before_init)
    score_before_init = evaluate(emalgo.align_wedding, embedding_model,
                                 test_dataset.src_sentences,
                                 test_dataset.trg_sentences,
                                 true_label_file, threshld=0.1, bpe=True,
                                 src_idx=src_bpe_idx_data,
                                 trg_idx=trg_bpe_idx_data)
    print('wedding embedding score before init : ', score_before_init)

    start_by_step_e(train_dataset, concept, emalgo.align_wedding,
                    embedding_model, sgd, 100)

    score_after_init = evaluate(active_set.align_sparsemap, embedding_model,
                                test_dataset.src_sentences,
                                test_dataset.trg_sentences,
                                true_label_file, threshld=0.2, bpe=True,
                                src_idx=src_bpe_idx_data,
                                trg_idx=trg_bpe_idx_data)
    print('sparsemap embedding after step e : ', score_after_init)

    # lower learning rate for the supervised training phase
    sgd = SGD(network.parameters(), lr=0.0001)
    train_network(train_dataset, network, sgd, epoch=2, batch=100)

    # sweep alignment thresholds 0.1 .. 1.0
    thresholds = [i * 0.1 for i in range(1, 11)]
    score_after_trainings = []
    for thr in thresholds:
        score_after_training = evaluate(active_set.align_sparsemap,
                                        embedding_model,
                                        test_dataset.src_sentences,
                                        test_dataset.trg_sentences,
                                        true_label_file, threshld=thr,
                                        bpe=True, src_idx=src_bpe_idx_data,
                                        trg_idx=trg_bpe_idx_data)
        score_after_trainings.append(score_after_training)
    print('score after training : ', score_after_trainings)
    # NOTE(review): np.min assumes lower-is-better (e.g. an error rate
    # such as AER) — confirm the metric's direction.
    print('best score after training : ', np.min(score_after_trainings))

    out_parameter_file = 'embedding_weight.txt'
    # BUG FIX: the original passed the literal string
    # 'out_parameter_file' as the file name instead of the variable.
    np.savetxt(out_parameter_file,
               embedding_model.weight.detach().numpy())
    print('embedding weight saved to : ', out_parameter_file)