Example #1
    def get_concepts(self):
        """
        Return all concepts (unpublished blogs).

        Returns:
            list: List of Concept objects.
        """
        if not self.has_blog:
            raise ValueError("User doesn't have blog!")

        self.login()

        # get the untagged part of the site where the links to the
        # concepts are stored
        data = self._get(self.blog_url)

        if '<div class="s_nadpis">Rozepsané zápisy</div>' not in data:
            return []

        data = data.split('<div class="s_nadpis">Rozepsané zápisy</div>')[1]

        dom = dhtmlparser.parseString(data)
        concept_list = dom.find("div", {"class": "s_sekce"})[0]

        # links to concepts are stored in <li>
        concepts = []
        for li in concept_list.find("li"):
            a = li.find("a")[0]

            concepts.append(
                Concept(title=a.getContent().strip(),
                        link=a.params["href"],
                        session=self.session))

        return concepts
Example #2
    def _tagging(self, sentence,
                 region_start, region_end,
                 premodifier: 'e.g. a premodifier, like 至少 of "至少22k"',
                 posttoken: 'e.g. a number with unit, like 22k of "至少22k"'):
        
        entity = sentence[region_start : region_end]
        entity = "".join(entity)

        # handled class: the derived class itself
        derived_class = self.__class__
        
        concept_values = {
            self.get_premodifier_label(): premodifier,
            self.get_posttoken_label(): posttoken
        }
        
        # creates a concept to wrap the above info
        concept = Concept(
            region_start, region_end, 
            entity, derived_class, concept_values)
        concept.sign(_Premodifier)
        self._on_create_concept(sentence, concept)

        sentence.add_concept(concept)
        self._on_add_concept(sentence, concept)
Example #3
File: store.py Project: szroland/nlu
    def integrate(self, concept: Concept) -> Concept:
        integrated = self.get_concept(concept)
        if integrated is None:
            ip = []  # type: list[Concept]
            for p in concept.parents:
                ip.append(self.integrate(p))

            integrated = Concept(concept.name, concept.relation, ip,
                                 concept.probability)
            integrated.store = self
            integrated.register_with_parents()
            self.add_concept(integrated)

            if integrated.relation == Relation.Word:
                word = integrated.name
                if word not in self._words:
                    w = Word(word)
                    w.add_meaning(integrated)
                    self._words[word] = w
            elif integrated.relation == Relation.Implication:
                integrated.propagate_probability_from_parent(
                    integrated.parents[0])

        else:
            integrated.merge_probability(concept.probability)

        return integrated
Example #4
    def _tagging_number_and_unit(self, sentence, region_start, region_end,
                                 number_concept: 'a number, like 22 of 22k',
                                 synonym: 'a unit, like k of 22k'):

        entity = sentence[region_start:region_end]
        entity = "".join(entity)

        # handled class
        derived_class = RealNumber  #self.__class__

        # 22k = 22 x 1000 = 22000
        number = number_concept.concept_values['value']
        value = number * self.get_unit_size()

        concept_values = {
            'number': number,
            'metric_prefix': synonym,
            'value': value
        }

        # creates a concept to wrap the above info
        concept = Concept(region_start, region_end, entity, derived_class,
                          concept_values)
        concept.sign(_MetricPrefixUnit)
        self._on_create_concept(sentence, concept)

        sentence.add_concept(concept)
        self._on_add_concept(sentence, concept)
Example #5
File: parser.py Project: szroland/nlu
def parse_compound_concept(expression: str, store: Store, label: str=None, probability: float=1.0):
    s = expression.find('(')
    e = expression.rfind(')')

    rel = Relation.find(expression[:s])

    args = []  # type: list[str]
    d = 0
    arg_start = s+1
    for x in range(s+1, e):
        c = expression[x]
        if c == '(':
            d += 1
        elif c == ')':
            d -= 1
        elif c == ',':
            if d == 0:
                args.append(expression[arg_start:x])
                arg_start = x+1

    args.append(expression[arg_start:e])
    arg_concepts = []  # type: list[Concept]
    arg_probability = probability
    if rel == Relation.Implication:
        arg_probability = .5

    for a in args:
        arg = parse(a.strip(), store, probability=arg_probability)
        arg_concepts.append(arg)

    if label is None and e < len(expression)-1:
        label = expression[e+1:].strip()

    return Concept(label, rel, arg_concepts, probability=probability)
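The heart of parse_compound_concept is the depth counter that splits a compound expression into its top-level, comma-separated arguments while ignoring commas nested inside parentheses. The following standalone sketch (illustration only, independent of the snippet's Store, Relation, and parse helpers) isolates just that splitting step:

def split_top_level_args(expression: str) -> list:
    # Collect the arguments between the outermost parentheses,
    # splitting on commas only when the nesting depth is zero.
    s = expression.find('(')
    e = expression.rfind(')')
    args, depth, arg_start = [], 0, s + 1
    for x in range(s + 1, e):
        c = expression[x]
        if c == '(':
            depth += 1
        elif c == ')':
            depth -= 1
        elif c == ',' and depth == 0:
            args.append(expression[arg_start:x].strip())
            arg_start = x + 1
    args.append(expression[arg_start:e].strip())
    return args

print(split_top_level_args('Implication(Class(dog, animal), Feature(dog, loyal))'))
# -> ['Class(dog, animal)', 'Feature(dog, loyal)']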
Example #6
 def _create_concept(self, region_start, region_end, entity, concept_type,
                     concept_values) -> Concept:
     '''
     Create a concept wrapping the given info.

     Returning None means the concept is rejected.
     '''
     return Concept(region_start, region_end, entity, concept_type,
                    concept_values)
Example #7
 def add_document(self, doc):
     """ Converts document to concept 
         @param doc: should have doc_id, rev_id, title and clean_text
     """
     _log.debug("addind document {0}".format(doc.title))
     cid = self._generate_concept_id(doc)
     word_list = self.stemmer.process_text(doc.clean_text)
     new_concept = Concept(cid, doc.title, word_list, doc.rev_id)
     self.concepts_list.append(new_concept)
     self.ids.add(doc.id)
Example #8
    def parse_sentence(self, root) -> Iterable:
        advcl = self.dep(root, 'advcl')
        mark = self.dep(advcl, 'mark')
        if advcl and mark and mark.lemma_ == 'if':
            condition = next(self.parse_simple(advcl))
            for action in self.parse_simple(root):
                yield Concept(None, Relation.Implication, [condition, action])

        else:
            for c in self.parse_simple(root):
                yield c
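parse_sentence relies on a dep helper that is not shown here. Since the tokens carry spaCy-style attributes (lemma_, pos_, tag_), a plausible minimal sketch of such a helper, assuming spaCy and an installed English model such as en_core_web_sm (both assumptions, not taken from the snippet), is:

import spacy

def dep(token, label):
    # Return the first syntactic child of `token` whose dependency
    # label equals `label`, or None if there is no such child.
    return next((child for child in token.children if child.dep_ == label), None)

nlp = spacy.load('en_core_web_sm')
doc = nlp('If it rains, we stay home.')
root = next(tok for tok in doc if tok.dep_ == 'ROOT')
advcl = dep(root, 'advcl')                    # clause headed by 'rains'
mark = dep(advcl, 'mark') if advcl is not None else None
print(root.text, advcl, mark)                 # typically: stay rains If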
Example #9
    def test_uses_concepts_in_parent(self):
        parent = Store()
        child = Store(parent)

        raw = Concept(
            None, Relation.Class,
            [Concept.word('c1'), Concept.word('c2')])
        in_parent = parent.integrate(raw)
        in_child = child.integrate(raw)

        self.assertEqual(id(in_parent), id(in_child))
Example #10
    def create_virtual_concept(self):
        '''
        Creates a virtual concept for some adjectives.
        '''

        entity = self.get_synonym_list()[0]
        return Concept(start=-1,
                       end=-1,
                       entity=entity,
                       concept_type=self.__class__,
                       concept_values={'unit': entity})
Example #11
    def _tag(self, sentence, index=0):
        region_start = None
        region_end = None
        at = length = 0

        for i in range(index, sentence.length()):

            # finds the metric-prefix unit
            synonym = self.match_prefix_synonym_at(sentence, i)
            # all synonyms are concrete concepts, not abstract ones

            if synonym is None:
                continue
            # end-of-if

            # tags the unit itself
            self._tagging_unit(sentence,
                               region_start=i,
                               region_end=i + len(synonym),
                               synonym=synonym)

            # parses the number part, if present
            at = i - 1
            at = self._skip_whitespaces_reversely(sentence, at)
            if at < 0:
                # a metric-prefix unit without the number part
                # such as '萬一', '千二', '百九'
                # the default number is 1

                number_concept = Concept(
                    start=i,
                    end=i,  # a dummy number
                    entity='1',
                    concept_type=IntegerNumber,
                    concept_values={'value': 1})
            else:
                # a metric-prefix unit with the number part
                # such as '一萬一', '一千二', '一百九'

                number_concept = sentence.get_suffix_dominated_concept(
                    at, IntegerNumber, RealNumber)
                if number_concept is None:
                    continue
                # end-of-if
            # end-of-if

            # tags the [number unit]
            self._tagging_number_and_unit(sentence,
                                          region_start=number_concept.start,
                                          region_end=i + len(synonym),
                                          number_concept=number_concept,
                                          synonym=synonym)
Example #12
    def subj(self, sentence) -> Iterable[Concept]:
        s = self.dep(sentence, 'nsubj')
        while s is not None:
            p = self.dep(s, 'poss')
            if p is None:
                yield Concept.word(self.name(s))
            else:
                yield Concept(
                    None, Relation.Part,
                    [Concept.word(self.name(p)),
                     Concept.word(self.name(s))])

            s = self.dep(s, 'conj')
Example #13
 def __create_lattice__(self):
     if self.dataframe is None:
         print(
             "Load the dataframe first\n__add_dataframe__(<type pandas.DataFrame>)"
         )
         return None
     # will contain a list of concepts (extent, intent)
     concepts = list()
     extents = self.__get_extent_list__()
     for ex in extents:
         # print(ex, end=" --> ")
         # print(self.__get_intent__(ex))
         concepts.append(Concept(set(ex), self.__get_intent__(ex)))
     return concepts
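__get_intent__ is not shown in this snippet. In formal concept analysis the intent of an extent is the set of attributes shared by every object in that extent; a minimal sketch of that computation over a boolean pandas DataFrame (hypothetical names and toy data, not the project's code) looks like:

import pandas as pd

toy = pd.DataFrame(
    {'four_legs': [1, 1, 0], 'flies': [0, 0, 1]},
    index=['dog', 'cat', 'sparrow'])

def get_intent(extent, dataframe):
    # Intent of an extent = attributes that every object in the extent has.
    if not extent:
        return set(dataframe.columns)  # FCA convention: empty extent -> all attributes
    rows = dataframe.loc[list(extent)]
    return set(dataframe.columns[(rows == 1).all(axis=0)])

print(get_intent({'dog', 'cat'}, toy))   # {'four_legs'}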
Example #14
    def __init__(self, sentence):
        '''
        Sample:
            sentence: "我要17.3吋筆電"
            sequence: '17.3吋'
             - start: 2
             - end: 7 (excluded)
             - entity: '17.3吋'
             - concept_type: NumberInchUnit
             - concept_values: {'value': '17.3', 'unit': '吋'}
        '''
        super(Sentence, self).__init__(sentence)

        # holds length, sentence
        self.__length = len(sentence)
        self.__sentence = sentence

        # initializes the prefix/suffix concept lists
        self.__prefix_concept_list = [None] * self.__length
        self.__suffix_concept_list = [None] * self.__length

        for idx in range(self.__length):
            start = idx
            end = idx + 1  # excluded (i.e. not included)
            entity = self[idx]  # i.e. the char itself
            concept_type = None  # no concept

            self.__prefix_concept_list[idx] \
                = [Concept(start, end, entity, concept_type)]

            # note: index = end - 1
            self.__suffix_concept_list[idx] \
                = [Concept(start, end, entity, concept_type)]
        # end-of-for

        # used to save taggers which have already been executed
        self.__tagger_set = set()
Example #15
    def _tag(self, sentence, index=0):
        region_start = index
        region_end = None
        ch = None

        for i in range(index, sentence.length()):
            ch = sentence[i]

            # normalize full-width digits to ASCII
            # '０' (U+FF10, 65296) -> '0'
            # ...
            # '９' (U+FF19, 65305) -> '9'
            if '０' <= ch and ch <= '９':
                ch = chr(ord(ch) - ord('０') + ord('0'))

                # update back to the source
                sentence[i] = ch
            # end-of-if

            if '0' <= ch and ch <= '9':
                continue
            else:
                region_end = i
                break
            # end-of-if
        # end-of-for

        if region_end is None:
            region_end = sentence.length()
        # end-of-if

        if region_end > region_start:
            entity = sentence[region_start:region_end]
            entity = "".join(entity)
            concept_values = {'value': int(entity)}
            concept = Concept(region_start, region_end, entity, self.__CLASS,
                              concept_values)
            sentence.add_concept(concept)
        # end-of-if

        # haven't reached the end of the sentence; continue tagging
        if region_end < sentence.length():
            if region_start == region_end:
                # no entity was found
                self._tag(sentence, region_end + 1)
            else:
                # found the entity
                self._tag(sentence, region_end)
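The tagger above first maps full-width digits ０..９ (U+FF10..U+FF19) to their ASCII counterparts, then checks whether the normalized character extends the current number region. A standalone illustration of that per-character mapping (illustration only, using the same arithmetic as the snippet):

def normalize_digit(ch: str) -> str:
    # Shift a full-width digit into the ASCII '0'..'9' range;
    # leave every other character unchanged.
    if '０' <= ch <= '９':
        return chr(ord(ch) - ord('０') + ord('0'))
    return ch

print(normalize_digit('７'))   # -> '7'
print(normalize_digit('7'))    # unchanged -> '7'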
Example #16
    def _tagging1(self, sentence, region_start, region_end, synonym):

        entity = sentence[region_start:region_end]
        entity = "".join(entity)

        derived_class = self.__class__

        concept_values = {'operator': self.get_formal_operator()}

        # creates a concept to wrap the above info
        concept = Concept(region_start, region_end, entity, derived_class,
                          concept_values)
        self._on_create_concept(sentence, concept)

        sentence.add_concept(concept)
        self._on_add_concept(sentence, concept)
Example #17
    def _tag(self, sentence, index=0):
        region_start = None
        region_end = None
        ch = None

        for i in range(index, sentence.length()):
            concept_list = sentence.get_prefix_concept_list(i, IntegerNumber)
            if len(concept_list) == 0:
                continue
            # end-of-if

            for integer_concept in concept_list:
                at = integer_concept.end
                if at >= sentence.length():
                    break
                # end-of-if

                ch = sentence[at]
                if ch != '.':
                    continue
                else:
                    at += 1
                # end-of-if

                # found a decimal point; look for the fractional part
                decimal_concept_list = sentence.get_prefix_concept_list(
                    at, IntegerNumber)
                if len(decimal_concept_list) == 0:
                    continue
                else:
                    decimal_concept = decimal_concept_list[0]
                # end-of-if

                region_start = integer_concept.start
                region_end = decimal_concept.end

                entity = sentence[region_start:region_end]
                entity = "".join(entity)
                concept_values = {'value': float(entity)}

                sentence.remove_concept(integer_concept)
                sentence.remove_concept(decimal_concept)

                concept = Concept(region_start, region_end, entity,
                                  self.__CLASS, concept_values)
                sentence.add_concept(concept)
Example #18
File: nlu.py Project: szroland/nlu
    def create_answer(question: Concept, mapping: Mapping[Concept,
                                                          Concept]) -> Concept:
        if mapping is None:
            return None

        # simple
        if question.is_simple():
            if question in mapping:
                return mapping[question]
            return question

        # compound
        ap = []  # type: list[Concept]
        for p in question.parents:
            ap.append(NLU.create_answer(p, mapping))

        return Concept(question.name, question.relation, ap)
Example #19
    def _tagging_unit(self, sentence, region_start, region_end,
                      synonym: 'just the unit itself'):

        entity = sentence[region_start:region_end]
        entity = "".join(entity)

        # handled class
        derived_class = self.__class__

        concept_values = {'unit': synonym, 'value': self.get_unit_size()}

        # creates a concept to wrap the above info
        concept = Concept(region_start, region_end, entity, derived_class,
                          concept_values)
        concept.sign(_MetricPrefixUnit)
        #self._on_create_concept(sentence, concept)

        sentence.add_concept(concept)
Example #20
 def _materialize_node(self, concept_name):
     """
     Materialize a node and return the witness constant.
     """
     concept = self.domain.concepts[concept_name]
     assert concept.arity == 1
     sort = concept.variables[0].sort
     assert sort != TopSort()
     witnesses = self._get_witnesses(concept_name)
     if len(witnesses) > 0:
         c = witnesses[0]
     else:
         c = Const(self._fresh_const_name(), sort)
         # TODO: maybe we shouldn't split here, and create the concepts explicitly
         X = Var('X', c.sort)
         name = '={}'.format(c.name)
         self.domain.concepts[name] = Concept(name, [X], Eq(X, c))
         self.domain.split(concept_name, name)
     self.suppose(concept(c))
     return c
Example #21
    def get_projections(self, node):
        """
        Return a list of (name, binary_concept) with all possible
        projections at node
        """
        witnesses = self._get_witnesses(node)
        if len(witnesses) == 0:
            return []

        w = witnesses[0]
        result = []
        n_concept = self.domain.concepts[node]
        for t_name in self.domain.concepts_by_arity(3):
            t_concept = self.domain.concepts[t_name]
            for v in t_concept.variables:
                if v.sort == w.sort:
                    variables = [x for x in t_concept.variables if x is not v]
                    formula = substitute(t_concept.formula, {v: w})
                    name = str(formula)
                    concept = Concept(name, variables, formula)
                    result.append((name, concept))
        return result
Example #22
    def _tagging2(self, sentence, num1_concept, num2_concept):

        region_start = num1_concept.start
        region_end = num2_concept.end
        entity = sentence[region_start:region_end]
        entity = "".join(entity)

        derived_class = RealNumber

        num1_value = num1_concept.concept_values['value']
        num2_value = num2_concept.concept_values['value']
        value = self.evaluate(num1_value, num2_value)

        concept_values = {'value': value}

        # creates a concept to wrap the above info
        concept = Concept(region_start, region_end, entity, derived_class,
                          concept_values)
        self._on_create_concept(sentence, concept)

        sentence.add_concept(concept)
        self._on_add_concept(sentence, concept)
Example #23
File: generator.py Project: szroland/nlu
def clone_concept_with_replacing_parent(
        concept: Concept, mapping: Mapping[Concept, Concept],
        old_parent: Concept, new_parent: Concept) -> (Concept, bool):
    if concept in mapping:
        return mapping[concept], True

    np = []  # type: list[Concept]
    replaced = False
    for p in concept.parents:
        if p == old_parent:
            np.append(new_parent)
            replaced = True
        else:
            clone, flag = clone_concept_with_replacing_parent(
                p, mapping, old_parent, new_parent)
            np.append(clone)
            if flag:
                replaced = True

    result = Concept(concept.name, concept.relation, np, concept.probability)
    if replaced:
        mapping[concept] = result
    return result, replaced
Example #24
    def from_wikihow(cls, article):
        """
        Extracts Concept information from a WikiHowArticle.
        """

        things = {}
        actions = {}
        descriptors = {}

        for i, step in enumerate(article.steps):
            parse = parsetree(step.main, relations=True)[0]

            new_things = set(
                Concept(w.string, Concept.THING) for w in parse
                if w.pos.startswith('NN'))
            new_actions = set(
                Concept(w.string, Concept.ACTION) for w in parse
                if w.pos.startswith('VB'))
            new_thing_descriptors = set(
                Concept(w.string, Concept.DESCRIPTOR) for w in parse
                if w.pos.startswith('JJ'))
            new_action_descriptors = set(
                Concept(w.string, Concept.DESCRIPTOR) for w in parse
                if w.pos.startswith('RB'))

            if len(step.extra) > 0:
                for parse in parsetree(step.extra):
                    more_thing_descriptors = set(
                        Concept(w.string, Concept.DESCRIPTOR) for w in parse
                        if w.pos.startswith('JJ'))
                    new_thing_descriptors.update(more_thing_descriptors)

                    more_action_descriptors = set(
                        Concept(w.string, Concept.DESCRIPTOR) for w in parse
                        if w.pos.startswith('RB'))
                    new_action_descriptors.update(more_action_descriptors)

            for thing in new_things:
                for other_stuff in set.union(new_actions,
                                             new_thing_descriptors):
                    if other_stuff.lemma != thing.lemma:
                        thing.add_relation(other_stuff)

                if thing.lemma not in things:
                    things[thing.lemma] = thing
                else:
                    things[thing.lemma].merge_relations(thing)

            for action in new_actions:
                for other_stuff in set.union(new_things,
                                             new_action_descriptors):
                    if other_stuff.lemma != action.lemma:
                        action.add_relation(other_stuff)

                if action.lemma not in actions:
                    actions[action.lemma] = action
                else:
                    actions[action.lemma].merge_relations(action)

            for descriptor in new_thing_descriptors:
                for thing in new_things:
                    descriptor.add_relation(thing)

                if descriptor.lemma not in descriptors:
                    descriptors[descriptor.lemma] = descriptor
                else:
                    descriptors[descriptor.lemma].merge_relations(descriptor)

            for descriptor in new_action_descriptors:
                for action in new_actions:
                    descriptor.add_relation(action)

                if descriptor.lemma not in descriptors:
                    descriptors[descriptor.lemma] = descriptor
                else:
                    descriptors[descriptor.lemma].merge_relations(descriptor)

        concepts = ConceptSet(things, actions, descriptors)

        return cls(concepts, article)
Example #25
    def __init__(self, config_filename):
        course_config = CourseConfig(config_filename)

        self.concepts = {}
        with open(course_config.concepts_filename, 'r') as concepts_file:
            for line in concepts_file:
                ccp_info = line.split('\n')[0].split(';')
                abbreviation = ccp_info[0]
                concept_name = ccp_info[1]
                self.concepts[abbreviation] = Concept(concept_name,
                                                      abbreviation)

        self.learning_materials = {}

        # TODO(andre:2018-05-19): Move the LOM file-reading procedure
        # into the LearningMaterial class
        for root, dirs, files in os.walk(course_config.learning_materials_lom):
            for lom_file in files:
                if lom_file.endswith('.xml'):
                    tree = xml.parse(os.path.join(root, lom_file))

                    xml_root = tree.getroot()

                    pref = xml_root.tag.split('}')[0] + '}'

                    material_id = int(
                        xml_root.find('./' + pref + 'general/' + pref +
                                      'identifier/' + pref + 'entry').text)
                    material_name = xml_root.find('./' + pref + 'general/' +
                                                  pref + 'title/' + pref +
                                                  'string').text
                    material_type = xml_root.find('./' + pref + 'technical/' +
                                                  pref + 'format').text
                    typical_learning_time = xml_root.find(
                        './' + pref + 'educational/' + pref +
                        'typicalLearningTime/' + pref + 'duration').text
                    difficulty = xml_root.find('./' + pref + 'educational/' +
                                               pref + 'difficulty/' + pref +
                                               'value').text
                    interactivity_level = xml_root.find('./' + pref +
                                                        'educational/' + pref +
                                                        'interactivityLevel/' +
                                                        pref + 'value').text
                    interactivity_type = xml_root.find('./' + pref +
                                                       'educational/' + pref +
                                                       'interactivityType/' +
                                                       pref + 'value').text
                    learning_resource_type = []

                    for i in xml_root.findall('./' + pref + 'educational/' +
                                              pref + 'learningResourceType/' +
                                              pref + 'value'):
                        learning_resource_type.append(i.text)

                    learning_material = LearningMaterial(
                        material_id, material_name, material_type,
                        typical_learning_time, difficulty,
                        learning_resource_type, interactivity_level,
                        interactivity_type)
                    self.learning_materials[material_id] = learning_material

        with open(course_config.learning_materials_filename,
                  'r') as learning_materials_file:
            for line in learning_materials_file:
                ccp_info = line.split('\n')[0].split(';')
                learning_material_id = int(ccp_info[0])
                learning_material = self.learning_materials[
                    learning_material_id]
                for i in range(2, len(ccp_info)):
                    concept_abbreviation = ccp_info[i]
                    concept_material = self.concepts[concept_abbreviation]

                    if learning_material.covered_concepts is None:
                        # learning_material.covered_concepts = []
                        learning_material.covered_concepts = {}
                    # learning_material.covered_concepts.append(concept_material)
                    learning_material.covered_concepts[
                        concept_abbreviation] = concept_material

                    if concept_material.learning_materials is None:
                        # concept_material.learning_materials = []
                        concept_material.learning_materials = {}
                    # concept_material.learning_materials.append(learning_material)
                    concept_material.learning_materials[
                        learning_material_id] = learning_material

        self.learners = {}
        with open(course_config.learners_filename, 'r') as learners_file:
            for line in learners_file:
                ccp_info = line.split('\n')[0].split(';')
                if len(ccp_info) > 7:
                    # learning_goals = []
                    learning_goals = {}
                    for i in range(7, len(ccp_info)):
                        learner_learning_goal = ccp_info[i]
                        # learning_goals.append(self.concepts[learner_learning_goal])
                        learning_goals[learner_learning_goal] = (
                            self.concepts[learner_learning_goal])

                    registration_code = ccp_info[0]
                    learner_lower_time = float(ccp_info[1])
                    learner_upper_time = float(ccp_info[2])
                    active_reflexive = int(ccp_info[3])
                    sensory_intuitive = int(ccp_info[4])
                    visual_verbal = int(ccp_info[5])
                    sequential_global = int(ccp_info[6])

                    learner = Learner(registration_code, learner_lower_time,
                                      learner_upper_time, active_reflexive,
                                      sensory_intuitive, visual_verbal,
                                      sequential_global, learning_goals)
                    self.learners[registration_code] = learner

        with open(course_config.learners_score_filename,
                  'r') as learners_score_file:
            concept = None
            for line in learners_score_file:
                ccp_info = line.split('\n')[0].split(';')
                learner_registration_code = ccp_info[0]
                concept_abbreviation = ccp_info[1]
                concept_score = float(ccp_info[2])
                learner = self.learners[learner_registration_code]
                concept = self.concepts[concept_abbreviation]

                if learner.score is None:
                    learner.score = {}
                learner.score[concept.abbreviation] = concept_score
Example #26
    def parse_simple(self, root) -> Iterable:
        if root.lemma_ == 'be':
            attr = self.dep(root, 'attr')
            if attr and (attr.pos_ == 'NOUN' or attr.pos_ == 'PROPN'):
                rel = Relation.Class
                prep = self.dep(root, 'prep')
                if prep is not None and prep.lemma_ == 'like':
                    rel = Relation.Feature
                attr_concept = Concept.word(self.name(attr))
                poss = self.dep(attr, "poss")
                if poss is not None:
                    attr_concept = Concept(
                        None, Relation.Part,
                        [Concept.word(self.name(poss)), attr_concept])
                    rel = Relation.Identical
                for subj in self.subj(root):  # type: Concept
                    if subj.relation == Relation.Part:
                        yield Concept(None, Relation.Identical,
                                      [attr_concept, subj])
                    else:
                        yield Concept(None, rel, [subj, attr_concept])
            acomp = self.dep(root, 'acomp')
            if acomp and acomp.pos_ == 'ADJ':
                for subj in self.subj(root):
                    yield Concept(None, Relation.Feature,
                                  [subj, Concept.word(self.name(acomp))])
        elif root.pos_ == 'VERB':
            for subj in self.subj(root):
                rel = Relation.Feature
                aux = self.dep(root, 'aux')
                mark = self.dep(root, 'mark')
                if aux is not None and aux.lemma_ == 'be':  # continuous
                    rel = Relation.Action
                if aux is not None and aux.lemma_ == 'do' and aux.tag_ == 'VBD':  # past tense with aux do
                    rel = Relation.Action
                if root.tag_ == 'VBD':  # past tense
                    rel = Relation.Action
                if mark is not None and mark.lemma_ == 'if':  # conditional
                    rel = Relation.Action

                concept = Concept(None, rel,
                                  [subj, Concept.word(self.name(root))])

                features = []
                npadvmod = self.dep(root, 'npadvmod')
                if npadvmod is not None:
                    features.append(
                        Concept(None, Relation.Time,
                                [Concept.word(self.name(npadvmod))]))

                advmod = self.dep(root, 'advmod')
                if advmod is not None:
                    if advmod.lemma_ == 'where':
                        features.append(
                            Concept(None, Relation.Relative,
                                    [Concept.word('?'),
                                     Concept.word('?')]))
                    if advmod.lemma_ == 'when':
                        features.append(
                            Concept(None, Relation.Time, [Concept.word('?')]))

                for prep in self.deps(root, 'prep'):
                    while prep is not None:
                        obj = self.dep(prep, 'pobj')
                        obj_name = '?'
                        if obj is not None:
                            obj_name = self.name(obj)

                        features.append(
                            Concept(None, Relation.Relative, [
                                Concept.word(self.name(prep)),
                                Concept.word(obj_name)
                            ]))
                        prep = self.dep(prep, 'prep')

                if len(features) > 0:
                    concept = Concept(None, Relation.Feature,
                                      [concept] + features)

                yield concept
Example #27
    UnaryRelation = FunctionSort(S, Boolean)
    BinaryRelation = FunctionSort(S, S, Boolean)

    X, Y, Z = (Var(n, S) for n in ['X', 'Y', 'Z'])
    U = Var('U', UnaryRelation)
    U1 = Var('U1', UnaryRelation)
    U2 = Var('U2', UnaryRelation)
    B = Var('B', BinaryRelation)
    B1 = Var('B1', BinaryRelation)
    B2 = Var('B2', BinaryRelation)

    nstar = Const('nstar', BinaryRelation)
    x = Const('x', S)
    y = Const('y', S)

    c11 = Concept('xy', [X], And(Eq(x, X), Eq(y, X)))
    c10 = Concept('x', [X], And(Eq(x, X), Not(Eq(y, X))))
    c01 = Concept('y', [X], And(Not(Eq(x, X)), Eq(y, X)))
    c00 = Concept('other', [X], And(Not(Eq(x, X)), Not(Eq(y, X))))

    cnstar = Concept('nstar', [X, Y], nstar(X, Y))
    cnplus = Concept('nplus', [X, Y], And(nstar(X, Y), Not(Eq(X, Y))))

    notexists = ConceptCombiner([U], Not(Exists([X], U(X))))
    exists = ConceptCombiner([U], Exists([X], U(X)))
    singleton = ConceptCombiner([U],
                                ForAll([X, Y],
                                       Implies(And(U(X), U(Y)), Eq(X, Y))))
    all_to_all = ConceptCombiner([U1, U2, B],
                                 ForAll([X, Y],
                                        Implies(And(U1(X), U2(Y)), B(X, Y))))
Example #28
    def concepts_to_vars(self):
        concept_num = 0
        for sent_index, sent_tuple_list in enumerate(self.tuples):
            # print sent_tuple_list
            # saves the concepts inside of each tuple
            current_tuple_concept_list = []
            # add concept object in concepts list
            for _tuple in sent_tuple_list:
                for key in _tuple.keys():
                    if key in ['text', 'confidence']:
                        # print _tuple[key]
                        continue
                    for pair in _tuple[key]:
                        start_index = pair[0]
                        end_index = pair[1]

                        # get var set from the given range of words
                        var_set = self.get_var_set(start_index=start_index,
                                                   end_index=end_index)
                        # print 'var_set', var_set

                        # connect unconnected components of the concept
                        relevant_text = self.text.split()[pair[0]:pair[1]]
                        # print relevant_text

                        var_set, _ = self.graph.connect_unconnected_components(
                            nodes=var_set)
                        var_set = set(var_set)
                        # print 'var_set ', var_set
                        # create the concept
                        self.concept_list.append(
                            Concept(name='concept' + str(concept_num),
                                    var_set=var_set,
                                    sent_index=sent_index,
                                    lable=key))
                        current_tuple_concept_list.append(concept_num)
                        # add in sent_to_concept_indices list
                        try:
                            self.sent_to_concept_indices[sent_index].append(
                                concept_num)
                        except KeyError:
                            self.sent_to_concept_indices[sent_index] = [
                                concept_num
                            ]
                        concept_num += 1
                # updating the full partners list
                self.partners.append(current_tuple_concept_list)
                self.concept_names.extend(current_tuple_concept_list)
                # updating sent_to_partner_indices list
                try:
                    self.sent_to_partner_indices[sent_index].append(
                        current_tuple_concept_list)
                except KeyError:
                    self.sent_to_partner_indices[sent_index] = [
                        current_tuple_concept_list
                    ]

                # set partners for each concept in the concept list
                for concept in self.concept_list:
                    if concept.name in current_tuple_concept_list:
                        concept.add_partners(current_tuple_concept_list)
Example #29
    def _tag(self, sentence, index=0):
        region_start = index
        region_end = None
        ch = None
        digit_buffer = []

        for i in range(index, sentence.length()):
            ch = sentence[i]

            # cht: 零、壹、貳、參、肆、伍、陸、柒、捌、玖
            # chs: 零,壹,贰,参,肆,伍,陆,柒,捌,玖
            if ch in '零0０OＯ':
                # digit: 0 (half-width), ０ (full-width)
                # alpha: O (half-width), Ｏ (full-width)
                digit_buffer.append('0')
                continue
            elif ch in '一壹ㄧ':
                # 一: 19968, 0x4e00
                # ㄧ: 12583, 0x3127 (注音符號 ㄧ ㄨ ㄩ)
                digit_buffer.append('1')
                continue
            elif ch in '二貳贰':
                digit_buffer.append('2')
                continue
            elif ch in '三參叁参':
                digit_buffer.append('3')
                continue
            elif ch in '四肆':
                digit_buffer.append('4')
                continue
            elif ch in '五伍':
                digit_buffer.append('5')
                continue
            elif ch in '六陸陆':
                digit_buffer.append('6')
                continue
            elif ch in '七柒':
                digit_buffer.append('7')
                continue
            elif ch in '八捌':
                digit_buffer.append('8')
                continue
            elif ch in '九玖':
                digit_buffer.append('9')
                continue
            else:
                region_end = i
                break
            # end-of-if
        # end-of-for

        if region_end is None:
            region_end = sentence.length()
        # end-of-if

        if region_end > region_start:
            entity = sentence[region_start:region_end]
            entity = "".join(entity)
            concept_values = {
                'tagger': self.__class__,
                'value': int(''.join(digit_buffer)),
            }

            concept = Concept(region_start, region_end, entity, self.__CLASS,
                              concept_values)
            sentence.add_concept(concept)
        # end-of-if

        # haven't reached the end of the sentence; continue tagging
        if region_end < sentence.length():
            if region_start == region_end:
                # no entity was found
                self._tag(sentence, region_end + 1)
            else:
                # found the entity
                self._tag(sentence, region_end)
Example #30
def main():

    # f_fasttext_vec_file = 'wiki.en.align.vec'
    # e_fasttext_vec_file = 'wiki.ro.align.vec'
    f_fasttext_model_file = 'en-ro-bpe-16K-en.bin'
    e_fasttext_model_file = 'en-ro-bpe-16K-ro.bin'

    f_fasttext_model_file = 'en_embedding_fasttext_model.bin'
    e_fasttext_model_file = 'ro_embedding_fasttext_model.bin'

    concept_count_v2 = 'weights/en-ro-bpe-count-p_concept-v2.weight'
    # concept_viterbi_v2 = 'weights/en-ro-viterbi-p_concept-v2.weight'
    # concept_activeset_v2 = 'weights/en-ro-activeset-p_concept-v2.weight'
    concept_init_weight_file = concept_count_v2
    concept = Concept(get_pair_hashcode)
    concept.load_p_concept(concept_init_weight_file)

    emalgo = EmAlgo()
    active_set = ActiveSet(20)


    train_dataset = AlignDataset(f_train_filename, e_train_filename)
    test_dataset = AlignDataset(f_test_filename, e_test_filename)


    # f_pretrained_model = load_vectors(f_fasttext_vec_file)
    # e_pretrained_model = load_vectors(e_fasttext_vec_file)
    # embedding_model = FastTextPretrainedEmbedding(f_pretrained_model, e_pretrained_model)

    # embedding_model = load_fasttext_embedding_model(f_fasttext_model_file, e_fasttext_model_file)
    # embedding_model = BertEmbedding()
    embedding_model = load_fasttext_embedding_model(f_fasttext_model_file, e_fasttext_model_file)

    #embedding_model = FastTextEmbedding(f_model, e_model)
    network = embedding.Align(embedding_model)
    sgd = SGD(network.parameters(), lr=0.0005)

    score_before_init = evaluate(emalgo.align_wedding, 
                                 concept, 
                                 test_dataset.src_sentences, 
                                 test_dataset.trg_sentences, 
                                 true_label_file, 
                                 threshld=0.1,
                                 bpe=True,
                                 src_idx=src_bpe_idx_data,
                                 trg_idx=trg_bpe_idx_data)
    print('wedding concept score before init: ', score_before_init)

    score_before_init = evaluate(emalgo.align_wedding, 
                                 embedding_model,
                                 test_dataset.src_sentences, 
                                 test_dataset.trg_sentences, 
                                 true_label_file, 
                                 threshld=0.1,
                                 bpe=True,
                                 src_idx=src_bpe_idx_data,
                                 trg_idx=trg_bpe_idx_data)
    print('wedding embedding score before init : ', score_before_init)

    start_by_step_e(train_dataset, concept, emalgo.align_wedding, embedding_model, sgd, 100)
    
    score_after_init = evaluate(active_set.align_sparsemap, 
                                embedding_model,
                                test_dataset.src_sentences, 
                                test_dataset.trg_sentences, 
                                true_label_file, 
                                threshld=0.2,
                                bpe=True,
                                src_idx=src_bpe_idx_data,
                                trg_idx=trg_bpe_idx_data)
    print('sparsemap embedding after step e : ', score_after_init)

    sgd = SGD(network.parameters(), lr=0.0001)
    train_network(train_dataset, network, sgd, epoch=2, batch=100)

    thresholds = [i*0.1 for i in range(1,11)]
    score_after_trainings = []
    for thr in thresholds:

        score_after_training = evaluate(active_set.align_sparsemap, 
                                    embedding_model,
                                    test_dataset.src_sentences, 
                                    test_dataset.trg_sentences, 
                                    true_label_file, 
                                    threshld=thr,
                                    bpe=True,
                                    src_idx=src_bpe_idx_data,
                                    trg_idx=trg_bpe_idx_data)
        score_after_trainings.append(score_after_training)

    print('score after training : ', score_after_trainings)
    print('best score after training : ', np.min(score_after_trainings))

    
    out_parameter_file = 'embedding_weight.txt'
    np.savetxt(out_parameter_file, embedding_model.weight.detach().numpy())
    print('embedding weight saved to : ', out_parameter_file)