Example #1
def write_constraints_test():
    flags.InitFlags()

    # read in vocab
    vocab = getVocab(flags.vocab)
    print "Read vocabulary"

    # read in constraints
    ml_cons, cl_cons = readConstraints(flags.constraints)

    # Merge constraints
    if flags.merge_constraints:
        ml_updated, cl_updated = mergeAllConstraints(ml_cons, cl_cons)
    else:
        # Without merging, wrap each raw constraint as [label, word_set]
        ml_updated = [["ML_", set(x)] for x in ml_cons]
        cl_updated = [["CL_", set(x)] for x in cl_cons]

    # Constraints counts
    constraints_count = getConstraintCount(ml_updated, cl_updated)

    # get constraint vocab
    constraint_words = constraints_count.keys()

    # Check constraints
    check = [x for x in constraint_words if x not in vocab]
    assert not check, "Constraints were not in vocab: %s" % ", ".join(check)

    # Remaining word list
    remained_words = [x for x in vocab if x not in constraint_words]
Example #2
def create_vocab(vocab_filename):
    # The head of this function did not survive; the signature, the
    # protobuf setup for c/voc, and the loop opening below are
    # reconstructed from the body and should be read as assumptions.
    c = Corpus()           # assumed corpus protobuf container
    voc = c.vocab.add()    # assumed repeated vocab-section field
    lookup = {}
    line_num = 0
    for ii in open(vocab_filename):
        term = ii.split("\t")[1].strip()

        word = voc.terms.add()
        word.id = line_num
        word.original = term
        word.ascii = term.encode("ascii", "replace")
        word.frequency = 0
        word.stop_word = False
        lookup[line_num] = term

        line_num += 1
    return c, lookup


if __name__ == "__main__":
    flags.InitFlags()

    doc_num = 0
    index_num = -1
    record_num = 0

    # Create the doc filter if we need it
    doc_filter = {}
    if flags.doc_filter:
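        # Assumed doc_filter format: tab-separated lines with an integer
        # document id first and a label last, after a "Doc#" header row.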
        for ii in open(flags.doc_filter):
            if ii.startswith("Doc#"):  # Ignore header
                continue
            fields = ii.split("\t")
            doc_filter[int(fields[0])] = fields[-1].strip()

    o_state = open("%s.topic_assignments" % flags.state_output, 'w')
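
A hedged usage sketch for create_vocab above: the path is hypothetical, and
the vocab file is assumed to be tab-separated with the term in its second
column, which is what ii.split("\t")[1] expects.

c, lookup = create_vocab("vocab.txt")  # hypothetical file name
print "Read %i terms" % len(lookup)    # lookup maps line number -> term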
Example #3
def write_constraints_old():
    flags.InitFlags()

    # read in vocab
    vocab = getVocab(flags.vocab)
    print "Read vocabulary"

    # read in constraints
    ml_cons, cl_cons = readConstraints(flags.constraints)

    # Merge constraints
    if flags.merge_constraints:
        ml_updated, cl_updated = mergeAllConstraints(ml_cons, cl_cons)
    else:
        # Without merging, wrap each raw constraint as [label, word_set]
        ml_updated = [["ML_", set(x)] for x in ml_cons]
        cl_updated = [["CL_", set(x)] for x in cl_cons]

    # Constraints counts
    constraints_count = getConstraintCount(ml_updated, cl_updated)

    # get constraint vocab
    constraint_words = constraints_count.keys()

    # Check constraints
    check = [x for x in constraint_words if x not in vocab]
    assert not check, "Constraints were not in vocab: %s" % ", ".join(check)

    # Remaining word list
    remained_words = [x for x in vocab if x not in constraint_words]

    #########################################################
    print flags.wnname
    wnname = flags.wnname
    o = OntologyWriter(wnname)

    if len(remained_words) > 0:
        # +1 is for one synset for the unconstrained words
        num_children = len(ml_updated) + len(cl_updated) + 1
    else:
        num_children = len(ml_updated) + len(cl_updated)

    o.AddSynset(0, "ROOT", xrange(1, num_children + 1), [])
    allocated_index = num_children
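    # Index scheme: ROOT is synset 0, its direct children occupy
    # 1..num_children, and deeper nodes are handed out from
    # num_children + 1 onward via allocated_index.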

    # Add ML constraints
    rcIndex = 0
    ml_count = 0
    for cons in ml_updated:
        rcIndex += 1
        ml_count += 1
        name = cons[0] + "%i_%s" % (ml_count, ":".join(cons[1])[:20])

        wordset = []
        for word in cons[1]:
            lang = ENGLISH
            num = 1.0 / constraints_count[word]
            wordset.append((lang, word, num))

        o.AddSynset(rcIndex, name, [], wordset)

    print("Added %i ML constraint nodes" % (ml_count))

    # Add CL constraints
    cl_count = 0
    nl_count = 0
    nl_in_count = 0
    for cons in cl_updated:
        rcIndex += 1
        cl_count += 1

        name = cons[0] + "%i" % (cl_count)
        children_count = len(cons[1])
        start = allocated_index + 1
        o.AddSynset(rcIndex, name, xrange(start, start + children_count), [])
        clcIndex = allocated_index
        allocated_index += children_count

        for clique in cons[1]:

            if re.search('^NL_IN_$', clique[0]):  # four levels tree
                nl_in_count += 1
                name = clique[0] + "%i" % (nl_in_count)
                nl_in_child_count = len(clique[1])
                clcIndex += 1
                start = allocated_index + 1
                o.AddSynset(clcIndex, name,
                            xrange(start, start + nl_in_child_count), [])
                nl_in_cIndex = allocated_index
                allocated_index += nl_in_child_count

                for in_clique in clique[1]:

                    if re.search('^ML_$', in_clique[0]):
                        ml_count += 1
                        name = in_clique[0] + "%i_%s" % \
                               (ml_count, ":".join(in_clique[1])[:20])
                    else:  # re.search('^NL_$', clique[0]):
                        nl_count += 1
                        name = in_clique[0] + "%i_%s" % \
                               (nl_count, ":".join(in_clique[1])[:20])

                    wordset = []
                    for word in in_clique[1]:
                        lang = ENGLISH
                        num = 1.0 / constraints_count[word]
                        wordset.append((lang, word, num))

                    nl_in_cIndex += 1
                    o.AddSynset(nl_in_cIndex, name, [], wordset)

            else:  # three levels tree
                if re.search('^ML_$', clique[0]):
                    ml_count += 1
                    name = clique[0] + "%i_%s" % \
                           (ml_count, ":".join(clique[1])[:20])
                else:  # re.search('^NL_$', clique[0]):
                    nl_count += 1
                    name = clique[0] + "%i_%s" % \
                           (nl_count, ":".join(clique[1])[:20])

                wordset = []
                for word in clique[1]:
                    lang = ENGLISH
                    num = 1.0 / constraints_count[word]
                    wordset.append((lang, word, num))

                clcIndex += 1
                o.AddSynset(clcIndex, name, [], wordset)

    # Unused words
    if len(remained_words) > 0:
        wordset = []
        for word in remained_words:
            lang = ENGLISH
            num = 1
            wordset.append((lang, word, num))

        name = "NL_REMAINED_"
        rcIndex += 1
        o.AddSynset(rcIndex, name, [], wordset)

    print("Added %i total nodes for vocab" % rcIndex)

    assert rcIndex == num_children, "Mismatch of children %i %i" \
                   % (rcIndex, num_children)

    # Add root
    o.Finalize()
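
The interplay of rcIndex, clcIndex, and allocated_index is the subtle part
of the function above. A comment-only trace, using the AddSynset(index,
name, children, words) calls as they appear in this example: one ML
constraint, one CL constraint holding two NL_ cliques, and a non-empty
remainder.

# num_children = 1 + 1 + 1 = 3; allocated_index starts at 3.
#
#   AddSynset(0, "ROOT", xrange(1, 4), [])  root -> children 1, 2, 3
#   AddSynset(1, "ML_1_...", [], ws)        ML leaf takes rcIndex 1
#   AddSynset(2, "CL_1", xrange(4, 6), [])  CL node; cliques go past 3
#   AddSynset(4, "NL_1_...", [], ws)        first clique, clcIndex = 4
#   AddSynset(5, "NL_2_...", [], ws)        second clique, clcIndex = 5
#   AddSynset(3, "NL_REMAINED_", [], ws)    remainder leaf, rcIndex = 3
#
# rcIndex ends at 3 == num_children, which is what the assert checks.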
Example #4
def write_constraints():
    flags.InitFlags()

    # read in vocab
    vocab = getVocab(flags.vocab)
    print "Read vocabulary"

    # read in constraints
    ml_cons, cl_cons = readConstraints(flags.constraints)

    # Merge constraints
    if flags.merge_constraints:
        ml_updated, cl_updated = mergeAllConstraints(ml_cons, cl_cons)
    else:
        # Without merging, wrap each raw constraint as [label, word_set]
        ml_updated = [["ML_", set(x)] for x in ml_cons]
        cl_updated = [["CL_", set(x)] for x in cl_cons]
    print ml_updated
    print cl_updated

    # Constraints counts
    constraints_count = getConstraintCount(ml_updated, cl_updated)

    # get constraint vocab
    constraint_words = constraints_count.keys()

    # Check constraints
    check = [x for x in constraint_words if x not in vocab]
    assert not check, "Constraints were not in vocab: %s" % ", ".join(check)

    # Remaining word list
    remained_words = [x for x in vocab if x not in constraint_words]

    #########################################################
    print flags.wnname
    wnname = flags.wnname
    o = OntologyWriter(wnname)

    if len(remained_words) > 0:
        # +1 is for one synset for the unconstrained words
        num_children = len(ml_updated) + len(cl_updated) + 1
    else:
        num_children = len(ml_updated) + len(cl_updated)

    o.AddSynset(0, "ROOT", xrange(1, num_children + 1), [])
    allocated_index = num_children

    rootchild_index = 0
    leaf_count = 0

    # Add ML constraints
    for cons in ml_updated:
        rootchild_index, leaf_count, allocated_index = write_internal_nodes(
            cons, rootchild_index, leaf_count, allocated_index, o,
            constraints_count)

    # Add CL constraints
    for cons in cl_updated:
        rootchild_index, leaf_count, allocated_index = write_internal_nodes(
            cons, rootchild_index, leaf_count, allocated_index, o,
            constraints_count)

    # Add Unused words
    if len(remained_words) > 0:
        remained = ["NL_REMAINED_", remained_words]
        rootchild_index, leaf_count, allocated_index = write_leaf(
            remained, rootchild_index, leaf_count, allocated_index, o,
            constraints_count)

    print("Added %i total nodes for vocab" % rootchild_index)

    assert rootchild_index == num_children, "Mismatch of children %i %i" \
                   % (rootchild_index, num_children)

    print "Number of leaves:", leaf_count

    # Add root
    o.Finalize()
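
Example #4 calls write_internal_nodes and write_leaf, which are not shown
in this excerpt. Below is a minimal sketch of what they might look like,
reconstructed from the inline logic of Example #3; the label dispatch, the
shared leaf counter, and the node naming are assumptions, so the originals
may differ.

def write_leaf(cons, index, leaf_count, allocated_index, o,
               constraints_count):
    # Sketch: emit one leaf synset; words are weighted 1 / constraint
    # count (remainder words never appear in constraints_count, so they
    # default to weight 1.0).
    leaf_count += 1
    name = cons[0] + "%i_%s" % (leaf_count, ":".join(cons[1])[:20])
    wordset = []
    for word in cons[1]:
        num = 1.0 / constraints_count.get(word, 1)
        wordset.append((ENGLISH, word, num))
    index += 1
    o.AddSynset(index, name, [], wordset)
    return [index, leaf_count, allocated_index]


def write_internal_nodes(cons, index, leaf_count, allocated_index, o,
                         constraints_count):
    # Sketch: CL_ and NL_IN_ labels mark internal nodes whose children
    # are allocated past the root's direct children (cf. Example #3);
    # anything else (ML_, NL_) is a plain leaf.
    if not (cons[0].startswith("CL_") or cons[0] == "NL_IN_"):
        return write_leaf(cons, index, leaf_count, allocated_index, o,
                          constraints_count)

    index += 1
    children_count = len(cons[1])
    start = allocated_index + 1
    o.AddSynset(index, cons[0] + "%i" % index,
                xrange(start, start + children_count), [])
    child_index = allocated_index
    allocated_index += children_count
    for clique in cons[1]:
        [child_index, leaf_count, allocated_index] = write_internal_nodes(
            clique, child_index, leaf_count, allocated_index, o,
            constraints_count)
    return [index, leaf_count, allocated_index]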