Example #1
    def get_documents_matrix(self):
        """
        Get a matrix of documents vs. concepts.

        This is temporarily cached (besides what StudyDir does) because it
        will be needed multiple times in analyzing a study.

        FIXME: try to make canonical documents not change the results
        """
        self._step('Building document matrix...')
        if self.num_documents == 0:
            return None
        if self._documents_matrix is not None:
            return self._documents_matrix
        entries = []
        for doc in self.study_documents:
            self._step(doc.name)
            for concept, value in doc.extract_concepts_with_negation()[:1000]:
                if (concept not in PUNCTUATION) and (not en_nl.is_blacklisted(concept)):
                    entries.append((value, doc.name, concept))
        documents_matrix = divisi2.make_sparse(entries).normalize_tfidf(cols_are_terms=True)
        canon_entries = []
        for doc in self.canonical_documents:
            self._step(doc.name)
            for concept, value in doc.extract_concepts_with_negation()[:1000]:
                if (concept not in PUNCTUATION) and (not en_nl.is_blacklisted(concept)):
                    canon_entries.append((value, doc.name, concept))
        if canon_entries:
            canonical_matrix = divisi2.make_sparse(canon_entries).normalize_rows()
            self._documents_matrix = documents_matrix + canonical_matrix
        else:
            self._documents_matrix = documents_matrix
        return self._documents_matrix
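
A minimal, self-contained sketch of how a matrix like this is typically used downstream, with toy (value, document_name, concept) entries in the same shape as the `entries` list above; divisi2.reconstruct is assumed to be the labeled-reconstruction helper that pairs with .svd(), and the choice of k is arbitrary:

from csc import divisi2

# Toy (value, document_name, concept) entries, in the same shape as the
# entries built above.
entries = [(1.0, 'doc1', 'dog'), (1.0, 'doc1', 'bark'),
           (1.0, 'doc2', 'cat'), (1.0, 'doc2', 'purr'),
           (1.0, 'doc3', 'dog'), (1.0, 'doc3', 'cat')]

documents_matrix = divisi2.make_sparse(entries).normalize_tfidf(
    cols_are_terms=True)

# Factor the documents-vs-concepts matrix and rebuild a low-rank
# approximation whose entries can be looked up by their labels.
U, S, V = documents_matrix.svd(k=2)
predictions = divisi2.reconstruct(U, S, V)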
Example #2
    def get_documents_matrix(self):
        """
        Get a matrix of documents vs. concepts.

        This is temporarily cached (besides what StudyDir does) because it
        will be needed multiple times in analyzing a study.

        FIXME: try to make canonical documents not change the results
        """
        self._step('Building document matrix...')
        if self.num_documents == 0:
            return None
        if self._documents_matrix is not None:
            return self._documents_matrix
        entries = []
        for doc in self.study_documents:
            self._step(doc.name)
            for concept, value in doc.extract_concepts_with_negation()[:1000]:
                if (concept not in PUNCTUATION) and (
                        not en_nl.is_blacklisted(concept)):
                    entries.append((value, doc.name, concept))
        documents_matrix = divisi2.make_sparse(entries).normalize_tfidf(
            cols_are_terms=True)
        canon_entries = []
        for doc in self.canonical_documents:
            self._step(doc.name)
            for concept, value in doc.extract_concepts_with_negation()[:1000]:
                if (concept not in PUNCTUATION) and (
                        not en_nl.is_blacklisted(concept)):
                    canon_entries.append((value, doc.name, concept))
        if canon_entries:
            canonical_matrix = divisi2.make_sparse(
                canon_entries).normalize_rows()
            self._documents_matrix = documents_matrix + canonical_matrix
        else:
            self._documents_matrix = documents_matrix
        return self._documents_matrix
Example #3
def build_matrix(query,
                 cutoff=DEFAULT_CUTOFF,
                 identity_weight=DEFAULT_IDENTITY_WEIGHT,
                 data_source=conceptnet_quads,
                 transform=to_value_concept_feature):
    """
    Builds a Divisi2 SparseMatrix from relational data.

    One required argument is the `query`, which can be a QuerySet or just a
    language identifier.

    Optional arguments:

    - `cutoff`: specifies how common a concept has to be to appear in the
      matrix. Defaults to DEFAULT_CUTOFF=5.
    - `identity_weight`: the weight given to each concept's own
      (concept, 'InheritsFrom', concept) link; identities are skipped when
      it is 0 or less. Defaults to DEFAULT_IDENTITY_WEIGHT.
    - `data_source`: a function that produces (concept1, rel, concept2, value)
      quads given the `query` and `cutoff`. Defaults to
      :meth:`conceptnet_quads`.
    - `transform`: the function for transforming quads into
      (value, row_name, column_name) triples. Defaults to
      :meth:`to_value_concept_feature`, which yields
      (value, concept, feature) triples.
    """
    logger.info("Performing ConceptNet query")
    quads = list(data_source(query, cutoff))
    # todo: separate this out into a customizable function

    if identity_weight > 0:
        logger.info("Adding identities")
        morequads = []
        concept_set = set(q[0] for q in quads)
        for concept in concept_set:
            morequads.append(
                (concept, 'InheritsFrom', concept, identity_weight))
        for c1, rel, c2, val in quads:
            if rel == 'IsA':
                morequads.append((c1, 'InheritsFrom', c1, val))
        quads.extend(morequads)

    logger.info("Creating triples")
    triples = transform(quads)
    logger.info("Building matrix")
    matrix = divisi2.make_sparse(triples)
    logger.info("Squishing underused rows")
    return matrix.squish(cutoff)
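
A hedged usage sketch for build_matrix: 'en' stands in for the language-identifier form of `query` described in the docstring, and divisi2.reconstruct is assumed to be the usual companion to .svd() for turning the factorization back into predicted assertion scores:

from csc import divisi2

# Build the concept/feature matrix for English assertions; the defaults pull
# quads from conceptnet_quads and turn them into triples with
# to_value_concept_feature.
matrix = build_matrix('en', cutoff=5)

# Factor it and reconstruct an approximation whose entries act as predicted
# assertion strengths.
U, S, V = matrix.svd(k=100)
predictions = divisi2.reconstruct(U, S, V)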
Example #4
    def get_documents_matrix(self):
        """
        Get a matrix of documents vs. concepts.

        This is temporarily cached (besides what StudyDir does) because it
        will be needed multiple times in analyzing a study.
        """
        self._step('Building document matrix...')
        if self.num_documents == 0:
            return None
        if self._documents_matrix is not None:
            return self._documents_matrix
        entries = []
        for doc in self.documents:
            for concept, value in doc.extract_concepts_with_negation():
                if (concept not in PUNCTUATION) and (not en_nl.is_blacklisted(concept)):
                    entries.append((value, doc.name, concept))
        self._documents_matrix = divisi2.make_sparse(entries)
        return self._documents_matrix
Example #5
def make_divisi_matrix(filename):
    parsedlist = inform_parser(filename)
    game = filename.split('.')[0]
    thinglist = [(1 if x[3] else -1,
                  english.normalize(x[0].replace('^', "'")),
                  ('right', x[1], english.normalize(x[2].replace('^', "'"))))
                 for x in parsedlist]
    # Write out the confusingly-named overlist. First, the nouns.
    overlist = open(game + '.over', 'w')
    for concept1, rel, concept2, val in parsedlist:
        if rel == 'HasProperty' and concept2 == 'mark_as_thing':
            print >> overlist, concept1
            print concept1

    # Now the verbs.
    verbs = verb_reader(filename)
    for verb in verbs:
        print >> overlist, verb
    overlist.close()

    game_matrix = divisi2.make_sparse(thinglist).normalize_all()
    divisi2.save(game_matrix, game + '.pickle')
    return game_matrix
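
A short follow-up, assuming divisi2.load is the counterpart of the divisi2.save call above and that the hypothetical input file was named 'zork1.inf' (so the saved pickle is 'zork1.pickle'):

from csc import divisi2

# make_divisi_matrix('zork1.inf') would have written 'zork1.pickle'; reload it
# and dump the stored (value, noun, feature) entries.
game_matrix = divisi2.load('zork1.pickle')
for value, noun, feature in game_matrix.named_entries():
    print value, noun, feature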
Example #6
def build_matrix(query, cutoff=DEFAULT_CUTOFF,
                 identity_weight=DEFAULT_IDENTITY_WEIGHT,
                 data_source=conceptnet_quads,
                 transform=to_value_concept_feature):
    """
    Builds a Divisi2 SparseMatrix from relational data.

    One required argument is the `query`, which can be a QuerySet or just a
    language identifier.

    Optional arguments:

    - `cutoff`: specifies how common a concept has to be to appear in the
      matrix. Defaults to DEFAULT_CUTOFF=5.
    - `identity_weight`: the weight given to each concept's own
      (concept, 'InheritsFrom', concept) link; identities are skipped when
      it is 0 or less. Defaults to DEFAULT_IDENTITY_WEIGHT.
    - `data_source`: a function that produces (concept1, rel, concept2, value)
      quads given the `query` and `cutoff`. Defaults to
      :meth:`conceptnet_quads`.
    - `transform`: the function for transforming quads into
      (value, row_name, column_name) triples. Defaults to
      :meth:`to_value_concept_feature`, which yields
      (value, concept, feature) triples.
    """
    logger.info("Performing ConceptNet query")
    quads = list(data_source(query, cutoff))
    # todo: separate this out into a customizable function
    
    if identity_weight > 0:
        logger.info("Adding identities")
        morequads = []
        concept_set = set(q[0] for q in quads)
        for concept in concept_set:
            morequads.append((concept, 'InheritsFrom', concept, identity_weight))
        for c1, rel, c2, val in quads:
            if rel == 'IsA':
                morequads.append((c1, 'InheritsFrom', c1, val))
        quads.extend(morequads)

    logger.info("Creating triples")
    triples = transform(quads)
    logger.info("Building matrix")
    matrix = divisi2.make_sparse(triples)
    logger.info("Squishing underused rows")
    return matrix.squish(cutoff)
Example #7
from csc import divisi2
import cPickle as pickle

mat_4x3 = divisi2.make_sparse([(2, "apple", "red"), (2, "orange", "orange"),
                               (1, "apple", "green"), (1, "celery", "green"),
                               (-1, "apple", "orange"),
                               (-1, "banana", "orange")])


def pickle_bounce(obj):
    s = pickle.dumps(obj)
    objcopy = pickle.loads(s)
    return objcopy


def test_sparse_pickle():
    mat2 = divisi2.SparseMatrix.from_state(mat_4x3.to_state())
    assert mat2 == mat_4x3
    assert pickle_bounce(mat_4x3) == mat_4x3
    assert pickle_bounce(mat_4x3[0]) == mat_4x3[0]
    assert pickle_bounce(mat_4x3[:, 0]) == mat_4x3[:, 0]


def test_dense_pickle():
    dmat = mat_4x3.to_dense()
    assert pickle_bounce(dmat).equals(dmat)
    assert pickle_bounce(dmat[0]).equals(dmat[0])
    assert pickle_bounce(dmat[:, 0]).equals(dmat[:, 0])
Example #8
    def make_svd(self):
        matrix = divisi2.make_sparse(self.data).normalize_all()
        self.U, self.s, self.V = matrix.svd(k=14)
        self.predictions = divisi2.reconstruct_activation(self.V, self.s)
        del self.data
        print "init end"
Example #9
from csc import divisi2
import cPickle as pickle

mat_4x3 = divisi2.make_sparse([
    (2, "apple", "red"),
    (2, "orange", "orange"),
    (1, "apple", "green"),
    (1, "celery", "green"),
    (-1, "apple", "orange"),
    (-1, "banana", "orange")
])

def pickle_bounce(obj):
    s = pickle.dumps(obj)
    objcopy = pickle.loads(s)
    return objcopy

def test_sparse_pickle():
    mat2 = divisi2.SparseMatrix.from_state(mat_4x3.to_state())
    assert mat2 == mat_4x3
    assert pickle_bounce(mat_4x3) == mat_4x3
    assert pickle_bounce(mat_4x3[0]) == mat_4x3[0]
    assert pickle_bounce(mat_4x3[:,0]) == mat_4x3[:,0]

def test_dense_pickle():
    dmat = mat_4x3.to_dense()
    assert pickle_bounce(dmat).equals(dmat)
    assert pickle_bounce(dmat[0]).equals(dmat[0])
    assert pickle_bounce(dmat[:,0]).equals(dmat[:,0])

Example #10
import yaml
import MySQLdb
from csc import divisi2

def db_query(conn, query, args):
	# Run a parameter-interpolated query and return every matching row.
	cursor = conn.cursor()
	cursor.execute(query % args)
	ret = cursor.fetchall()
	cursor.close()
	return ret

f = open('../includes/config.yaml')
config = yaml.load(f)
f.close()

conn = MySQLdb.connect(config['dbhost'],config['dbuser'],config['dbpass'] or '',config['dbname'])

courses = db_query(conn, "SELECT * FROM comments WHERE parent='1'", ())
data = []
for c in courses:
	topics = db_query(conn, "SELECT * FROM comments WHERE parent='%d'", (c[0]))
	for t in topics:
		data.append( (1, c[0], t[1]) )
mat = divisi2.make_sparse(data)
mat = mat.normalize_rows()
mat_t = mat.T
mult = divisi2.matrixmultiply(mat, mat_t)
print mult

similarities = mult.named_entries()
for s in similarities:
	v,c1,c2 = s
	if c1 != c2:
		db_query(conn, "REPLACE INTO similarities (cid1,cid2,val) VALUE (%d,%d,%f)", (c1,c2,v));

conn.close()
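
The same similarity computation on a small in-memory matrix, with toy course/topic labels standing in for the MySQL rows; every divisi2 call here also appears in the script above:

from csc import divisi2

data = [(1, "course1", "topic_a"), (1, "course1", "topic_b"),
        (1, "course2", "topic_b"), (1, "course2", "topic_c"),
        (1, "course3", "topic_c")]

mat = divisi2.make_sparse(data).normalize_rows()
mult = divisi2.matrixmultiply(mat, mat.T)

# Each named entry is (similarity, course1, course2); skip the diagonal,
# just as the script above does.
for v, c1, c2 in mult.named_entries():
    if c1 != c2:
        print v, c1, c2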