Example #1
def make_basemap(basemap):
    try:
        set_status('getting document list', model=basemap)
        with ManagedSession() as session:
            filtered_query = create_query_for_model(session, basemap, dirty=False)
            extracted_terms = extract_terms(filtered_query, basemap.term_type)
        if not extracted_terms:
            raise Exception('No documents found matching query!')
        map_dict, graph_terms, phrase_frequencies, unnormed_dict, phrase_scores = map_representation(extracted_terms,
                                                                                                     ranking_algorithm=basemap.ranking_algorithm,
                                                                                                     similarity_algorithm=basemap.similarity_algorithm,
                                                                                                     filtering_algorithm=basemap.filtering_algorithm,
                                                                                                     number_of_terms=basemap.number_of_terms,
                                                                                                     model=basemap)
        # map_string will be a graphviz-processable string
        # map_string = write_dot.output_pairs_dict(map_dict, True, phrase_frequencies=phrase_frequencies, true_scaling=True).decode('ascii', 'ignore')
        map_string = write_dot.output_pairs_dict(map_dict, True, phrase_frequencies=phrase_frequencies, true_scaling=True, similarities=unnormed_dict, phrase_scores=phrase_scores).decode('ascii', 'ignore')
        # save to database
        basemap.dot_rep = map_string
        # basemap.phrase_frequencies = json.dumps(jsonize_phrase_dict(phrase_frequencies), indent=4).decode('ascii', 'ignore')
        # get phrases as a list of lists of strings (one list of words per term)
        basemap.phrases_in_map = json.dumps(jsonize_phrase_set(graph_terms, None)).decode('ascii', 'ignore')
        basemap.save()
        svg_str, width, height = strip_dimensions(call_graphviz(map_string, file_format='svg', model=basemap))
        basemap.svg_rep = svg_str
        basemap.width = width
        basemap.height = height
        basemap.finished = True
        basemap.save()
        set_status('basemap complete', model=basemap)
        print 'basemap complete'
        return map_dict, graph_terms
    except ZeroDivisionError as e:
        set_status('Error: too few documents to produce a map. Try a broader search', model=basemap)
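
Every example in this collection wraps its database work in `ManagedSession`. The class itself is not shown here; the following is a minimal sketch of what such a context-manager wrapper typically looks like, assuming a SQLAlchemy `sessionmaker` (the engine URL, the `Session` factory name, and the commit-on-success policy are assumptions, not the project's actual code).

from contextlib import contextmanager
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

# Assumption: a configured engine/sessionmaker; the real project wires this up elsewhere.
engine = create_engine('sqlite:///example.db')
Session = sessionmaker(bind=engine)

@contextmanager
def ManagedSession():
    """Yield a session, commit on success, roll back on failure, always close."""
    session = Session()
    try:
        yield session
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()

Note that Examples #2, #5, and #6 also roll back inside the with-block, so the real wrapper may instead leave error handling to the caller.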
Example #2
def name_like_top(cls, name_like, n=10):
    with ManagedSession() as session:
        try:
            return session.query(Author, func.count(author_grant_table.c.grant_id).label('doc_count'))\
                    .filter(Author.name.like(name_like)).join(author_grant_table).group_by(Author).order_by('doc_count DESC').slice(0, n).all()
        except:
            session.rollback()
            raise
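
A note on the `.order_by('doc_count DESC')` calls in this and the following examples: passing a raw SQL string as a full ORDER BY expression was an older SQLAlchemy convenience that newer releases reject. A sketch of the same query using the labeled count expression's `.desc()` instead (same tables and names as above):

def name_like_top(cls, name_like, n=10):
    # Same query as above, but ordering by the labeled expression rather than
    # the raw string 'doc_count DESC', which current SQLAlchemy versions accept.
    doc_count = func.count(author_grant_table.c.grant_id).label('doc_count')
    with ManagedSession() as session:
        try:
            return (session.query(Author, doc_count)
                    .filter(Author.name.like(name_like))
                    .join(author_grant_table)
                    .group_by(Author)
                    .order_by(doc_count.desc())
                    .slice(0, n)
                    .all())
        except Exception:
            session.rollback()
            raise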
Example #3
def load_memo_from_database(Class):
    """Class must have a name attribute"""
    with ManagedSession() as session:
        memo = {}
        print 'for class', Class
        query = session.query(Class)
        print 'loading %d records already in database' % query.count()
        for row in query:
            if row.name in memo:
                print "warning: found duplicate %s" % row.name
            memo[row.name] = row
        return memo
Example #4
def load_memo_from_database(Class):
    memo = {}
    with ManagedSession() as session:
        print 'for class', Class
        print 'loading %d records already in database' % session.query(
            Class).count()
        for row in sliced_query(session.query(Class)):
            hashed = row.uuid()
            if hashed in memo:
                print "warning: found duplicate %s" % hashed
            memo[hashed] = row
    return memo
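
`sliced_query` is a project helper not shown among these examples. Judging by its use here, it iterates a large query in windows so the whole result set is not materialized at once; a hypothetical sketch (the batch size and the slice-based windowing are assumptions, and a deterministic ordering is assumed for correctness):

def sliced_query(query, batch_size=10000):
    # Hypothetical helper: walk a query in fixed-size slices so that building
    # the memo over a large table does not fetch every row in one go.
    start = 0
    while True:
        batch = query.slice(start, start + batch_size).all()
        if not batch:
            break
        for row in batch:
            yield row
        start += batch_size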
Example #5
def name_like_top(cls, name_like, n=10):
    with ManagedSession() as session:
        try:
            stmt = session.query(Grant.institution_id, func.count('*').label('doc_count'))\
                    .group_by(Grant.institution_id)\
                    .subquery()
            return session.query(Institution, stmt.c.doc_count)\
                    .filter(Institution.name.like(name_like))\
                    .outerjoin(stmt, Institution.id == stmt.c.institution_id)\
                    .order_by('doc_count DESC').slice(0, n).all()
        except:
            session.rollback()
            raise
Example #6
def name_like_top(cls, name_like, n=10):
    with ManagedSession() as session:
        try:
            stmt = session.query(Document.conference_id, func.count('*').label('doc_count'))\
                    .group_by(Document.conference_id)\
                    .subquery()
            return session.query(Conference, stmt.c.doc_count)\
                    .filter(Conference.name.like(name_like))\
                    .outerjoin(stmt, Conference.id == stmt.c.conference_id)\
                    .order_by('doc_count DESC').slice(0, n).all()
        except:
            session.rollback()
            raise
Example #7
def make_heatmap(heatmap, graph_terms):
    try:
        set_status('getting document list', model=heatmap)
        with ManagedSession() as session:
            filtered_query = create_query_for_model(session, heatmap, dirty=False)
            extracted_terms = extract_terms(filtered_query, heatmap.term_type)
        heatmap_terms = flatten(extracted_terms)
        heatmap_vals = calculate_heatmap_values(heatmap_terms, graph_terms)
        heatmap.terms = json.dumps(jsonize_phrase_dict(heatmap_vals, 'intensity'))
        set_status('heatmap complete', model=heatmap)
        heatmap.finished = True
        heatmap.save()
        return heatmap_vals
    except Exception as e:
        set_status('Error: %s' % e, model=heatmap)
        raise e
Example #8
    def filter_document_query(cls, query, names):
        joined = cls.join_on_documents(query)
        def _query_for_single_name(session, full_set, name):
            if session.query(cls).filter(cls.name == name).count() > 0:
                print 'found exact match for %s' % (name)
                return full_set.filter(cls.name == name)
            else:
                print 'generalizing to %s' % (generalize(name))
                return full_set.filter(cls.name.like(generalize(name)))

        names_split_stripped = filter(lambda s: s, [name.strip() for name in names.split(';')])
        print names_split_stripped
        with ManagedSession() as session:
            return reduce(lambda x, y: x.union(y),
                          [_query_for_single_name(session, joined, name)
                           for name in names_split_stripped
                           if name])
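
These snippets are Python 2 (`print` statements, list-returning `filter`, built-in `reduce`). Under Python 3 the name-splitting and union steps of this example would read roughly as follows; `functools.reduce` is the standard relocation of the builtin, and a comprehension replaces `filter`:

from functools import reduce  # builtin in Python 2, moved to functools in Python 3

def split_names(names):
    # Python 3 equivalent of filter(lambda s: s, [...]): keep only the
    # non-empty, stripped fragments of a semicolon-separated name string.
    return [s for s in (part.strip() for part in names.split(';')) if s]

def union_queries(queries):
    # reduce() combines the per-name queries with union(), as in the example above.
    return reduce(lambda x, y: x.union(y), queries)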
Example #9
def load_from_file(filename, offset=None):
    grants_memo = load_memo_from_database(sdb_db.Grant)
    author_memo = load_memo_from_database(sdb_db.Author)
    institution_memo = load_memo_from_database(sdb_db.Institution)
    present_count = 0
    added_count = 0
    if offset:
        print 'starting at row %d' % offset
    with open(filename) as f, ManagedSession() as session:
        reader = DictReader(f, delimiter=",")
        for csv_fields in (reader if offset is None else drop(offset, reader)):
            grant = grant_from_csv(csv_fields)
            if grant.uuid() in grants_memo:
                present_count += 1
                continue
            update_terms(grant)
            # print grant
            authors = authors_from_csv(csv_fields)
            if authors:
                mem_auths = [
                    memoized_row(author_memo, author) for author in authors
                ]
                # if len(mem_auths) != len(authors):
                #     print 'fewer memoized authors!'
                #     import pprint
                #     print pprint.pprint(csv_fields)
                # if len(set(mem_auths)) != len(mem_auths):
                #     print 'duplicate memoized authors!'
                #     import pprint
                #     print pprint.pprint(csv_fields)
                for author in set(mem_auths):
                    grant.authors.append(author)
            institution = institution_from_csv(csv_fields)
            if institution:
                grant.institution = memoized_row(institution_memo, institution)
            session.add(grant)
            added_count += 1
            grants_memo[grant.uuid()] = grant
            if (added_count % 1000 == 0):
                session.commit()
                print '%s more records added' % added_count
        session.commit()
        print '-----------------------'
        print '%s records were added' % added_count
        print '%s records already in the db' % present_count
        print '%s total records parsed' % (added_count + present_count)
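
`memoized_row` is another helper that does not appear in these examples. From the call sites above it evidently checks the memo built by `load_memo_from_database` and reuses an existing row when one matches, registering the new row otherwise; a hypothetical sketch keyed the same way as the loader in Example #4:

def memoized_row(memo, row):
    # Hypothetical helper: return the previously seen row with the same key if
    # there is one, otherwise remember this row and return it unchanged.
    key = row.uuid()
    if key in memo:
        return memo[key]
    memo[key] = row
    return row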
Example #10
def make_heatmap(heatmap, graph_terms):
    try:
        set_status('getting document list', model=heatmap)
        with ManagedSession() as session:
            heatmap_query = create_query(session, author=heatmap.author, institution=heatmap.institution)
            filtered_query = filter_query(heatmap_query, dirty=False,
                                          starting_year=heatmap.starting_year,
                                          ending_year=heatmap.ending_year,
                                          sample_size=heatmap.sample_size,
                                          model=heatmap)
            extracted_terms = extract_terms(filtered_query, heatmap.term_type)
        heatmap_terms = flatten(extracted_terms)
        heatmap_vals = calculate_heatmap_values(heatmap_terms, graph_terms)
        heatmap.terms = json.dumps(jsonize_phrase_dict(heatmap_vals, 'intensity'))
        set_status('heatmap complete', model=heatmap)
        heatmap.finished = True
        heatmap.save()
        return heatmap_vals
    except Exception as e:
        set_status('Error: %s' % e, model=heatmap)
        raise e
Example #11
if __name__ == '__main__':
    # mapping of author, journal, and conference names to existing rows in database
    author_memo = load_memo_from_database(db.Author)
    journal_memo = load_memo_from_database(db.Journal)
    conference_memo = load_memo_from_database(db.Conference)

    count = 0

    data = {}
    # State machine to read document info. iterparse only yields elem once a
    # tag has ended, so when we reach the end of one of the top-level
    # CATEGORIES tags we check what we have already stored for title, year,
    # authors, etc. (those inner tags ended earlier, inside the outer document
    # tag). The inner tags are accumulated in the data dictionary, which must
    # be cleared at the end of each document tag so that attributes don't
    # carry over from one document to the next.
    with ManagedSession() as session:
        for event, elem in tree.iterparse(sys.stdin):
            # check to see if we've reached the end of a document tag
            if elem.tag in CATEGORIES:
                # store attribute info, do preprocessing if necessary
                title = data.get('title')
                year = data.get('year')
                author_names = data.get('author_names', [])
                journal_name = data.get('journal_name')
                conference_name = data.get('conference_name')

                # clear out attribute info, and write
                data = {}
                doc = db.Document(title=title, year=year)
                # if this item has a title, memoize the terms and check if it's
                # clean (aka usable)