def make_basemap(basemap):
    try:
        set_status('getting document list', model=basemap)
        with ManagedSession() as session:
            filtered_query = create_query_for_model(session, basemap, dirty=False)
            extracted_terms = extract_terms(filtered_query, basemap.term_type)
            if not extracted_terms:
                raise Exception('No documents found matching query!')
            map_dict, graph_terms, phrase_frequencies, unnormed_dict, phrase_scores = map_representation(
                extracted_terms,
                ranking_algorithm=basemap.ranking_algorithm,
                similarity_algorithm=basemap.similarity_algorithm,
                filtering_algorithm=basemap.filtering_algorithm,
                number_of_terms=basemap.number_of_terms,
                model=basemap)
            # map_string will be a graphviz-processable string
            # map_string = write_dot.output_pairs_dict(map_dict, True, phrase_frequencies=phrase_frequencies, true_scaling=True).decode('ascii', 'ignore')
            map_string = write_dot.output_pairs_dict(
                map_dict, True, phrase_frequencies=phrase_frequencies, true_scaling=True,
                similarities=unnormed_dict, phrase_scores=phrase_scores).decode('ascii', 'ignore')
            # save to database
            basemap.dot_rep = map_string
            # basemap.phrase_frequencies = json.dumps(jsonize_phrase_dict(phrase_frequencies), indent=4).decode('ascii', 'ignore')
            # get phrases as a list of lists of strings (one list of words per term)
            basemap.phrases_in_map = json.dumps(jsonize_phrase_set(graph_terms, None)).decode('ascii', 'ignore')
            basemap.save()
            svg_str, width, height = strip_dimensions(call_graphviz(map_string, file_format='svg', model=basemap))
            basemap.svg_rep = svg_str
            basemap.width = width
            basemap.height = height
            basemap.finished = True
            basemap.save()
            set_status('basemap complete', model=basemap)
            print 'basemap complete'
            return map_dict, graph_terms
    except ZeroDivisionError as e:
        set_status('Error: too few documents to produce a map. Try a broader search', model=basemap)
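# Usage sketch (illustrative only): make_basemap returns the pairwise map dict
# plus the set of terms that made it into the graph, and those graph terms are
# what the make_heatmap functions further below expect. The wrapper name
# example_map_pipeline and its arguments are hypothetical, standing in for
# saved basemap/heatmap model instances.
def example_map_pipeline(basemap, heatmap):
    map_dict, graph_terms = make_basemap(basemap)
    heatmap_vals = make_heatmap(heatmap, graph_terms)
    return map_dict, heatmap_vals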
def name_like_top(cls, name_like, n=10):
    with ManagedSession() as session:
        try:
            return session.query(Author, func.count(author_grant_table.c.grant_id).label('doc_count'))\
                .filter(Author.name.like(name_like))\
                .join(author_grant_table)\
                .group_by(Author)\
                .order_by('doc_count DESC')\
                .slice(0, n).all()
        except:
            session.rollback()
            raise
def load_memo_from_database(Class): """Class must have a name attribute""" with ManagedSession() as session: memo = {} print 'for class', Class query = session.query(Class) print 'loading %d records already in database' % query.count() for row in query: if row.name in memo: print "warning: found duplicate %s" % row.name memo[row.name] = row return memo
def load_memo_from_database(Class):
    memo = {}
    with ManagedSession() as session:
        print 'for class', Class
        print 'loading %d records already in database' % session.query(Class).count()
        for row in sliced_query(session.query(Class)):
            hashed = row.uuid()
            if hashed in memo:
                print "warning: found duplicate %s" % hashed
            memo[hashed] = row
    return memo
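# Illustrative sketch of how a memo built above is meant to be used to
# de-duplicate rows before inserting new ones. The helper name get_or_memoize
# is hypothetical; the CSV loader below relies on a memoized_row helper with
# the same intent.
def get_or_memoize(memo, row):
    # reuse the already-loaded row when its uuid() key is known, otherwise
    # record the new row so later lookups hit the memo instead of the database
    key = row.uuid()
    if key in memo:
        return memo[key]
    memo[key] = row
    return row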
def name_like_top(cls, name_like, n=10):
    with ManagedSession() as session:
        try:
            stmt = session.query(Grant.institution_id, func.count('*').label('doc_count'))\
                .group_by(Grant.institution_id)\
                .subquery()
            return session.query(Institution, stmt.c.doc_count)\
                .filter(Institution.name.like(name_like))\
                .outerjoin(stmt, Institution.id == stmt.c.institution_id)\
                .order_by('doc_count DESC')\
                .slice(0, n).all()
        except:
            session.rollback()
            raise
def name_like_top(cls, name_like, n=10):
    with ManagedSession() as session:
        try:
            stmt = session.query(Document.conference_id, func.count('*').label('doc_count'))\
                .group_by(Document.conference_id)\
                .subquery()
            return session.query(Conference, stmt.c.doc_count)\
                .filter(Conference.name.like(name_like))\
                .outerjoin(stmt, Conference.id == stmt.c.conference_id)\
                .order_by('doc_count DESC')\
                .slice(0, n).all()
        except:
            session.rollback()
            raise
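# Usage sketch for the name_like_top variants above (illustrative only; it
# assumes they are bound as classmethods on Author, Institution, and
# Conference). The pattern string follows SQL LIKE semantics, and each call
# returns up to n (model, doc_count) pairs ordered by descending document
# count, e.g.:
#
#     top_authors = Author.name_like_top('%smith%', n=5)
#     top_conferences = Conference.name_like_top('%linguistics%')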
def make_heatmap(heatmap, graph_terms):
    try:
        set_status('getting document list', model=heatmap)
        with ManagedSession() as session:
            filtered_query = create_query_for_model(session, heatmap, dirty=False)
            extracted_terms = extract_terms(filtered_query, heatmap.term_type)
            heatmap_terms = flatten(extracted_terms)
            heatmap_vals = calculate_heatmap_values(heatmap_terms, graph_terms)
            heatmap.terms = json.dumps(jsonize_phrase_dict(heatmap_vals, 'intensity'))
            set_status('heatmap complete', model=heatmap)
            heatmap.finished = True
            heatmap.save()
            return heatmap_vals
    except Exception as e:
        set_status('Error: %s' % e, model=heatmap)
        raise e
def filter_document_query(cls, query, names):
    joined = cls.join_on_documents(query)

    def _query_for_single_name(session, full_set, name):
        if session.query(cls).filter(cls.name == name).count() > 0:
            print 'found exact match for %s' % (name)
            return full_set.filter(cls.name == name)
        else:
            print 'generalizing to %s' % (generalize(name))
            return full_set.filter(cls.name.like(generalize(name)))

    names_split_stripped = filter(lambda s: s, [name.strip() for name in names.split(';')])
    print names_split_stripped
    with ManagedSession() as session:
        return reduce(lambda x, y: x.union(y),
                      [_query_for_single_name(session, joined, name)
                       for name in names_split_stripped if name])
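# Usage sketch (illustrative only): filter_document_query splits the names
# string on ';', strips whitespace, and unions the per-name result sets, so a
# document query can be narrowed to several people at once. The call below
# assumes the method is bound as a classmethod on Author and that base_query
# is an existing document query:
#
#     matching_docs = Author.filter_document_query(base_query, 'Smith, John; Doe, Jane')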
def load_from_file(filename, offset=None):
    grants_memo = load_memo_from_database(sdb_db.Grant)
    author_memo = load_memo_from_database(sdb_db.Author)
    institution_memo = load_memo_from_database(sdb_db.Institution)
    present_count = 0
    added_count = 0
    if offset:
        print 'starting at row %d' % offset
    with open(filename) as f, ManagedSession() as session:
        reader = DictReader(f, delimiter=",")
        for csv_fields in (reader if offset is None else drop(offset, reader)):
            grant = grant_from_csv(csv_fields)
            if grant.uuid() in grants_memo:
                present_count += 1
                continue
            update_terms(grant)
            # print grant
            authors = authors_from_csv(csv_fields)
            if authors:
                mem_auths = [memoized_row(author_memo, author) for author in authors]
                # if len(mem_auths) != len(authors):
                #     print 'fewer memoized authors!'
                #     import pprint
                #     print pprint.pprint(csv_fields)
                # if len(set(mem_auths)) != len(mem_auths):
                #     print 'duplicate memoized authors!'
                #     import pprint
                #     print pprint.pprint(csv_fields)
                for author in set(mem_auths):
                    grant.authors.append(author)
            institution = institution_from_csv(csv_fields)
            if institution:
                grant.institution = memoized_row(institution_memo, institution)
            session.add(grant)
            added_count += 1
            grants_memo[grant.uuid()] = grant
            if (added_count % 1000 == 0):
                session.commit()
                print '%s more records added' % added_count
        session.commit()
    print '-----------------------'
    print '%s records were added' % added_count
    print '%s records already in the db' % present_count
    print '%s total records parsed' % (added_count + present_count)
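# Usage sketch (illustrative only; the filename is hypothetical): the offset
# argument skips that many CSV rows before parsing, which allows resuming a
# partially finished import:
#
#     load_from_file('grants.csv')                   # full import
#     load_from_file('grants.csv', offset=50000)     # skip the first 50000 rows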
def make_heatmap(heatmap, graph_terms):
    try:
        set_status('getting document list', model=heatmap)
        with ManagedSession() as session:
            heatmap_query = create_query(session, author=heatmap.author, institution=heatmap.institution)
            filtered_query = filter_query(heatmap_query,
                                          dirty=False,
                                          starting_year=heatmap.starting_year,
                                          ending_year=heatmap.ending_year,
                                          sample_size=heatmap.sample_size,
                                          model=heatmap)
            extracted_terms = extract_terms(filtered_query, heatmap.term_type)
            heatmap_terms = flatten(extracted_terms)
            heatmap_vals = calculate_heatmap_values(heatmap_terms, graph_terms)
            heatmap.terms = json.dumps(jsonize_phrase_dict(heatmap_vals, 'intensity'))
            set_status('heatmap complete', model=heatmap)
            heatmap.finished = True
            heatmap.save()
            return heatmap_vals
    except Exception as e:
        set_status('Error: %s' % e, model=heatmap)
        raise e
if __name__ == '__main__':
    # mapping of author, journal, and conference names to existing rows in the database
    author_memo = load_memo_from_database(db.Author)
    journal_memo = load_memo_from_database(db.Journal)
    conference_memo = load_memo_from_database(db.Conference)
    count = 0
    data = {}
    # State machine to read doc info. elem is only returned upon reading the end
    # of a tag, so once we've read one of the top-level categories we check what
    # info we've stored about title, year, author, etc. (since those tags have
    # ended inside the outer document tag). We store these inner tags in the
    # data dictionary and clear it upon reaching the end of each document tag so
    # that attributes don't carry over from one document to another.
    with ManagedSession() as session:
        for event, elem in tree.iterparse(sys.stdin):
            # check to see if we've reached the end of a document tag
            if elem.tag in CATEGORIES:
                # store attribute info, do preprocessing if necessary
                title = data.get('title')
                year = data.get('year')
                author_names = data.get('author_names', [])
                journal_name = data.get('journal_name')
                conference_name = data.get('conference_name')
                # clear out attribute info, and write
                data = {}
                doc = db.Document(title=title, year=year)
                # if this item has a title, memoize the terms and check if it's
                # clean (aka usable)