def load_clusters():
    """Load info from topics.csv into the Cluster and TermCluster tables.

    Each row looks like: R_row_id,Topic XXX,R_col_id,word — the last three
    characters of column 1 are the numeric cluster id.

    NOTE(review): this function is redefined later in this file with a
    different input path ('seed_data/topics.csv'); this earlier definition
    is shadowed at import time. Consider deleting one of the two.
    """
    # Delete whatever's in the db already so reseeding starts from scratch.
    Cluster.query.delete()
    TermCluster.query.delete()

    count_clusters = 0
    # `with` guarantees the file handle is closed even if a row fails to
    # parse (the original never closed it).
    with open("topics.csv") as topics_file:
        for line in topics_file:
            row = line.rstrip().split(",")

            # Parse into the appropriate data types for seeding.
            cluster = int(row[1][-3:])
            word = row[3].strip()

            # If the word is one of our key terms, record the word->cluster
            # mapping for lookup later (see seed.py for TODO).
            if Term.check_for_term(word):
                db.session.add(TermCluster(word=word, cluster_id=cluster))
                db.session.commit()

            # If this cluster id hasn't been seen yet, add it.
            if not Cluster.check_for_cluster(cluster):
                db.session.add(Cluster(cluster_id=cluster))
                db.session.commit()

            # Progress report (printed before incrementing, as in original).
            print("Topics.txt seeding row %d" % count_clusters)
            count_clusters += 1
def load_studies_terms():
    """Load info from studies_terms.txt into the StudyTerm and Term tables.

    File format (tab-separated): R_id, pmid, word, frequency.
    Source: Neurosynth features.txt, transformed in R to long format.
    """
    print("Studies_terms.txt seeding")

    # Delete all rows in the existing tables, so rerunning this script
    # doesn't insert duplicates.
    StudyTerm.query.delete()
    Term.query.delete()

    count_studies_terms = 0
    # `with` guarantees the file handle is closed (original leaked it).
    with open("seed_data/studies_terms.txt") as studies_terms:
        # Skip the header line; `None` default avoids StopIteration on an
        # empty file.
        next(studies_terms, None)
        for line in studies_terms:
            row = line.rstrip().split('\t')

            # Terms starting with "X" are numbers (e.g. "X01"), not words;
            # these don't make sense to track, so skip them.
            if row[2].startswith('\"X'):
                continue

            # Skip rows where the term never appears in the article
            # (frequency of 0).
            if float(row[3]) == 0.0:
                continue

            pmid = int(row[1])
            word = row[2].strip('\"').replace(".", " ")
            freq = float(row[3])

            # Add the word to Term only the first time it is seen.
            if not Term.check_for_term(word):
                db.session.add(Term(word=word))

            # Always add the (word, study) frequency row.
            db.session.add(StudyTerm(word=word, pmid=pmid, frequency=freq))
            db.session.commit()

            # Progress report (printed before incrementing, as in original).
            print("studies_terms.txt seeding row  %d" % count_studies_terms)
            count_studies_terms += 1
def load_clusters():
    """Load info from topics.csv into the Cluster and TermCluster tables.

    File format: R row id,Topic XXX,R column ID,word
    where XXX is a number between 0-400; the R ids are discarded.

    Source: topic clustering data from Neurosynth, converted to long
    format in R prior to seeding.

    Notes: the words tracked in this clustering are not in perfect
    alignment with those tracked in studies_terms.txt. Approximately 2000
    of the terms in studies_terms have a topical cluster, the remaining
    ~1000 do not. This number could be improved by stemming. Many of the
    words not tracked in clusters are multi-word phrases.
    """
    # Delete whatever's in the db already so reseeding starts from scratch.
    Cluster.query.delete()
    TermCluster.query.delete()

    count_clusters = 0
    # `with` replaces the manual open/close pair: the original's close()
    # would be skipped if any row raised mid-loop.
    with open('seed_data/topics.csv') as topics_fileobj:
        for line in topics_fileobj:
            row = line.rstrip().split(',')

            # Parse into the appropriate data types for seeding; the last
            # three characters of "Topic XXX" are the cluster id.
            cluster = int(row[1][-3:])
            word = row[3].strip()

            # If the word is one of our key terms, record the word->cluster
            # mapping for lookup later (see model.py for TODO).
            if Term.check_for_term(word):
                db.session.add(TermCluster(word=word, cluster_id=cluster))
                db.session.commit()

            # If this cluster id hasn't been seen yet, add it.
            if not Cluster.check_for_cluster(cluster):
                db.session.add(Cluster(cluster_id=cluster))
                db.session.commit()

            # Progress report (printed before incrementing, as in original).
            print("Topics.txt seeding row %d" % count_clusters)
            count_clusters += 1