def main():
    """Fetch one author's PubMed records and hand them to ``get_genes.main``.

    The search configuration is hard-coded in the body. ``search_type``
    selects the kind of search:

    * ``'1'`` -- university + author. This is the only working mode.
    * ``'2'`` -- university + gene (marked "PROBABLY NOT GOING TO USE" by the
      author). The original sketch queried PubMed for the gene ``'trpml1'``
      with the term ``'{gene}%20AND%20'``, stored the result in ``docs_xml``
      and then fell through to ``get_genes.main(docs_dict)`` with
      ``docs_dict`` never assigned, so it could only end in
      ``UnboundLocalError``. It now fails up front with a clear message.

    NOTE(review): this file defines ``main()`` more than once; a later
    definition replaces this one when the module is loaded.

    Raises:
        NotImplementedError: if ``search_type`` selects the gene search.
        ValueError: if ``search_type`` is neither 1 nor 2.
    """
    search_type = '1'
    articles = 1000
    university = 'University of Pittsburgh'
    author_first = 'Kirill'
    author_last = 'Kiselyov'

    # Convert once instead of re-parsing the string for every comparison.
    mode = int(search_type)
    if mode == 2:
        raise NotImplementedError(
            'search_type 2 (university + gene) was never finished; '
            'use search_type 1 (university + author)')
    if mode != 1:
        raise ValueError('unknown search_type: %r' % (search_type,))

    # URL-encoded search term: "<last>, <first>[Author]"
    # (%2C = ",", %20 = " ", %5B / %5D = "[" / "]").
    author_term = author_last + '%2C%20' + author_first + '%5BAuthor%5D'

    university = get_university(university)

    # Per the original author's note, the result is shaped like
    # {PMID: ['ABSTRACT_TEXT', [('FIRST', 'LAST', 'Affiliation'), ...]]}.
    docs_dict = get_info_from_PubMed(articles, university, author_term)
    get_genes.main(docs_dict)
def main():
    """Run the PubMed entity-frequency pipeline for the configured search.

    The configuration is hard-coded in the body. ``search_type`` selects:

    * ``'1'`` -- university + a single author; the fetched documents are
      printed and passed to ``get_genes.main``.
    * ``'2'`` -- university + gene (marked "PROBABLY NOT GOING TO USE" by the
      author). The original sketch read ``articles``, ``university``,
      ``Author_first`` and ``Author_last`` without assigning them on that
      path, so it could only raise ``UnboundLocalError``. It now fails with
      an explicit ``NotImplementedError``.
    * ``'3'`` -- every author listed in a TSV file for one university. For
      each author one CSV row ``(name, entity_frequency_list)`` is appended
      to the output file and that author's ``tfidf_lists`` is appended as a
      pickle record to ``<output>_tfidf``; finally the name -> frequency-list
      mapping is printed.

    Any other value does nothing, as before.

    NOTE(review): this file defines ``main()`` more than once; a later
    definition replaces this one when the module is loaded.

    Raises:
        NotImplementedError: if ``search_type`` selects the gene search.
    """
    search_type = '3'
    # Convert once instead of re-parsing the string for every comparison.
    mode = int(search_type)

    if mode == 1:
        articles = 10
        author_first = 'Catalina'
        author_last = 'Cleves Bayon'
        # URL-encoded search term: "<last>, <first>[Author]".
        # NOTE(review): the space inside the last name is not percent-encoded;
        # confirm get_info_from_PubMed copes with that.
        author_term = author_last + '%2C%20' + author_first + '%5BAuthor%5D'
        author = author_first + ' ' + author_last
        university = get_university('University of Pittsburgh')
        docs_dict = get_info_from_PubMed(articles, university, author_term)
        # Same output as the Python 2 statement `print 'docs dict', docs_dict`,
        # but this spelling also parses under Python 3.
        print('docs dict %s' % (docs_dict,))
        # A single author is "author 1 of 1".
        get_genes.main(docs_dict, author, 1, 1)

    elif mode == 2:
        raise NotImplementedError(
            'search_type 2 (university + gene) was never finished; '
            'use search_type 1 or 3')

    elif mode == 3:
        articles = 40
        # School of Medicine faculty list. Alternatives used during
        # development live in the same directory: 'Author_list' (example
        # suggested by Madhavi) and 'SOM_Faculty_short.tsv' (cut short for
        # testing).
        file_location = '/home/adam/workspace/TEES/text_files/Author_Lists/SOM_Faculty.tsv'
        university = get_university('University of Pittsburgh')

        author_terms, authors = get_name_list(file_location)
        # Display names ("First Last"), index-aligned with author_terms.
        author_keys = [name[0] + ' ' + name[1] for name in authors]
        print('author keys %s' % (author_keys,))

        # None of these change per author, so compute them once.
        tot_authors = len(author_terms)
        output_file = '/home/adam/workspace/TEES/text_files/Author_Lists/output/{0}'.format(
            os.path.basename(file_location))
        tfidf_output_file = output_file + '_tfidf'

        frequency_values = []
        for num, author_term in enumerate(author_terms):
            author = author_keys[num]
            docs_dict = get_info_from_PubMed(articles, university, author_term)
            entity_frequency_list, tfidf_lists = get_genes.main(
                docs_dict, author, num + 1, tot_authors)
            frequency_values.append(entity_frequency_list)

            # Append mode: results are flushed author by author, so a crash
            # part-way through keeps what was already computed. Re-running
            # adds to the existing files rather than replacing them.
            with open(output_file, 'a') as f:
                csv.writer(f).writerow((author, entity_frequency_list))
            # Pickle streams belong in binary mode; each dump appends one
            # independent pickle record to the file.
            with open(tfidf_output_file, 'ab') as f2:
                pickle.dump(tfidf_lists, f2)

        # Shape example recorded by the original author from an earlier run:
        #   {'Kirill Kiselyov': [('TRP', 0.148148), ('TRPC', 0.138889),
        #                        ('TRPML1', 0.12963), ...],
        #    'Madhavi Ganapathiraju': [('ANKLE1', 0.25), ('ORAOV1', 0.25),
        #                              ('TMEM45B', 0.25),
        #                              ('human protein', 0.25)]}
        author_frequency_dict = dict(zip(author_keys, frequency_values))
        print(author_frequency_dict)
def main():
    """Run the PubMed entity-frequency pipeline for the configured search.

    The configuration is hard-coded in the body. ``search_type`` selects:

    * ``'1'`` -- university + a single author; the fetched documents are
      printed and passed to ``get_genes.main``.
    * ``'2'`` -- university + gene (marked "PROBABLY NOT GOING TO USE" by the
      author). The original sketch read ``articles``, ``university``,
      ``Author_first`` and ``Author_last`` without assigning them on that
      path, so it could only raise ``UnboundLocalError``. It now fails with
      an explicit ``NotImplementedError``.
    * ``'3'`` -- every author listed in a TSV file for one university. For
      each author one CSV row ``(name, entity_frequency_list)`` is appended
      to the output file and that author's ``tfidf_lists`` is appended as a
      pickle record to ``<output>_tfidf``; finally the name -> frequency-list
      mapping is printed.

    Any other value does nothing, as before.

    NOTE(review): earlier definitions of ``main()`` in this file are shadowed
    by this one when the module is loaded.

    Raises:
        NotImplementedError: if ``search_type`` selects the gene search.
    """
    search_type = '3'
    # Convert once instead of re-parsing the string for every comparison.
    mode = int(search_type)

    if mode == 1:
        articles = 10
        author_first = 'Catalina'
        author_last = 'Cleves Bayon'
        # URL-encoded search term: "<last>, <first>[Author]".
        # NOTE(review): the space inside the last name is not percent-encoded;
        # confirm get_info_from_PubMed copes with that.
        author_term = author_last + '%2C%20' + author_first + '%5BAuthor%5D'
        author = author_first + ' ' + author_last
        university = get_university('University of Pittsburgh')
        docs_dict = get_info_from_PubMed(articles, university, author_term)
        # Same output as the Python 2 statement `print 'docs dict', docs_dict`,
        # but this spelling also parses under Python 3.
        print('docs dict %s' % (docs_dict,))
        # A single author is "author 1 of 1".
        get_genes.main(docs_dict, author, 1, 1)

    elif mode == 2:
        raise NotImplementedError(
            'search_type 2 (university + gene) was never finished; '
            'use search_type 1 or 3')

    elif mode == 3:
        articles = 40
        # School of Medicine faculty list. Alternatives used during
        # development live in the same directory: 'Author_list' (example
        # suggested by Madhavi) and 'SOM_Faculty_short.tsv' (cut short for
        # testing).
        file_location = '/home/adam/workspace/TEES/text_files/Author_Lists/SOM_Faculty.tsv'
        university = get_university('University of Pittsburgh')

        author_terms, authors = get_name_list(file_location)
        # Display names ("First Last"), index-aligned with author_terms.
        author_keys = [name[0] + ' ' + name[1] for name in authors]
        print('author keys %s' % (author_keys,))

        # None of these change per author, so compute them once.
        tot_authors = len(author_terms)
        output_file = '/home/adam/workspace/TEES/text_files/Author_Lists/output/{0}'.format(
            os.path.basename(file_location))
        tfidf_output_file = output_file + '_tfidf'

        frequency_values = []
        for num, author_term in enumerate(author_terms):
            author = author_keys[num]
            docs_dict = get_info_from_PubMed(articles, university, author_term)
            entity_frequency_list, tfidf_lists = get_genes.main(
                docs_dict, author, num + 1, tot_authors)
            frequency_values.append(entity_frequency_list)

            # Append mode: results are flushed author by author, so a crash
            # part-way through keeps what was already computed. Re-running
            # adds to the existing files rather than replacing them.
            with open(output_file, 'a') as f:
                csv.writer(f).writerow((author, entity_frequency_list))
            # Pickle streams belong in binary mode; each dump appends one
            # independent pickle record to the file.
            with open(tfidf_output_file, 'ab') as f2:
                pickle.dump(tfidf_lists, f2)

        # Shape example recorded by the original author from an earlier run:
        #   {'Kirill Kiselyov': [('TRP', 0.148148), ('TRPC', 0.138889),
        #                        ('TRPML1', 0.12963), ...],
        #    'Madhavi Ganapathiraju': [('ANKLE1', 0.25), ('ORAOV1', 0.25),
        #                              ('TMEM45B', 0.25),
        #                              ('human protein', 0.25)]}
        author_frequency_dict = dict(zip(author_keys, frequency_values))
        print(author_frequency_dict)