def _read_common_names(path):
    """Return the set of common names of every metabolite listed in *path*.

    The common name is the first tab-separated field on the line that
    immediately follows each line starting with 'HMDB'.
    """
    common_names = set()
    with open(path) as f:
        for line in f:
            if not line.startswith('HMDB'):
                continue
            synonym_line = next(f, None)
            if synonym_line is None:
                # File ends right after an id line: nothing left to read.
                break
            common_names.add(synonym_line.strip().split('\t')[0])
    return common_names


def _iter_metabolite_records(path):
    """Yield (hmdb_id, synonyms, references) for each record in *path*.

    A record is a line starting with 'HMDB', followed by a line of
    tab-separated synonyms (common name first) and a line of tab-separated
    references.  `synonyms` is a list, `references` is a set of strings.
    """
    with open(path) as f:
        for line in f:
            if not line.startswith('HMDB'):
                continue
            hmdb_id = line.strip()
            synonym_line = next(f, None)
            if synonym_line is None:
                # Truncated final record: no synonyms, so nothing to search.
                break
            synonyms = synonym_line.strip().split('\t')
            references = set(next(f, '').strip().split('\t'))
            # An empty reference line splits into [''], which is not a real
            # reference and must not be stored.
            references.discard('')
            yield hmdb_id, synonyms, references


def update_metabolites(db):
    """
    Find metabolites mentioned in new articles, and insert new records into
    the metabolite_abstract table in the database.

    (For each metabolite in the metabolite_info.txt file, search against the
    temporary whoosh index containing only new articles.)

    Each synonym of a metabolite is searched as an exact phrase in the
    'abstract' field.  A synonym that is the common name of a *different*
    metabolite is not searched.  The hits are added to the references
    already listed in the file, and the combined list is passed to
    insert_db_records().

    db -- database handle; passed to getcursor() to obtain the cursor used
          for the inserts.
    """
    logger.debug('Scanning for metabolites')

    # Don't open the index until this enclosing function is called, because
    # we'll be deleting it and re-creating it in a previous state of the
    # update process.
    ix = open_index(TEMP_METABOLITE_INDEX_PATH)
    cursor = getcursor(db)

    # query parser and searcher
    parser = QueryParser('abstract', ix.schema)
    parser.add_plugin(PhrasePlugin)
    searcher = ix.searcher(weighting=BM25F)

    info_path = 'metabolite_info.txt'
    try:
        # Get all common names first so that one metabolite's synonym which
        # is another metabolite's common name can be recognised and skipped.
        common_name_set = _read_common_names(info_path)

        for hmdb_id, synonyms, references in _iter_metabolite_records(info_path):
            common_name = synonyms[0]

            # Build a new list instead of removing from `synonyms` while
            # iterating over it, which silently skips the element after
            # every removal.
            search_terms = [
                s for s in synonyms
                if s == common_name or s not in common_name_set
            ]

            for name in search_terms:
                # Quote the name so it is searched as a complete phrase.
                query = '"' + name + '"'
                for item in get_abstracts(parser, searcher, query):
                    references.add(str(item))

            insert_db_records(cursor, hmdb_id, list(references))
    finally:
        # Release the searcher even if a search or insert fails.
        searcher.close()

    logger.info('updated metabolite-abstract links')
import buildindex
from config import ABSTRACT_INDEX_PATH

# set up logging
import logging
logger = logging.getLogger('GADGET.updater.mergeindex')


def merge(ix):
    """Optimize the index `ix` by merging its segments.

    Opens a writer on `ix` and commits it with optimize=True.  Merging makes
    later queries faster, but the commit itself will take a long time.
    """
    logger.debug('merging index segments')
    # No documents are added; the commit exists only to trigger the merge.
    ix.writer().commit(optimize=True)
    logger.info('merged index segments')


if __name__ == '__main__':
    # Run as a script: open the abstract index and merge it.  Each step logs
    # its own failure message and then re-raises the exception.
    try:
        index = buildindex.open_index(ABSTRACT_INDEX_PATH)
    except Exception as err:
        logger.error('could not open index to merge. Error message: %s', err)
        raise

    try:
        merge(index)
    except Exception as err:
        logger.error('could not merge index. Error message: %s', err)
        raise