def _iter_metabolite_records(path):
    """Yield (hmdb_id, synonym_line, reference_line) for each record in *path*.

    A record starts at a line beginning with 'HMDB'; the two lines that
    follow it are taken as its synonym line and its reference line.  All
    three values are returned with surrounding whitespace stripped.
    """
    with open(path) as f:
        for line in f:
            if not line.startswith('HMDB'):
                continue
            # next() with a default (rather than f.next()) works on both
            # Python 2 and 3 and keeps a truncated final record from raising
            # StopIteration inside this generator.
            synonym_line = next(f, '').strip()
            reference_line = next(f, '').strip()
            yield line.strip(), synonym_line, reference_line


def update_metabolites(db):
    """
    Find metabolites mentioned in new articles, and insert new records into the
    metabolite_abstract table in the database.

    (For each metabolite in the metabolite_info.txt file, search against the
    temporary whoosh index containing only new articles.)

    The metabolite_info.txt file is read as three-line records: an ID line
    starting with 'HMDB', a tab-separated synonym line whose first entry is
    the common name, and a tab-separated reference line.  For every record,
    each retained name is searched as a quoted phrase, the hits are merged
    with the references already listed in the file, and the combined list is
    handed to insert_db_records together with the HMDB ID.

    A synonym is dropped when it is the common name of a different
    metabolite, so that hits are not attributed to the wrong record.

    db -- database handle passed through to getcursor().
    """

    logger.debug('Scanning for metabolites')

    info_path = 'metabolite_info.txt'

    # Don't open the index until this enclosing function is called, because
    # it is deleted and re-created earlier in the update process.
    ix = open_index(TEMP_METABOLITE_INDEX_PATH)
    cursor = getcursor(db)

    # query parser and searcher
    parser = QueryParser('abstract', ix.schema)
    parser.add_plugin(PhrasePlugin)
    searcher = ix.searcher(weighting=BM25F)

    try:
        # First pass: collect every common name so that a synonym which is
        # really another metabolite's common name can be recognised below.
        common_name_set = set(
            synonym_line.split('\t')[0]
            for _, synonym_line, _ in _iter_metabolite_records(info_path))

        # Second pass: search the abstracts for each metabolite's names.
        for hmdb_id, synonym_line, reference_line in \
                _iter_metabolite_records(info_path):
            synonyms = synonym_line.split('\t')
            common_name = synonyms[0]

            # Build a new list instead of calling remove() on the list being
            # iterated: removing in place skips the element after each
            # removal, letting some foreign common names slip through.
            names = [s for s in synonyms
                     if s == common_name or s not in common_name_set]

            references = set(reference_line.split('\t'))
            # A blank reference line splits to [''] (the line has already
            # been stripped, so it can never contain '\n'); drop that empty
            # entry rather than recording it as a reference.
            references.discard('')

            for name in names:
                if not name:
                    # nothing to search for (blank line or doubled tab)
                    continue
                query = '"' + name + '"'  # quoted so it is a phrase query
                for item in get_abstracts(parser, searcher, query):
                    references.add(str(item))

            insert_db_records(cursor, hmdb_id, list(references))
    finally:
        # Release the searcher's hold on the index files even on failure.
        searcher.close()

    logger.info('updated metabolite-abstract links')
# --- Example no. 2 ---
import buildindex
from config import ABSTRACT_INDEX_PATH

# set up logging
import logging
logger = logging.getLogger('GADGET.updater.mergeindex')

def merge(ix):
    """Combine all segments of the index ``ix`` so queries run faster.

    Obtains a writer from ``ix`` and commits it with ``optimize=True``,
    logging a debug message before and an info message after.
    Expect this to take a long time.
    """

    logger.debug('merging index segments')
    # No documents are added; the commit is issued only for its optimize step.
    ix.writer().commit(optimize=True)
    logger.info('merged index segments')


if __name__ == '__main__':

    # Step 1: open the abstract index.  Failures are logged and re-raised.
    try:
        ix = buildindex.open_index(ABSTRACT_INDEX_PATH)
    except Exception as err:
        logger.error('could not open index to merge.  Error message: %s', err)
        raise
    else:
        # Step 2: merge its segments, reached only once the index is open.
        # Failures here are likewise logged and re-raised.
        try:
            merge(ix)
        except Exception as err:
            logger.error('could not merge index.  Error message: %s', err)
            raise