Пример #1
0
def tag_sentences():
    '''
    input: train_sentences.json
    output: train_tag_sentences.json
    Tags the chemicals in each sentence using ChemicalTagger
    (http://chemicaltagger.ch.cam.ac.uk/). This method communicates with
    ChemicalTagger through a custom REST API running on pathway.berkeley.edu

    Loads the {sentence id: sentence} mapping from the input file, calls
    chemtagger.get_compounds() on every entry and keeps only the sentences
    for which a truthy result came back. The resulting
    {sentence id: compounds} mapping is written as indented JSON with
    sorted keys. Progress is reported through the pbar() progress bar.
    '''
    # Context managers make sure both file handles are closed (and the
    # output flushed) even if tagging or serialisation raises.
    with open('../data/train_sentences.json') as infile:
        sentences = json.load(infile)

    bar = pbar(len(sentences))
    print('Tagging chemicals in sentences')
    bar.start()
    chemicals = {}
    # Count from 1 so the bar shows the number of sentences already done.
    for done, (sid, sentence) in enumerate(sentences.iteritems(), 1):
        chems = chemtagger.get_compounds(sid, sentence)
        if chems:
            # Sentences without any tagged compound are left out entirely.
            chemicals[sid] = chems
        bar.update(done)
    bar.finish()

    with open('../data/train_tag_sentences.json', 'wb') as outfile:
        json.dump(chemicals, outfile, indent=2, sort_keys=True)
    print('Result dumped to ../data/train_tag_sentences.json')
Пример #2
0
def extract(sid, sentence):
    '''
    Finds pattern matches involving the chemicals tagged in a sentence.

    sid: sentence identifier, passed through to chemtagger.get_compounds()
    sentence: the raw sentence text (split on whitespace below)

    Returns a list of (pattern_id, match) tuples, one for every expanded
    match that is truthy and has more than one element. The list is empty
    when chemtagger.get_compounds() returns None.
    '''
    reactants = []
    chemicals = chemtagger.get_compounds(sid, sentence)
    if chemicals is None:
        return reactants
    chemicals = sanitize_chemicals(chemicals)

    # Individual words that make up the chemical names. A set keeps the
    # per-token membership test below cheap.
    chem_words = set(word for name in chemicals for word in name.split())

    # Stem every token except those belonging to a chemical name, so the
    # names still match verbatim in the replacement step.
    stemmed_sentence = ' '.join([
        token if token in chem_words else STEMMER.stem(token)
        for token in sentence.split()])

    # Pad with spaces so names at the very start/end of the sentence are
    # matched by the space-delimited replacement as well.
    tagged_sentence = ' %s ' % stemmed_sentence
    # Longest names first: once a short name has been wrapped in its
    # $...$chem marker, a longer name containing it would no longer match.
    for chem in sorted(chemicals, key=len, reverse=True):
        tagged_sentence = tagged_sentence.replace(
            ' %s ' % chem, ' $%s$chem ' % chem)
    tagged_sentence = tagged_sentence.strip()
    grouped_sentence = group_list(tagged_sentence)

    # Expand the patterns once and reuse the result for every lookup.
    patterns_by_id = expand_patterns()
    for pattern_id in patterns_by_id:
        for pattern in patterns_by_id[pattern_id]:
            # NOTE(review): the overlapped= keyword suggests these patterns
            # are compiled with the third-party `regex` package rather than
            # the stdlib `re` -- confirm against expand_patterns().
            groups = pattern.findall(grouped_sentence, overlapped=True)
            for match in expand_chems(groups):
                if match and len(match) > 1:
                    reactants.append((pattern_id, match))
    return reactants