Example #1
    def setUpClass(cls):
        # This is a hack to create a session to a different DB after Snorkel has
        # already been imported. It does not work in general because e.g., the UDF
        # constructor uses Snorkel's new_sessionmaker on different processes.
        # In general, the connection should still be set via the SNORKELDB
        # environment variable
        dir_path = os.path.dirname(os.path.realpath(__file__))
        snorkel_engine = create_engine(os.path.join('sqlite:///' + dir_path, 'spouses.db'))
        SnorkelSession = sessionmaker(bind=snorkel_engine)
        cls.session = SnorkelSession()

        Spouse = candidate_subclass('Spouse', ['person1', 'person2'])

        cls.train_marginals = load_marginals(cls.session, split=0)

        cls.train_cands = cls.session.query(Spouse).filter(Spouse.split == 0).order_by(Spouse.id).all()
        cls.dev_cands   = cls.session.query(Spouse).filter(Spouse.split == 1).order_by(Spouse.id).all()
        cls.test_cands  = cls.session.query(Spouse).filter(Spouse.split == 2).order_by(Spouse.id).all()

        # Each candidate is featurized as 10 floats. The first five are between
        # -.25 and 1 if the class label is True and between -1 and .25 if False.
        # The remaining five are between -1 and 1.
        cls.F_train = load_feature_matrix(cls.session, split=0, coerce_int=False)
        cls.F_dev = load_feature_matrix(cls.session, split=1, coerce_int=False)
        cls.F_test = load_feature_matrix(cls.session, split=2, coerce_int=False)

        cls.L_gold_dev  = load_gold_labels(cls.session, annotator_name='gold', split=1)
        cls.L_gold_test = load_gold_labels(cls.session, annotator_name='gold', split=2)
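
# Outside of the test-only hack above, the connection is normally configured
# through the SNORKELDB environment variable *before* Snorkel is imported, as
# several of the later examples do. A minimal sketch of that pattern (the
# sqlite path here is illustrative only):
import os

os.environ['SNORKELDB'] = 'sqlite:///spouses.db'

from snorkel import SnorkelSession
session = SnorkelSession()
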
Example #2
                  help="directory to save the rank scores [default: %default]")

parser.set_defaults(
    input_pair="/Users/datienguyen/Desktop/coding/data-search/exp-data/dataSEARCH/pair-store/train.csv",  # alternative input: "3_signals.top20doc.csv"
    saved_dir="../../data-search/exp-data/dataSearch/pair-store/")

opts, args = parser.parse_args(sys.argv)
input_pair = opts.input_pair

session.query(Context).delete()
session.query(Candidate).delete()

values = ['positive', 'negative']
Tweet = candidate_subclass('Tweet', ['tweet'], values=values)

#item_id,worker_id,query_id,doc1,doc2,annotation
cand_dict = {}

with open(input_pair, "r") as myFile:
    reader = csv.reader(myFile)
    for row in reader:
        #print(row)
        item_id = row[0]
        worker_id = row[1]
        anno = row[5]

        if item_id not in cand_dict:
            cand_dict[item_id] = {}
        cand_dict[item_id][worker_id] = anno
password = "******"
dbname = "pubmeddb"

#Path subject to change for different os
database_str = "postgresql+psycopg2://{}:{}@/{}?host=/var/run/postgresql".format(username, password, dbname)
os.environ['SNORKELDB'] = database_str

from snorkel import SnorkelSession
session = SnorkelSession()


# In[3]:


from snorkel.models import candidate_subclass, Candidate
CompoundDisease = candidate_subclass('CompoundDisease', ['Compound', 'Disease'])


# In[4]:


from utils.notebook_utils.dataframe_helper import write_candidates_to_excel, make_sentence_df


# ## Load and Merge DataFrames

# In[5]:


edge_level_df = (
    pd.read_table("input/compound_treats_disease.tsv.xz")
from snorkel.models import candidate_subclass

from .dictionaries import *
from .helper import make_regex

candidate_indices = [
    8, 14, 15, 20, 24
]  # Indices of the corresponding columns in the "comments_bewertungen_new_ids.csv" file

# Candidates
Intelligenz_BB3c = candidate_subclass('Intelligenz', ['signal_word'])
WirkungaufBRD_BB3i = candidate_subclass('WirkungaufBRD', ['signal_word'])
Implikation_BB4 = candidate_subclass(
    'Implikation', ['signal_word']
)  # "Gehören", "sollten", etc.; matched via reflexive personal pronouns, indefinite pronouns, imperatives, no verb
Beschimpfung_BB6a = candidate_subclass('Beschimpfung', ['signal_word'])
Entmenschlichung_BB6e = candidate_subclass('Entmenschlichung', ['signal_word'])

# Regexes
intelligenz_regex = make_regex(intelligenz_signal_words)
wirkungaufbrd_regex = make_regex(wirkungaufbrd_signal_words)
implikation_regex = make_regex(implikation_signal_words)
beschimpfung_regex = make_regex(beschimpfung_signal_words,
                                offense_signal_words,
                                refugee_related_signal_words,
                                negative_signal_words)
entmenschlichung_regex = make_regex(entmenschlichung_signal_words,
                                    animal_signal_words)

candidate_classes = [
    Intelligenz_BB3c, WirkungaufBRD_BB3i, Implikation_BB4, Beschimpfung_BB6a,
    Entmenschlichung_BB6e
]
Example #5
database_str = "postgresql+psycopg2://{}:{}@/{}?host=/var/run/postgresql".format(
    username, password, dbname)
os.environ['SNORKELDB'] = database_str

from snorkel import SnorkelSession
session = SnorkelSession()

# In[3]:

from snorkel.learning.pytorch.rnn.rnn_base import mark_sentence
from snorkel.learning.pytorch.rnn.utils import candidate_to_tokens
from snorkel.models import Candidate, candidate_subclass

# In[4]:

CompoundGene = candidate_subclass('CompoundGene', ['Compound', 'Gene'])

# # Compound Binds Gene

# This section loads the dataframe that contains all compound binds gene candidate sentences and their respective dataset assignments.

# In[5]:

cutoff = 300
total_candidates_df = (
    pd.read_table("../dataset_statistics/data/all_cbg_candidates.tsv.xz"
                  ).query("sen_length < @cutoff"))
total_candidates_df.head(2)

# # Train Word Vectors
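
# In[6]:

# The training code itself is not part of this excerpt. A minimal sketch of the
# step the heading announces, assuming gensim is available and that vectors are
# trained on entity-marked candidate sentences (hyperparameters are
# illustrative, not the original authors'; use vector_size= instead of size= on
# gensim>=4):
from gensim.models import Word2Vec

candidates = session.query(CompoundGene).limit(50000).all()
marked_sentences = [
    mark_sentence(candidate_to_tokens(cand), [
        (cand[0].get_word_start(), cand[0].get_word_end(), 1),
        (cand[1].get_word_start(), cand[1].get_word_end(), 2),
    ])
    for cand in candidates
]
word_model = Word2Vec(marked_sentences, size=300, window=5, min_count=5, workers=4)
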
Example #6
from snorkel import SnorkelSession
session = SnorkelSession()


# In[3]:


from snorkel.learning.pytorch.rnn.utils import candidate_to_tokens
from snorkel.models import Candidate, candidate_subclass


# In[4]:


GeneGene = candidate_subclass('GeneGene', ['Gene1', 'Gene2'])


# In[5]:


total_candidates_df = (
    pd
    .read_table("input/all_gig_candidates.tsv.xz")
    .sort_values("candidate_id")
)
total_candidates_df.head(2)


# In[6]:
            crimetype = 'Drugs related Crime'
            break
    print(doc, date, crimetype)
    dict_final[doc] = {
        'docno': doc,
        'date': date,
        'crimetype': crimetype,
        'location': []
    }

# In[2]:

# In[7]:

from snorkel.models import candidate_subclass
LocationPer = candidate_subclass('LocationPer', ['location', 'person'])
# Location = candidate_subclass('Location', ['location'])

# In[8]:

from snorkel.candidates import Ngrams, CandidateExtractor
from snorkel.matchers import PersonMatcher, LocationMatcher

ngrams = Ngrams(n_max=3)
person_matcher = PersonMatcher(longest_match_only=True)
location_matcher = LocationMatcher(longest_match_only=True)
cand_extractor = CandidateExtractor(LocationPer, [ngrams, ngrams],
                                    [person_matcher, location_matcher],
                                    symmetric_relations=False)

# cand_extractor2 = CandidateExtractor(Location,
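
# The excerpt stops before the extractor above is run. A minimal sketch of the
# usual next step, mirroring the apply/query pattern used elsewhere in this
# collection (the split number is illustrative, and `session` is assumed to be
# the SnorkelSession of the surrounding notebook):
from snorkel.models import Document

docs = session.query(Document).order_by(Document.name).all()
sents = set(s for doc in docs for s in doc.sentences)

cand_extractor.apply(sents, split=0)
print("LocationPer candidates:",
      session.query(LocationPer).filter(LocationPer.split == 0).count())
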
Example #8
#LFs=[LF_edit_index, LF_jackard_index, LF_Common_Tables_Index] # best so far
#LFs=[LF_edit_index,LF_jackard_index,LF_cosine_index,LF_Common_Tables_Index,LF_common_fragment_index]
#LFs=[LF_edit_index,LF_jackard_index,LF_cosine_index,LF_common_fragment_index]
#LFs = [LF_recall_projections2,  LF_recall_selections2,LF_recall_tables2, LF_edit_index, LF_jackard_index,LF_common_fragment_index, LF_Common_Tables_Index]
LFs = [LF_recall_projections2, LF_edit_index, LF_jackard_index]

##### snorkeling

session = SnorkelSession()

doc_preprocessor = TSVDocPreprocessor(path)

corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(doc_preprocessor)

pairs = candidate_subclass('pairs1', ['queryPair'])
regexpmatch = RegexMatchSpan(rgx=".*")
cs = queryCandidate()
cand_extractor = CandidateExtractor(pairs, [cs], [regexpmatch])

docs = session.query(Document).order_by(Document.name).all()
sentences = session.query(Sentence).all()
#print(sentences)

sents = set()
for i, doc in enumerate(docs):
    for s in doc.sentences:
        sents.add(s)

cand_extractor.apply(sents)
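
# The LFs defined at the top of this example are never applied in the excerpt.
# A minimal sketch of the usual labeling step with them (the split value is
# illustrative; the extractor above wrote candidates to the default split 0):
from snorkel.annotations import LabelAnnotator

labeler = LabelAnnotator(lfs=LFs)
L_train = labeler.apply(split=0)
L_train.lf_stats(session)
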
Example #9
%load_ext autoreload
%autoreload 2
%matplotlib inline

from snorkel import SnorkelSession
session = SnorkelSession()
from snorkel.models import CandidateSet
from snorkel.models import candidate_subclass
from snorkel.annotations import FeatureManager

feature_manager = FeatureManager()
entity = candidate_subclass('entity', ['entity1', 'entity2'])
dev = session.query(CandidateSet).filter(CandidateSet.name == 'Protein1 Development Candidates').one()
%time F_dev = feature_manager.update(session, dev, 'Train1 Features', False)

from snorkel.annotations import LabelManager

label_manager = LabelManager()
L_gold_dev = label_manager.load(session, dev, "Sotera User")
gold_dev_set = session.query(CandidateSet).filter(CandidateSet.name == 'Protein Development Candidates').one()


from snorkel.learning import LogReg
from snorkel.learning_utils import RandomSearch, ListParameter, RangeParameter

iter_param = ListParameter('n_iter', [250, 500, 1000, 2000])
rate_param = RangeParameter('rate', 1e-4, 1e-2, step=0.75, log_base=10)
reg_param = RangeParameter('mu', 1e-8, 1e-2, step=1, log_base=10)

disc_model = LogReg()
from snorkel.candidates import Ngrams, CandidateExtractor
from snorkel.viewer import SentenceNgramViewer
from snorkel.annotations import LabelAnnotator, load_gold_labels, FeatureAnnotator, save_marginals, load_marginals
from snorkel.learning import SparseLogisticRegression, GenerativeModel, RandomSearch
from snorkel.learning.structure import DependencySelector
from snorkel.learning.utils import MentionScorer
# from snorkel.contrib.rnn import reRNN

import matchers
import LF
from candidate_adjective_fixer import *
from load_external_annotations_new import load_external_labels

session = SnorkelSession()

BiomarkerCondition = candidate_subclass('BiomarkerCondition',
                                        ['biomarker', 'condition'])

# Helper functions

# In[ ]:

#------------------
# Helper Functions
#------------------


def grabCandidates(extractor, schema):
    # Candidate Counts
    for k, sents in enumerate([train_sents, dev_sents, test_sents]):
        extractor.apply(sents, split=k, clear=False)
        print "Number of candidates: ", session.query(schema).filter(
Example #11
    %time corpus = cp.parse_corpus(session, name)
    session.commit()

sentences = set()
for document in corpus:
    for sentence in document.sentences:
        if number_of_people(sentence) < 5:
            sentences.add(sentence)





from snorkel.models import candidate_subclass

Title = candidate_subclass('Person_Org', ['person1', 'organization'])

from snorkel.candidates import Ngrams

ngrams = Ngrams(n_max=3)

from snorkel.matchers import PersonMatcher

from snorkel.matchers import OrganizationMatcher

person_matcher = PersonMatcher(longest_match_only=True)

org_matcher = OrganizationMatcher(longest_match_only=True)

from snorkel.candidates import CandidateExtractor
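
# The excerpt jumps to a helper function below before the extractor is built.
# A minimal sketch of how the pieces defined so far would typically be wired
# together, mirroring the other extractors in this collection (the split
# number is illustrative):
cand_extractor = CandidateExtractor(Title, [ngrams, ngrams],
                                    [person_matcher, org_matcher],
                                    symmetric_relations=False)
cand_extractor.apply(sentences, split=0)
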
def run(candidate1, candidate2, pairing_name, cand1_ngrams, cand2_ngrams,
        cand1Matcher, cand2Matcher, model_name, output_file_name,
        corpus_parser):
    print "Started"
    session = SnorkelSession()

    # The following line is for testing only. Feel free to ignore it.

    candidate_pair = candidate_subclass(pairing_name, [candidate1, candidate2])

    sentences = set()
    docs = session.query(Document).order_by(Document.name).all()
    for doc in docs:
        for s in doc.sentences:
            sentences.add(s)

    cand_1_ngrams = Ngrams(n_max=cand1_ngrams)
    # condition_ngrams = Ngrams(n_max=7)
    cand_2_ngrams = Ngrams(n_max=cand2_ngrams)
    # medium_ngrams = Ngrams(n_max=5)
    # type_ngrams = Ngrams(n_max=5)  # <--- Q: should we cut these down?
    # # level_ngrams = Ngrams(n_max=1)
    # unit_ngrams = Ngrams(n_max=1)

    # Construct our Matchers

    # cMatcher = matchers.getConditionMatcher()
    # mMatcher = matchers.getMediumMatcher()
    # tMatcher = matchers.getTypeMatcher()
    # lMatcher = matchers.getLevelMatcher()
    # uMatcher = matchers.getUnitMatcher()

    # Building the CandidateExtractors
    # candidate_extractor_BC = CandidateExtractor(BiomarkerCondition, [biomarker_ngrams, condition_ngrams], [bMatcher, cMatcher])
    candidate_extractor = CandidateExtractor(candidate_pair,
                                             [cand_1_ngrams, cand_2_ngrams],
                                             [cand1Matcher, cand2Matcher])
    # candidate_extractor_BM = CandidateExtractor(BiomarkerMedium, [biomarker_ngrams, medium_ngrams], [bMatcher, mMatcher])
    # candidate_extractor_BT = CandidateExtractor(BiomarkerType, [biomarker_ngrams, type_ngrams], [bMatcher, tMatcher])
    # candidate_extractor_BLU = CandidateExtractor(BiomarkerLevelUnit, [biomarker_ngrams, level_ngrams, unit_ngrams], [bMatcher, lMatcher, uMatcher])

    # List of Candidate Sets for each relation type: [train, dev, test]
    candidate_extractor.apply(sentences, split=4, clear=True)
    cands = session.query(candidate_pair).filter(
        candidate_pair.split == 4).order_by(candidate_pair.id).all()
    session.commit()
    # cands_BD = grabCandidates(candidate_extractor_BD, BiomarkerDrug)
    # cands_BM = grabCandidates(candidate_extractor_BM, BiomarkerMedium)
    # cands_BT = grabCandidates(candidate_extractor_BT, BiomarkerType)
    # cands_BLU = grabCandidates(candidate_extractor_BLU, BiomarkerLevelUnit)

    if (len(cands)) == 0:
        print "No Candidates Found"
        return
    if (pairing_name == 'BiomarkerCondition'):
        # session.rollback()
        # print "Number of dev BC candidates without adj. boosting: ", len(cands_BC[1])
        add_adj_candidate_BC(session, candidate_pair, cands, 4)
        # fix_specificity(session, BiomarkerCondition, cands_BC[1])
        # print "Number of dev BC candidates with adj. boosting: ", session.query(BiomarkerCondition).filter(BiomarkerCondition.split == 4).count()
        session.commit()

    from snorkel.contrib.rnn import reRNN  # assumed import path, per the commented import earlier in this collection
    lstm = reRNN(seed=1701, n_threads=None)

    lstm.load(model_name)

    predictions = lstm.predictions(cands)
    output_file = open(output_file_name, 'w', newline='')
    import csv
    from unidecode import unidecode
    csvWriter = csv.writer(output_file)
    csvWriter.writerow(
        ['doc_id', 'sentence', candidate1, candidate2, 'prediction'])
    for i in range(len(cands)):
        doc_string = 'PMC' + str(cands[i].get_parent().get_parent())[9:]
        sentence_string = cands[i].get_parent().text
        cand_1_string = cands[i].get_contexts()[0].get_span()
        cand_2_string = cands[i].get_contexts()[1].get_span()
        prediction = predictions[i]
        csvWriter.writerow([
            unidecode(doc_string),
            unidecode(sentence_string),
            unidecode(cand_1_string),
            unidecode(cand_2_string), prediction
        ])
Example #13
sys.path.insert(1, '../snorkel')

from snorkel import SnorkelSession
from snorkel.matchers import DictionaryMatch
from final_candidates import GM, PM
from snorkel.candidates import Ngrams, CandidateSpace, CandidateExtractor
from snorkel.models import Document, Sentence, candidate_subclass
from snorkel.viewer import SentenceNgramViewer

SPLIT_ON_DOCS = False
ALL_DOCS = True  # if true, create train dev and test. if false, push everything to dev cands.

session = SnorkelSession()

GenePhenoPair = candidate_subclass('GenePhenoPair2', ['gene', 'pheno'])

gene_ngrams = Ngrams(n_max=5)
pheno_ngrams = Ngrams(n_max=10)
cand_extractor = CandidateExtractor(GenePhenoPair, [gene_ngrams, pheno_ngrams],
                                    [GM, PM],
                                    symmetric_relations=True)

print "Splitting Docs..."
pathname = 'small_data/' if os.environ[
    'AGP_DATA_SIZE'] == 'small-data' else 'data/'
with open(pathname + 'pmcids_400.pkl', 'rb') as f:
    sent_dicts = cPickle.load(f)
train_ids, dev_ids, test_ids = set(sent_dicts['train']), set(
    sent_dicts['dev']), set(sent_dicts['test'])
all_ids = train_ids.union(dev_ids).union(test_ids)
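
# The excerpt ends once the id sets are built. A minimal sketch of one common
# follow-up: partition sentences by document id and extract candidates per
# split (the same loop structure example #20 below uses):
train_sents, dev_sents, test_sents = set(), set(), set()
docs = session.query(Document).order_by(Document.name).all()
for doc in docs:
    for s in doc.sentences:
        if doc.name in train_ids:
            train_sents.add(s)
        elif doc.name in dev_ids:
            dev_sents.add(s)
        else:
            test_sents.add(s)

for k, sents in enumerate([train_sents, dev_sents, test_sents]):
    cand_extractor.apply(sents, split=k, clear=False)
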
    rule_regex_search_before_A,
    rule_regex_search_before_B,
)


# A ContextSpace defines the "space" of all candidates we even potentially consider; in this case we use the Ngrams subclass and look for all n-grams up to 6 words long (matching the Ngrams(n_max=6) extractor below)

session = SnorkelSession()

doc_preprocessor = TSVDocPreprocessor('/Users/fanglinchen/Desktop/PersonalDataStack/DeepScrub/DeepScrub/algorithms/input.tsv', max_docs=350) 
corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(doc_preprocessor)

Sensitive = candidate_subclass('Sensitive', ['sensitive'], values = ['person', 'job', 'event', 
                                                                    'place', 'date', 'time', 
                                                                    'product', 'email', 'phone', 
                                                                    'quantity', 'address', 'url', 
                                                                    'org', 'file', 'password', False])
# generating candidates. 
ngrams = Ngrams(n_max=6)
ngramMatcher = NgramMatcher(longest_match_only = False)


cand_extractor = CandidateExtractor(
    Sensitive, 
    [ngrams],
    [ngramMatcher],
    symmetric_relations=False
)
sents = session.query(Sentence).all()
cand_extractor.apply(sents, split=0)
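
# A minimal sketch of inspecting what the extractor wrote (all candidates were
# extracted into split 0 above):
train_cands = session.query(Sensitive).filter(Sensitive.split == 0).all()
print("Sensitive candidates extracted:", len(train_cands))
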
Example #15
experiment_name = '_exp3'
# experiment_name = '25similar'
print "Setting up variables & DB connection for experiment:\n"
print "*******************\n%s\n*******************" % experiment_name

# point to appropriate DBs, pickle files etc.
pkl_paths = 'pickles/%s/' % experiment_name
path_candidate_dict_pkl = pkl_paths + 'candidate_dict.pickle'  # TODO rename that
path_pubmed_ids_pkl = pkl_paths + 'pubmed_ids.pickle'
path_base_learners = pkl_paths + 'base_learner_predictions'  # TODO create dirs

# Shortcuts to connect to database, initialize candidate subclass and return snorkel session
import os
#TODO: set experiment_name and restructure dir
os.environ['SNORKELDB'] = 'postgres:///snorkel' + experiment_name

from snorkel import SnorkelSession
session = SnorkelSession()
from snorkel.models import Document, Sentence
import matplotlib.pyplot as plt
from snorkel.annotations import save_marginals
from snorkel.models import Candidate, candidate_subclass
REGULATOR = candidate_subclass('REGULATOR', ['Chemical', 'Gene'])

print "\nSnorkel session connected to: ", os.environ['SNORKELDB']
Example #16
        virus_list.remove(word)

# ------------------------------------------

# START SNORKEL SESSION

session = SnorkelSession()

n_docs = 500

doc_preprocessor = TSVDocPreprocessor(
    'pdfs_big.tsv', max_docs=n_docs)  # new files (88 papers)
corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(doc_preprocessor, count=n_docs)

VirusHost = candidate_subclass('VirusHost', ['virus', 'host'])

ngrams = Ngrams(n_max=10)
virus_matcher = DictionaryMatch(d=virus_list)
animals_matcher = DictionaryMatch(d=animals_list)
cand_extractor = CandidateExtractor(VirusHost, [ngrams, ngrams], [
                                    virus_matcher, animals_matcher], nested_relations=True)

docs = session.query(Document).order_by(Document.name).all()

# Text Pattern based labeling functions, which look for certain keywords

# List to parenthetical


def ltp(x):
    return '(' + '|'.join(x) + ')'
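
# A minimal sketch of one keyword-based labeling function of the kind the
# comment above announces (the keyword list is illustrative; 1 marks a positive
# VirusHost pair, 0 abstains), using Snorkel's lf_helpers:
from snorkel.lf_helpers import get_between_tokens

infection_words = {'infect', 'infects', 'infected', 'infection'}

def LF_infection_between(c):
    return 1 if infection_words.intersection(get_between_tokens(c)) else 0
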
from snorkel.annotations import FeatureAnnotator, LabelAnnotator, save_marginals
from snorkel.learning import GenerativeModel
from snorkel.learning.utils import MentionScorer
from snorkel.models import Candidate, FeatureKey, candidate_subclass
from snorkel.utils import get_as_dict
from tree_structs import corenlp_to_xmltree
from treedlib import compile_relation_feature_generator

# In[ ]:

edge_type = "dg"

# In[ ]:

if edge_type == "dg":
    DiseaseGene = candidate_subclass('DiseaseGene', ['Disease', 'Gene'])
elif edge_type == "gg":
    GeneGene = candidate_subclass('GeneGene', ['Gene1', 'Gene2'])
elif edge_type == "cg":
    CompoundGene = candidate_subclass('CompoundGene', ['Compound', 'Gene'])
elif edge_type == "cd":
    CompoundDisease = candidate_subclass('CompoundDisease',
                                         ['Compound', 'Disease'])
else:
    print("Please pick a valid edge type")

# # Load preprocessed data

# This code will load the label matrix that was generated in the previous notebook ([Notebook 2](2.data-labeler.ipynb)). **Disclaimer**: this block might break, which means your Snorkel installation is still running the old code. The problem with the old code is that sqlalchemy attempts to load all the labels into memory; that is fine while the number of labels stays small, but it does not scale as the number of labels grows. The good news is that there is a pull request that fixes this issue. [Check it out here!](https://github.com/HazyResearch/snorkel/pull/789)

# In[ ]:
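
# A minimal sketch of the load step described above, assuming the label matrix
# for the training split was written by the previous notebook (the split number
# is illustrative):
from snorkel.annotations import load_label_matrix

L_train = load_label_matrix(session, split=0)
L_train
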
Example #18
docs_per_bucket = args.docs_per_bucket
sents_split = defaultdict(lambda: [])
for ind, doc in enumerate(docs):
    bucket = int(ind / docs_per_bucket)
    for s in doc.sentences:
        sents_split[bucket] += [s]
print("Number of buckets: (should have around ~100 buckets??)",
      len(sents_split))

from snorkel.models import candidate_subclass
from snorkel.candidates import Ngrams, CandidateExtractor
from snorkel.matchers import *
import datetime

Unigram = candidate_subclass('Unigram', ['unigram_cue'],
                             values=['PP', 'MN', 'NULL'])
ngrams = Ngrams(n_max=1)
ngram_matcher = NgramMatcher()
unigram_segment_extractor = CandidateExtractor(Unigram, [ngrams],
                                               [ngram_matcher])

# from snorkel.lf_helpers import *
from snorkel.annotations import LabelAnnotator

# from LF.util_common_default_categorical import purpose_LFs,mechanism_LFs,null_LFs
from LF.util_common_default_categorical_onset_1026 import *

# purpose_LFs,mechanism_LFs,null_LFs
print("total LF count",
      len(purpose_LFs + mechanism_LFs + null_LFs), "unique count",
      len(set(purpose_LFs + mechanism_LFs + null_LFs)), "purpose_LFs",
      len(purpose_LFs))
Example #19
database_str = "postgresql+psycopg2://{}:{}@/{}?host=/var/run/postgresql".format(
    username, password, dbname)
os.environ['SNORKELDB'] = database_str

from snorkel import SnorkelSession
session = SnorkelSession()

# In[3]:

from snorkel.learning.pytorch.rnn.rnn_base import mark_sentence
from snorkel.learning.pytorch.rnn.utils import candidate_to_tokens
from snorkel.models import Candidate, candidate_subclass

# In[4]:

DiseaseGene = candidate_subclass('DiseaseGene', ['Disease', 'Gene'])

# # Disease Associates Gene

# This section loads the dataframe that contains all disease associates gene candidate sentences and their respective dataset assignments.

# In[5]:

cutoff = 300
total_candidates_df = (
    pd.read_table("../dataset_statistics/data/all_dg_candidates_map.tsv.xz"
                  ).query("sen_length < @cutoff"))
total_candidates_df.head(2)

# # Embed All Disease Gene Sentences
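
# In[6]:

# The embedding code itself is not included in this excerpt. A minimal sketch
# of the step the heading announces, using the helpers imported above to turn
# each candidate sentence into an entity-marked token sequence and then into a
# sequence of word indices (the word-to-index mapping here is illustrative):
word_dict = {}
embedded_sentences = []
for cand in session.query(DiseaseGene).limit(1000).all():
    tokens = mark_sentence(candidate_to_tokens(cand), [
        (cand[0].get_word_start(), cand[0].get_word_end(), 1),
        (cand[1].get_word_start(), cand[1].get_word_end(), 2),
    ])
    embedded_sentences.append(
        [word_dict.setdefault(w, len(word_dict) + 1) for w in tokens])
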
Example #20
        if doc.name in train_ids:
            train_sents.add(s)
        elif doc.name in dev_ids:
            dev_sents.add(s)
        elif doc.name in test_ids:
            test_sents.add(s)
        else:
            raise Exception('ID <{0}> not found in any id set'.format(
                doc.name))

#----------------------
# Candidate Extraction
#----------------------

# Defining the Candidate Schemas
BiomarkerCondition = candidate_subclass('BiomarkerCondition',
                                        ['biomarker', 'condition'])
BiomarkerDrug = candidate_subclass('BiomarkerDrug', ['biomarker', 'drug'])
BiomarkerMedium = candidate_subclass('BiomarkerMedium',
                                     ['biomarker', 'medium'])

# N-grams: the probabilistic search space of our entities
biomarker_ngrams = Ngrams(n_max=1)
condition_ngrams = Ngrams(n_max=7)
drug_ngrams = Ngrams(n_max=5)
medium_ngrams = Ngrams(n_max=5)
type_ngrams = Ngrams(n_max=5)  # <--- Q: should we cut these down?

# Construct our Matchers
bMatcher = matchers.getBiomarkerMatcher()
cMatcher = matchers.getDiseaseMatcher()
dMatcher = matchers.getDrugMatcher()