Example #1
    # Write each candidate's parent sentence text to the already-open file handle `f`
    for c in tqdm.tqdm(
            session.query(Candidate).filter(
                Candidate.id.in_(target_ids)).all()):
        f.write(c.get_parent().text + "\n")

# ### Generate Data to Train On

# In[ ]:

sql = '''
SELECT id from candidate
WHERE split = 0 and type='disease_gene'
ORDER BY RANDOM()
LIMIT 500000;
'''
target_cids = [x[0] for x in session.execute(sql)]

# In[ ]:

offset = 0
with open("data/doc2vec/train_data_500k.txt", "w") as f:
    while True:
        # Page through the candidates 50,000 at a time instead of loading all 500k at once
        cands = session.query(Candidate).filter(
            Candidate.id.in_(target_cids)).offset(offset).limit(50000).all()

        if len(cands) == 0:
            break

        for c in tqdm.tqdm(cands):
            f.write(c.get_parent().text + "\n")

        # Advance the offset; otherwise the loop re-reads the same page forever
        offset += 50000
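
# The file above holds one parent sentence per line, which is the format
# gensim's TaggedLineDocument reads. A minimal sketch of training a Doc2Vec
# model on it; the hyperparameters and the save path are assumptions, not
# values taken from this notebook.

from gensim.models.doc2vec import Doc2Vec, TaggedLineDocument

# Stream sentences straight from disk; each line becomes one tagged document
corpus = TaggedLineDocument("data/doc2vec/train_data_500k.txt")
model = Doc2Vec(corpus, vector_size=300, min_count=5, epochs=10, workers=4)
model.save("data/doc2vec/doc2vec_500k.model")  # assumed output path
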
Example #2
GeneGene = candidate_subclass('GeneGene', ['Gene1', 'Gene2'])
gge = PretaggedCandidateExtractor(GeneGene, ['Gene', 'Gene'])

CompoundGene = candidate_subclass('CompoundGene', ['Compound', 'Gene'])
cge = PretaggedCandidateExtractor(CompoundGene, ['Compound', 'Gene'])

CompoundDisease = candidate_subclass('CompoundDisease',
                                     ['Compound', 'Disease'])
cde = PretaggedCandidateExtractor(CompoundDisease, ['Compound', 'Disease'])
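
# A minimal sketch of how these pretagged extractors are typically applied to
# a collection of sentences in a Snorkel 0.6-style pipeline. The import, the
# `train_sentences` sample, and the parallelism level are assumptions, not
# values from this excerpt.

from snorkel.models import Sentence  # assumed import

train_sentences = session.query(Sentence).limit(1000).all()  # placeholder sample
for extractor in [gge, cge, cde]:
    extractor.apply(train_sentences, split=0, parallelism=4)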

# In[ ]:

# Set the seed for reproducibility
np.random.seed(100)
total_sentences = session.execute(
    "select count(*) from sentence").fetchone()[0]

# In[ ]:

# Randomly assign each sentence to a split: 0 = train (70%), 1 = dev (20%), 2 = test (10%)
category_list = np.random.choice([0, 1, 2], total_sentences, p=[0.7, 0.2, 0.1])

# In[ ]:

# Divide the sentences into train, dev and test sets

# Grab the sentences
train_sens = set()
dev_sens = set()
test_sens = set()

offset = 0
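
# The paging loop that fills these sets is truncated in this excerpt. A
# minimal sketch of how it could proceed, assuming a Sentence ORM class and
# that each set stores sentence primary keys (both are assumptions):

while True:
    # Page through the sentence table 50,000 rows at a time (page size assumed)
    sens = session.query(Sentence).offset(offset).limit(50000).all()

    if len(sens) == 0:
        break

    for index, sen in enumerate(sens):
        # category_list was drawn above with p=[0.7, 0.2, 0.1]
        category = category_list[offset + index]
        if category == 0:
            train_sens.add(sen.id)
        elif category == 1:
            dev_sens.add(sen.id)
        else:
            test_sens.add(sen.id)

    offset += 50000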
Example #3
    LIMIT 10;
    '''
]

spreadsheet_names = {
    'train': 'data/compound_disease/sentence_labels_train.xlsx',
    'train_hand_label': 'data/compound_disease/sentence_labels_train_dev.xlsx',
    'dev': 'data/compound_disease/sentence_labels_dev.xlsx'
}


# In[ ]:


# Execute each query and write the sampled candidates to its Excel file
for sql, spreadsheet_name in zip(sql_statements, spreadsheet_names.values()):
    target_cids = [x[0] for x in session.execute(sql)]
    candidates = (
        session
        .query(CandidateClass)
        .filter(CandidateClass.id.in_(target_cids))
        .all()
    )
    candidate_df = make_sentence_df(candidates)
    write_candidates_to_excel(candidate_df, spreadsheet_name)
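
# Once annotators fill in the spreadsheets written above, the hand labels can
# be read back with pandas for later evaluation. A minimal sketch; the
# `curated_label` column name is a hypothetical placeholder, not a column
# defined by this project.

import pandas as pd

hand_labels_df = pd.read_excel(spreadsheet_names['train_hand_label'])
print(hand_labels_df['curated_label'].value_counts())  # hypothetical column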


# # Develop Label Functions

# ## Look at potential Candidates

# Use this to look at loaded candidates from a given set. The constants represent the index used to retrieve the appropriate set. Ideally, this is where one can inspect a subset of the candidates and develop label functions for candidate labeling (a minimal label-function sketch appears at the end of this example).

sql_statements = [
    '''
    SELECT id from candidate
    WHERE split = 0 and type='disease_gene'
    ORDER BY RANDOM()
    LIMIT 50000;
    ''',
    
    '''
    SELECT id from candidate
    WHERE split = 1 and type='disease_gene'
    ORDER BY RANDOM()
    LIMIT 10000;
    ''',

    '''
    SELECT id from candidate
    WHERE split = 2 and type='disease_gene'
    ORDER BY RANDOM()
    LIMIT 10000;
    '''
]

# Seed Postgres's RANDOM() so the samples are reproducible, then execute the
# queries and write each result set to an Excel file.
session.execute("SELECT setseed(0.5);")
for sql, spreadsheet_name in zip(sql_statements, spreadsheet_names.values()):
    target_cids = [x[0] for x in session.execute(sql)]
    candidates = (
        session
        .query(Candidate)
        .filter(Candidate.id.in_(target_cids))
        .all()
    )
    candidate_df = make_sentence_df(candidates)
    write_candidates_to_excel(candidate_df, spreadsheet_name)
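
# The "Develop Label Functions" step above starts from candidates like the
# ones exported here. A minimal sketch of one label function, using the same
# get_parent() accessor seen earlier; the keyword cue and the label constants
# are assumptions, not rules from this project.

POSITIVE, NEGATIVE, ABSTAIN = 1, -1, 0

def LF_association_keyword(c):
    """Vote positive when the candidate's sentence contains an association cue."""
    sentence_text = c.get_parent().text.lower()
    return POSITIVE if "associated with" in sentence_text else ABSTAIN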