Example #1
import sys

from nltk.stem.snowball import SnowballStemmer

from helper.dictionaries import load_dict

# NOTE: ORDINALS (a set of ordinal words) is defined elsewhere in the
# original module and is assumed to be in scope here.


def main():
    # Load the dictionaries we need
    stopwords_dict = load_dict("stopwords")
    hpoterm_phenotype_abnormalities = load_dict(
        "hpoterm_phenotype_abnormalities")
    # Load the stemmer from NLTK
    stemmer = SnowballStemmer("english")
    if len(sys.argv) != 2:
        sys.stderr.write("USAGE: {} DICT\n".format(sys.argv[0]))
        sys.exit(1)
    with open(sys.argv[1], 'rt') as dict_file:
        for line in dict_file:
            # Skip empty lines
            if line.strip() == "":
                continue
            hpo_id, name, definition = line.strip().split("\t")
            # Skip if this is not a phenotypic abnormality
            if hpo_id not in hpoterm_phenotype_abnormalities:
                continue
            tokens = name.split()
            if len(tokens) == 1:
                name_stems = [tokens[0].casefold()]
            else:
                # Compute the stems of the name
                name_stems = set()
                for word in tokens:
                    # Strip leading parentheses and trailing parentheses,
                    # commas, and colons
                    if word[0] == "(":
                        word = word[1:]
                    if word[-1] == ")":
                        word = word[:-1]
                    if word[-1] == ",":
                        word = word[:-1]
                    if word[-1] == ":":
                        word = word[:-1]
                    # Keep words that are not stop words or ordinals;
                    # single letters are always kept
                    if (word.casefold() not in stopwords_dict
                            and word not in ORDINALS) or len(word) == 1:
                        # split words that contain a "/"
                        if word.find("/") != -1:
                            for part in word.split("/"):
                                name_stems.add(stemmer.stem(part))
                        else:
                            name_stems.add(stemmer.stem(word))
            print("\t".join([hpo_id, name, "|".join(name_stems)]))
Example #2
def main():
    # Load the dictionaries we need
    stopwords_dict = load_dict("stopwords")
    hpoterm_phenotype_abnormalities = load_dict("hpoterm_phenotype_abnormalities")
    # Load the stemmer from NLTK
    stemmer = SnowballStemmer("english")
    if len(sys.argv) != 2:
        sys.stderr.write("USAGE: {} DICT\n".format(sys.argv[0]))
        sys.exit(1)
    with open(sys.argv[1], "rt") as dict_file:
        for line in dict_file:
            # Skip empty lines
            if line.strip() == "":
                continue
            hpo_id, name, definition = line.strip().split("\t")
            # Skip if this is not a phenotypic abnormality
            if hpo_id not in hpoterm_phenotype_abnormalities:
                continue
            tokens = name.split()
            if len(tokens) == 1:
                name_stems = [tokens[0].casefold()]
            else:
                # Compute the stems of the name
                name_stems = set()
                for word in tokens:
                    # Strip leading parentheses and trailing parentheses,
                    # commas, and colons
                    if word[0] == "(":
                        word = word[1:]
                    if word[-1] == ")":
                        word = word[:-1]
                    if word[-1] == ",":
                        word = word[:-1]
                    if word[-1] == ":":
                        word = word[:-1]
                    # Keep words that are not stop words or ordinals;
                    # single letters are always kept
                    if (word.casefold() not in stopwords_dict and word not in ORDINALS) or len(word) == 1:
                        # split words that contain a "/"
                        if word.find("/") != -1:
                            for part in word.split("/"):
                                name_stems.add(stemmer.stem(part))
                        else:
                            name_stems.add(stemmer.stem(word))
            print("\t".join([hpo_id, name, "|".join(name_stems)]))
Example #3
#! /usr/bin/env python3

from helper.dictionaries import load_dict

if __name__ == "__main__":
    merged_genes_dict = load_dict("merged_genes")
    inverted_long_names = load_dict("inverted_long_names")
    hpoterms_orig = load_dict("hpoterms_orig")

    for long_name in inverted_long_names:
        for hpoterm_name in hpoterms_orig:
            if long_name in hpoterm_name.split() and \
                    long_name.casefold() != hpoterm_name.casefold():
                print("\t".join((long_name, hpoterm_name)))
Example #4
GENE_KWS = frozenset([
    "gene", "oncogene", "protooncogene", "proto-oncogene", "pseudogene",
    "transgene"])

COEXPRESSION_KWS = frozenset([
    "expression", "overexpression", "over-expression", "co-expression",
    "coexpression"])


KEYWORDS = VAR_KWS | KNOCK_KWS | AMINO_ACID_KWS | ANTIGENE_KWS | DNA_KWS | \
    DOWNREGULATION_KWS | TUMOR_KWS | GENE_KWS | COEXPRESSION_KWS

# Load the dictionaries that we need
merged_genes_dict = load_dict("merged_genes")
long_names_dict = load_dict("long_names")
inverted_long_names = load_dict("inverted_long_names")
hpoterms_with_gene = load_dict("hpoterms_with_gene")
stopwords_dict = load_dict("stopwords")


# Add features to a gene mention candidate
def add_features(mention_id, mention_words, sentence):
    # The verb closest to the candidate, with the path to it.
    minl = 100
    minp = None
    minw = None
    for word in mention_words:
        for word2 in sentence.words:
            if word2.lemma.isalpha() and re.search('^VB[A-Z]*$', word2.pos) \
Example #5
GENE_KWS = frozenset([
    "gene", "oncogene", "protooncogene", "proto-oncogene", "pseudogene",
    "transgene"
])

COEXPRESSION_KWS = frozenset([
    "expression", "overexpression", "over-expression", "co-expression",
    "coexpression"
])


KEYWORDS = VAR_KWS | KNOCK_KWS | AMINO_ACID_KWS | ANTIGENE_KWS | DNA_KWS | \
    DOWNREGULATION_KWS | TUMOR_KWS | GENE_KWS | COEXPRESSION_KWS

# Load the dictionaries that we need
merged_genes_dict = load_dict("merged_genes")
long_names_dict = load_dict("long_names")
inverted_long_names = load_dict("inverted_long_names")
hpoterms_with_gene = load_dict("hpoterms_with_gene")
stopwords_dict = load_dict("stopwords")


# Add features to a gene mention candidate
def add_features(mention_id, mention_words, sentence):
    # The verb closest to the candidate, with the path to it.
    minl = 100
    minp = None
    minw = None
    for word in mention_words:
        for word2 in sentence.words:
            if word2.lemma.isalpha() and re.search('^VB[A-Z]*$', word2.pos) \
Example #6
#    "EXT_KEYWORD_MIN_[activation]nn@",
#    "EXT_KEYWORD_MIN_[oligomerization]nn@",
#    "EXT_KEYWORD_MIN_[methylation]prep_of@",
#    "EXT_KEYWORD_MIN_[antibody]nn@",
#    "EXT_KEYWORD_MIN_[polymorphism]prep_of@",
#    "EXT_KEYWORD_MIN_[gene]appos@",
#    "EXT_KEYWORD_MIN_[enzyme]@nn",
#    "EXT_KEYWORD_MIN_[phosphorylation]prep_of@",
#    "EXT_KEYWORD_MIN_[receptor]@nn",
#    "EXT_KEYWORD_MIN_[histone]@nn",
#    "EXT_KEYWORD_MIN_[receptor]nn",
#    "IS_LONG_ALPHANUMERIC_MAIN_SYMBOL", "IS_HYPHENATED_SYMBOL", "IS_LONG_NAME"
#    ])

# Load the dictionaries that we need
merged_genes_dict = load_dict("merged_genes")
english_dict = load_dict("english")
stopwords_dict = load_dict("stopwords")
pos_mentions_dict = load_dict("pos_gene_mentions")
neg_mentions_dict = load_dict("neg_gene_mentions")
med_acrons_dict = load_dict("med_acrons")
long_names_dict = load_dict("long_names")
inverted_long_names = load_dict("inverted_long_names")
hpoterms_with_gene = load_dict("hpoterms_with_gene")

# Max mention length. We won't look at subsentences longer than this.
max_mention_length = 0
for key in merged_genes_dict:
    length = len(key.split())
    if length > max_mention_length:
        max_mention_length = length
Example #7
        if frozenset([gene_mention.words[0].word, hpo_entity_id]) in \
                genehpoterms_dict:
            in_mapping = True
        else:
            for gene in gene_mention.entity.split("|"):
                if frozenset([gene, hpo_entity_id]) in \
                        genehpoterms_dict:
                    in_mapping = True
                    break
        if in_mapping:
            relation.is_correct = True
            relation.type = "GENEHPOTERM_SUP_MAP"


# Load the gene<->hpoterm dictionary
genehpoterms_dict = load_dict("genehpoterms")

if __name__ == "__main__":
    # Process input
    with fileinput.input() as input_files:
        for line in input_files:
            # Parse the TSV line
            line_dict = get_dict_from_TSVline(
                line, ["doc_id", "sent_id", "wordidxs", "words", "poses",
                       "ners", "lemmas", "dep_paths", "dep_parents",
                       "bounding_boxes", "gene_entities", "gene_wordidxss",
                       "gene_is_corrects", "gene_types",
                       "hpoterm_entities", "hpoterm_wordidxss",
                       "hpoterm_is_corrects", "hpoterm_types"],
                [no_op, int, lambda x: TSVstring2list(x, int), TSVstring2list,
                 TSVstring2list, TSVstring2list, TSVstring2list,
Example #8
#    "EXT_KEYWORD_MIN_[activation]nn@",
#    "EXT_KEYWORD_MIN_[oligomerization]nn@",
#    "EXT_KEYWORD_MIN_[methylation]prep_of@",
#    "EXT_KEYWORD_MIN_[antibody]nn@",
#    "EXT_KEYWORD_MIN_[polymorphism]prep_of@",
#    "EXT_KEYWORD_MIN_[gene]appos@",
#    "EXT_KEYWORD_MIN_[enzyme]@nn",
#    "EXT_KEYWORD_MIN_[phosphorylation]prep_of@",
#    "EXT_KEYWORD_MIN_[receptor]@nn",
#    "EXT_KEYWORD_MIN_[histone]@nn",
#    "EXT_KEYWORD_MIN_[receptor]nn",
#    "IS_LONG_ALPHANUMERIC_MAIN_SYMBOL", "IS_HYPHENATED_SYMBOL", "IS_LONG_NAME"
#    ])

# Load the dictionaries that we need
merged_genes_dict = load_dict("merged_genes")
english_dict = load_dict("english")
stopwords_dict = load_dict("stopwords")
pos_mentions_dict = load_dict("pos_gene_mentions")
neg_mentions_dict = load_dict("neg_gene_mentions")
med_acrons_dict = load_dict("med_acrons")
long_names_dict = load_dict("long_names")
inverted_long_names = load_dict("inverted_long_names")
hpoterms_with_gene = load_dict("hpoterms_with_gene")

# Max mention length. We won't look at subsentences longer than this.
max_mention_length = 0
for key in merged_genes_dict:
    length = len(key.split())
    if length > max_mention_length:
        max_mention_length = length
Example #9
                            is_definition = False
                            break
                    definition = " ".join([w.word for w in window_words])
                    # Only consider this acronym if the definition is valid
                    if is_definition:
                        acronym = dict()
                        acronym["acronym"] = word.word
                        acronym["definition"] = definition
                        acronyms.append(acronym)
                        break
                    start_idx += 1
    return acronyms


# Load the genes dictionary
merged_genes_dict = load_dict("merged_genes")
inverted_long_names = load_dict("inverted_long_names")

if __name__ == "__main__":
    # Process the input
    with fileinput.input() as input_files:
        for line in input_files:
            # Parse the TSV line
            line_dict = get_dict_from_TSVline(
                line,
                ["doc_id", "sent_ids", "wordidxss", "wordss", "posess", 
                    "nerss", "lemmass", "dep_pathss", "dep_parentss",
                    "bounding_boxess"],
                [no_op, lambda x: TSVstring2list(x, int), 
                    lambda x: TSVstring2list(x,sep='!~!'), 
                    lambda x: TSVstring2list(x,sep='!~!'), 
Example #10
    "cronic", "deletion", "detection", "diagnose", "diagnosis", "disease",
    "drug", "family", "gene", "genome", "genomic", "genotype", "give", "grade",
    "group", "history", "infection", "inflammatory", "injury", "mutation",
    "pathway", "phenotype", "polymorphism", "prevalence", "protein", "risk",
    "severe", "stage", "symptom", "syndrome", "therapy", "therapeutic",
    "treat", "treatment", "variant"
    "viruses", "virus"
])

PATIENT_KWS = frozenset(
    ["boy", "girl", "man", "woman", "men", "women", "patient", "patients"])

KEYWORDS = VAR_KWS | PATIENT_KWS

# Load the dictionaries that we need
english_dict = load_dict("english")
stopwords_dict = load_dict("stopwords")
inverted_hpoterms = load_dict("hpoterms_inverted")
hponames_to_ids = load_dict("hponames_to_ids")
genes_with_hpoterm = load_dict("genes_with_hpoterm")
# hpodag = load_dict("hpoparents")

stems = set()
for hpo_name in inverted_hpoterms:
    stem_set = inverted_hpoterms[hpo_name]
    stems |= stem_set
stems = frozenset(stems)

# The keys of the following dictionary are sets of stems, and the values are
# sets of hpoterms whose name, without stopwords, gives origin to the
# corresponding set of stems (as key)
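
The comment above describes a reverse index from stem sets to term names. A minimal sketch of how such a mapping could be built from inverted_hpoterms, assuming it maps each HPO term name to a set of stems (the variable name below is hypothetical):

from collections import defaultdict

# Hypothetical reverse index: frozenset of stems -> set of HPO term names.
stems_to_hpoterms = defaultdict(set)
for hpo_name, stem_set in inverted_hpoterms.items():
    stems_to_hpoterms[frozenset(stem_set)].add(hpo_name)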
Example #11
        if frozenset([gene_mention.words[0].word, hpo_entity_id]) in \
                genehpoterms_dict:
            in_mapping = True
        else:
            for gene in gene_mention.entity.split("|"):
                if frozenset([gene, hpo_entity_id]) in \
                        genehpoterms_dict:
                    in_mapping = True
                    break
        if in_mapping:
            relation.is_correct = True
            relation.type = "GENEHPOTERM_SUP_MAP"


# Load the gene<->hpoterm dictionary
genehpoterms_dict = load_dict("genehpoterms")

if __name__ == "__main__":
    # Process input
    with fileinput.input() as input_files:
        for line in input_files:
            # Parse the TSV line
            line_dict = get_dict_from_TSVline(
                line,
                [
                    "doc_id", "sent_id", "wordidxs", "words", "poses", "ners",
                    "lemmas", "dep_paths", "dep_parents", "bounding_boxes",
                    "gene_entities", "gene_wordidxss", "gene_is_corrects",
                    "gene_types", "hpoterm_entities", "hpoterm_wordidxss",
                    "hpoterm_is_corrects", "hpoterm_types"
                ],
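
Examples #7 and #11 test membership with frozenset pairs, which makes the (gene, HPO term) lookup order-insensitive: frozensets are hashable and compare by contents. A minimal sketch with made-up entries:

# A made-up lookup structure keyed by frozenset pairs.
genehpoterms_dict = {frozenset(["BRCA1", "HP:0003002"])}
# The reversed pair still matches:
print(frozenset(["HP:0003002", "BRCA1"]) in genehpoterms_dict)  # True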
Example #12
#! /usr/bin/env python3
#
# Extract, add features to, and supervise mentions extracted from geneRifs.
#

import fileinput

from dstruct.Sentence import Sentence
from extract_gene_mentions import extract
from helper.easierlife import get_dict_from_TSVline, TSVstring2list, no_op
from helper.dictionaries import load_dict

if __name__ == "__main__":
    # Load the merged genes dictionary
    merged_genes_dict = load_dict("merged_genes")
    # Process the input
    with fileinput.input() as input_files:
        for line in input_files:
            # Parse the TSV line
            line_dict = get_dict_from_TSVline(
                line, ["doc_id", "sent_id", "wordidxs", "words", "gene"], [
                    no_op, int, lambda x: TSVstring2list(x, int),
                    TSVstring2list, no_op
                ])
            # Create the Sentence object
            null_list = [None] * len(line_dict["wordidxs"])
            sentence = Sentence(line_dict["doc_id"], line_dict["sent_id"],
                                line_dict["wordidxs"], line_dict["words"],
                                null_list, null_list, null_list, null_list,
Example #13
from dstruct.Mention import Mention
from dstruct.Sentence import Sentence
from helper.dictionaries import load_dict
from helper.easierlife import get_all_phrases_in_sentence, \
    get_dict_from_TSVline, TSVstring2list, no_op

DOC_ELEMENTS = frozenset(
    ["figure", "table", "figures", "tables", "fig", "fig.", "figs", "figs.",
     "file", "movie"])

INDIVIDUALS = frozenset(["individual", "individuals", "patient"])

TYPES = frozenset(["group", "type", "class", "method"])

# Load the dictionaries that we need
merged_genes_dict = load_dict("merged_genes")
inverted_long_names = load_dict("inverted_long_names")
hpoterms_with_gene = load_dict("hpoterms_with_gene")
english_dict = load_dict("english")

# Max mention length. We won't look at subsentences longer than this.
max_mention_length = 0
for key in merged_genes_dict:
    length = len(key.split())
    if length > max_mention_length:
        max_mention_length = length
# Double it to allow for commas and other tokens inside a mention
max_mention_length *= 2


# Supervise the candidates.
Example #14
                            is_definition = False
                            break
                    definition = " ".join([w.word for w in window_words])
                    # Only consider this acronym if the definition is valid
                    if is_definition:
                        acronym = dict()
                        acronym["acronym"] = word.word
                        acronym["definition"] = definition
                        acronyms.append(acronym)
                        break
                    start_idx += 1
    return acronyms


# Load the genes dictionary
merged_genes_dict = load_dict("merged_genes")
inverted_long_names = load_dict("inverted_long_names")

if __name__ == "__main__":
    # Process the input
    with fileinput.input() as input_files:
        for line in input_files:
            # Parse the TSV line
            line_dict = get_dict_from_TSVline(line, [
                "doc_id", "sent_ids", "wordidxss", "wordss", "posess", "nerss",
                "lemmass", "dep_pathss", "dep_parentss", "bounding_boxess"
            ], [
                no_op, lambda x: TSVstring2list(x, int),
                lambda x: TSVstring2list(x, sep='!~!'),
                lambda x: TSVstring2list(x, sep='!~!'),
                lambda x: TSVstring2list(x, sep='!~!'),
Example #15
VAR_KWS = frozenset([
    "abnormality", "affect", "apoptosis", "association", "cancer", "carcinoma",
    "case", "cell", "chemotherapy", "clinic", "clinical", "chromosome",
    "cronic", "deletion", "detection", "diagnose", "diagnosis", "disease",
    "drug", "family", "gene", "genome", "genomic", "genotype", "give", "grade",
    "group", "history", "infection", "inflammatory", "injury", "mutation",
    "pathway", "phenotype", "polymorphism", "prevalence", "protein", "risk",
    "severe", "stage", "symptom", "syndrome", "therapy", "therapeutic",
    "treat", "treatment", "variant" "viruses", "virus"])

PATIENT_KWS = frozenset(
    ["boy", "girl", "man", "woman", "men", "women", "patient", "patients"])

KEYWORDS = VAR_KWS | PATIENT_KWS

# Load the dictionaries that we need
english_dict = load_dict("english")
stopwords_dict = load_dict("stopwords")
inverted_hpoterms = load_dict("hpoterms_inverted")
hponames_to_ids = load_dict("hponames_to_ids")
genes_with_hpoterm = load_dict("genes_with_hpoterm")
# hpodag = load_dict("hpoparents")


stems = set()
for hpo_name in inverted_hpoterms:
    stem_set = inverted_hpoterms[hpo_name]
    stems |= stem_set
stems = frozenset(stems)

# The keys of the following dictionary are sets of stems, and the values are
# sets of hpoterms whose name, without stopwords, gives origin to the
# corresponding set of stems (as key)

Example #16
#! /usr/bin/env python3
#
# Extract, add features to, and supervise mentions extracted from geneRifs.
#

import fileinput

from dstruct.Sentence import Sentence
from extract_gene_mentions import extract
from helper.easierlife import get_dict_from_TSVline, TSVstring2list, no_op
from helper.dictionaries import load_dict

if __name__ == "__main__":
    # Load the merged genes dictionary
    merged_genes_dict = load_dict("merged_genes")
    # Process the input
    with fileinput.input() as input_files:
        for line in input_files:
            # Parse the TSV line
            line_dict = get_dict_from_TSVline(
                line, ["doc_id", "sent_id", "wordidxs", "words", "gene"],
                [no_op, int, lambda x: TSVstring2list(x, int), TSVstring2list,
                    no_op])
            # Create the Sentence object
            null_list = [None] * len(line_dict["wordidxs"])
            sentence = Sentence(
                line_dict["doc_id"], line_dict["sent_id"],
                line_dict["wordidxs"], line_dict["words"], null_list,
                null_list, null_list, null_list, null_list, null_list)
            # This is the 'labelled' gene that we know is in the sentence
            gene = line_dict["gene"]
Example #17
#! /usr/bin/env python3
#
# Canonicalize a dump using the HPO dag
#
# Use the output of filter_out_uncertain_genes.py


import sys

from helper.dictionaries import load_dict

if len(sys.argv) != 2:
    sys.stderr.write("USAGE: {} dump.tsv\n".format(sys.argv[0]))
    sys.exit(1)

hpoancestors = load_dict("hpoancestors")

with open(sys.argv[1], 'rt') as dump:
    for line in dump:
        tokens = line.strip().split("\t")
        relation_id = tokens[0]
        gene_entity = tokens[1]
        hpo_entity = tokens[3]
        if "|" not in hpo_entity:
            continue
        hpo_id = hpo_entity.split("|")[0]
        if hpo_id not in hpoancestors:
            continue
        print("{}\t{}\t{}".format(relation_id, gene_entity, hpo_entity))
        for ancestor in hpoancestors[hpo_id]:
            print("{}\t{}\t{}".format(relation_id, gene_entity, ancestor))
Example #18
threeh_suffixes = threeh_singles

full_singles = frozenset(("outcalls", "outcall", "overnite"))
full_suffixes = full_singles

two_grams_prefixes = frozenset((
    "1/2", "half", "1.5", "hlf", "full", "an hour", "a hr", "whole",
    "multiple", "additional", "first", "multi", "add", "complete"))
two_grams_durations = frozenset((
    "quick visits", "quick visit", "quick fix", "short stay", "short visit",
    "quick stay", "short fix"))

singles = quick_singles | half_singles | hour_singles | fourtyfive_singles | \
    twoh_singles | threeh_singles
suffixes = quick_suffixes | half_suffixes | hour_suffixes | \
    fourtyfive_suffixes | twoh_suffixes | threeh_suffixes | min_suffixes

STOP_WORDS = load_dict("stopwords")
ALL_ENGLISH_WORDS = load_dict("english") - \
    (singles | suffixes | two_grams_prefixes | {"quick"})

sregex_1w_num_min = re.compile("^(15|20|30|45|60|90|120)/?-?min")
sregex_1w_num_hour = re.compile("^(1|2|3|one|two|three)/?-?h")

regex_min = re.compile("min$|mins|minut")
regex_half = re.compile("hlf|half")
regex_hh = re.compile("hh")
regex_slash = re.compile("/")
regex_hour = re.compile("hrs?$|hours?")
regex_hyphen = re.compile("-")
regex_common_minute_number = re.compile(
    r"15$|15\D+|30$|30\D+|45$|45\D+|60$|60\D+|90$|90\D+")
regex_common_hour_spelled = re.compile("one|two|three")
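
A quick, self-contained check of how two of the duration regexes behave on hypothetical inputs (these patterns are meant for partial matching via re.search):

import re

regex_min = re.compile("min$|mins|minut")
regex_common_minute_number = re.compile(
    r"15$|15\D+|30$|30\D+|45$|45\D+|60$|60\D+|90$|90\D+")

print(bool(regex_min.search("30min")))                   # True: ends in "min"
print(bool(regex_common_minute_number.search("30min")))  # True: "30" followed by a non-digit
print(bool(regex_common_minute_number.search("300")))    # False: "30" followed by a digit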