def main():
    # Load the dictionaries we need
    stopwords_dict = load_dict("stopwords")
    hpoterm_phenotype_abnormalities = load_dict(
        "hpoterm_phenotype_abnormalities")
    # Load the stemmer from NLTK
    stemmer = SnowballStemmer("english")

    if len(sys.argv) != 2:
        sys.stderr.write("USAGE: {} DICT\n".format(sys.argv[0]))
        sys.exit(1)

    with open(sys.argv[1], 'rt') as dict_file:
        for line in dict_file:
            # Skip empty lines
            if line.strip() == "":
                continue
            hpo_id, name, definition = line.strip().split("\t")
            # Skip if this is not a phenotypic abnormality
            if hpo_id not in hpoterm_phenotype_abnormalities:
                continue
            tokens = name.split()
            if len(tokens) == 1:
                name_stems = [tokens[0].casefold(), ]
            else:
                # Compute the stems of the name
                name_stems = set()
                for word in tokens:
                    # Remove parentheses, commas, and colons
                    if word[0] == "(":
                        word = word[1:]
                    if word[-1] == ")":
                        word = word[:-1]
                    if word[-1] == ",":
                        word = word[:-1]
                    if word[-1] == ":":
                        word = word[:-1]
                    # Only process words that are neither stop words nor
                    # ordinals, or that are single letters
                    if (word.casefold() not in stopwords_dict and
                            word not in ORDINALS) or len(word) == 1:
                        # Split words that contain a "/"
                        if word.find("/") != -1:
                            for part in word.split("/"):
                                name_stems.add(stemmer.stem(part))
                        else:
                            name_stems.add(stemmer.stem(word))
            print("\t".join([hpo_id, name, "|".join(name_stems)]))
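# A minimal, self-contained sketch of the stemming performed by main() above,
# kept here for reference. It assumes only NLTK; the example name and the
# stop-word set are illustrative stand-ins, not the project's dictionaries.
from nltk.stem.snowball import SnowballStemmer

_stemmer = SnowballStemmer("english")
_example_name = "Abnormality of the skeletal system"   # made-up input
_example_stopwords = {"of", "the"}                      # stand-in stop words
_stems = {_stemmer.stem(w.casefold()) for w in _example_name.split()
          if w.casefold() not in _example_stopwords}
print("|".join(sorted(_stems)))  # abnorm|skelet|system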
#! /usr/bin/env python3

from helper.dictionaries import load_dict

if __name__ == "__main__":
    merged_genes_dict = load_dict("merged_genes")
    inverted_long_names = load_dict("inverted_long_names")
    hpoterms_orig = load_dict("hpoterms_orig")

    for long_name in inverted_long_names:
        for hpoterm_name in hpoterms_orig:
            if long_name in hpoterm_name.split() and \
                    long_name.casefold() != hpoterm_name.casefold():
                print("\t".join((long_name, hpoterm_name)))
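# A toy illustration of the matching rule above: a long name matches an HPO
# term name only when it appears as a whole token (not a substring) and the
# two strings are not the same term. Both strings below are made-up examples,
# not entries from the real dictionaries.
_long_name = "insulin"
_hpoterm_name = "Abnormality of insulin secretion"
if _long_name in _hpoterm_name.split() and \
        _long_name.casefold() != _hpoterm_name.casefold():
    print("\t".join((_long_name, _hpoterm_name)))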
GENE_KWS = frozenset([
    "gene", "oncogene", "protooncogene", "proto-oncogene", "pseudogene",
    "transgene"])

COEXPRESSION_KWS = frozenset([
    "expression", "overexpression", "over-expression", "co-expression",
    "coexpression"])

KEYWORDS = VAR_KWS | KNOCK_KWS | AMINO_ACID_KWS | ANTIGENE_KWS | DNA_KWS | \
    DOWNREGULATION_KWS | DOWNREGULATION_KWS | TUMOR_KWS | GENE_KWS | \
    COEXPRESSION_KWS

# Load the dictionaries that we need
merged_genes_dict = load_dict("merged_genes")
long_names_dict = load_dict("long_names")
inverted_long_names = load_dict("inverted_long_names")
hpoterms_with_gene = load_dict("hpoterms_with_gene")
stopwords_dict = load_dict("stopwords")


# Add features to a gene mention candidate
def add_features(mention_id, mention_words, sentence):
    # The verb closest to the candidate, with the path to it.
    minl = 100
    minp = None
    minw = None
    for word in mention_words:
        for word2 in sentence.words:
            if word2.lemma.isalpha() and re.search('^VB[A-Z]*$', word2.pos) \
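# A minimal sketch of the "closest verb" feature that add_features() starts
# computing above, using linear token distance as a stand-in for the
# dependency-path length the real feature presumably uses. The Word tuple and
# the example sentence are illustrative stand-ins for the project's dstruct
# classes.
import re
from collections import namedtuple

Word = namedtuple("Word", ["in_sent_idx", "word", "lemma", "pos"])

_sent = [Word(0, "BRCA1", "BRCA1", "NN"),
         Word(1, "regulates", "regulate", "VBZ"),
         Word(2, "DNA", "DNA", "NN"),
         Word(3, "repair", "repair", "NN")]
_mention = [_sent[0]]

_minl, _minw = 100, None
for _w in _mention:
    for _w2 in _sent:
        if _w2.lemma.isalpha() and re.search("^VB[A-Z]*$", _w2.pos):
            _dist = abs(_w2.in_sent_idx - _w.in_sent_idx)
            if _dist < _minl:
                _minl, _minw = _dist, _w2.lemma
print(_minw)  # regulate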
"gene", "oncogene", "protooncogene", "proto-oncogene", "pseudogene", "transgene" ]) COEXPRESSION_KWS = frozenset([ "expression", "overexpression", "over-expression", "co-expression", "coexpression" ]) KEYWORDS = VAR_KWS | KNOCK_KWS | AMINO_ACID_KWS | ANTIGENE_KWS | DNA_KWS | \ DOWNREGULATION_KWS | DOWNREGULATION_KWS | TUMOR_KWS | GENE_KWS | \ COEXPRESSION_KWS # Load the dictionaries that we need merged_genes_dict = load_dict("merged_genes") long_names_dict = load_dict("long_names") inverted_long_names = load_dict("inverted_long_names") hpoterms_with_gene = load_dict("hpoterms_with_gene") stopwords_dict = load_dict("stopwords") # Add features to a gene mention candidate def add_features(mention_id, mention_words, sentence): # The verb closest to the candidate, with the path to it. minl = 100 minp = None minw = None for word in mention_words: for word2 in sentence.words: if word2.lemma.isalpha() and re.search('^VB[A-Z]*$', word2.pos) \
# "EXT_KEYWORD_MIN_[activation]nn@", # "EXT_KEYWORD_MIN_[oligomerization]nn@", # "EXT_KEYWORD_MIN_[methylation]prep_of@", # "EXT_KEYWORD_MIN_[antibody]nn@", # "EXT_KEYWORD_MIN_[polymorphism]prep_of@", # "EXT_KEYWORD_MIN_[gene]appos@", # "EXT_KEYWORD_MIN_[enzyme]@nn", # "EXT_KEYWORD_MIN_[phosphorylation]prep_of@", # "EXT_KEYWORD_MIN_[receptor]@nn", # "EXT_KEYWORD_MIN_[histone]@nn", # "EXT_KEYWORD_MIN_[receptor]nn", # "IS_LONG_ALPHANUMERIC_MAIN_SYMBOL", "IS_HYPHENATED_SYMBOL", "IS_LONG_NAME" # ]) # Load the dictionaries that we need merged_genes_dict = load_dict("merged_genes") english_dict = load_dict("english") stopwords_dict = load_dict("stopwords") pos_mentions_dict = load_dict("pos_gene_mentions") neg_mentions_dict = load_dict("neg_gene_mentions") med_acrons_dict = load_dict("med_acrons") long_names_dict = load_dict("long_names") inverted_long_names = load_dict("inverted_long_names") hpoterms_with_gene = load_dict("hpoterms_with_gene") # Max mention length. We won't look at subsentences longer than this. max_mention_length = 0 for key in merged_genes_dict: length = len(key.split()) if length > max_mention_length: max_mention_length = length
    if frozenset([gene_mention.words[0].word, hpo_entity_id]) in \
            genehpoterms_dict:
        in_mapping = True
    else:
        for gene in gene_mention.entity.split("|"):
            if frozenset([gene, hpo_entity_id]) in \
                    genehpoterms_dict:
                in_mapping = True
                break
    if in_mapping:
        relation.is_correct = True
        relation.type = "GENEHPOTERM_SUP_MAP"


# Load the gene<->hpoterm dictionary
genehpoterms_dict = load_dict("genehpoterms")

if __name__ == "__main__":
    # Process input
    with fileinput.input() as input_files:
        for line in input_files:
            # Parse the TSV line
            line_dict = get_dict_from_TSVline(
                line,
                ["doc_id", "sent_id", "wordidxs", "words", "poses", "ners",
                 "lemmas", "dep_paths", "dep_parents", "bounding_boxes",
                 "gene_entities", "gene_wordidxss", "gene_is_corrects",
                 "gene_types", "hpoterm_entities", "hpoterm_wordidxss",
                 "hpoterm_is_corrects", "hpoterm_types"],
                [no_op, int, lambda x: TSVstring2list(x, int),
                 TSVstring2list, TSVstring2list, TSVstring2list,
                 TSVstring2list,
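# Why the supervision rule above keys genehpoterms_dict by
# frozenset([gene, hpo_entity_id]): a frozenset is hashable and unordered, so
# the pair is found no matter which element is listed first. The toy mapping
# below is illustrative, not the real load_dict("genehpoterms") contents.
_toy_genehpoterms = {frozenset(["BRCA1", "HP:0003002"])}
print(frozenset(["BRCA1", "HP:0003002"]) in _toy_genehpoterms)  # True
print(frozenset(["HP:0003002", "BRCA1"]) in _toy_genehpoterms)  # True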
                        is_definition = False
                        break
                definition = " ".join([w.word for w in window_words])
                # Only consider this acronym if the definition is valid
                if is_definition:
                    acronym = dict()
                    acronym["acronym"] = word.word
                    acronym["definition"] = definition
                    acronyms.append(acronym)
                break
            start_idx += 1
    return acronyms


# Load the genes dictionary
merged_genes_dict = load_dict("merged_genes")
inverted_long_names = load_dict("inverted_long_names")

if __name__ == "__main__":
    # Process the input
    with fileinput.input() as input_files:
        for line in input_files:
            # Parse the TSV line
            line_dict = get_dict_from_TSVline(
                line,
                ["doc_id", "sent_ids", "wordidxss", "wordss", "posess",
                 "nerss", "lemmass", "dep_pathss", "dep_parentss",
                 "bounding_boxess"],
                [no_op, lambda x: TSVstring2list(x, int),
                 lambda x: TSVstring2list(x, sep='!~!'),
                 lambda x: TSVstring2list(x, sep='!~!'),
                 lambda x: TSVstring2list(x, sep='!~!'),
"cronic", "deletion", "detection", "diagnose", "diagnosis", "disease", "drug", "family", "gene", "genome", "genomic", "genotype", "give", "grade", "group", "history", "infection", "inflammatory", "injury", "mutation", "pathway", "phenotype", "polymorphism", "prevalence", "protein", "risk", "severe", "stage", "symptom", "syndrome", "therapy", "therapeutic", "treat", "treatment", "variant" "viruses", "virus" ]) PATIENT_KWS = frozenset( ["boy", "girl", "man", "woman", "men", "women", "patient", "patients"]) KEYWORDS = VAR_KWS | PATIENT_KWS # Load the dictionaries that we need english_dict = load_dict("english") stopwords_dict = load_dict("stopwords") inverted_hpoterms = load_dict("hpoterms_inverted") hponames_to_ids = load_dict("hponames_to_ids") genes_with_hpoterm = load_dict("genes_with_hpoterm") # hpodag = load_dict("hpoparents") stems = set() for hpo_name in inverted_hpoterms: stem_set = inverted_hpoterms[hpo_name] stems |= stem_set stems = frozenset(stems) # The keys of the following dictionary are sets of stems, and the values are # sets of hpoterms whose name, without stopwords, gives origin to the # corresponding set of stems (as key)
from dstruct.Mention import Mention
from dstruct.Sentence import Sentence
from helper.dictionaries import load_dict
from helper.easierlife import get_all_phrases_in_sentence, \
    get_dict_from_TSVline, TSVstring2list, no_op

DOC_ELEMENTS = frozenset(
    ["figure", "table", "figures", "tables", "fig", "fig.", "figs", "figs.",
     "file", "movie"])

INDIVIDUALS = frozenset(["individual", "individuals", "patient"])

TYPES = frozenset(["group", "type", "class", "method"])

# Load the dictionaries that we need
merged_genes_dict = load_dict("merged_genes")
inverted_long_names = load_dict("inverted_long_names")
hpoterms_with_gene = load_dict("hpoterms_with_gene")
english_dict = load_dict("english")

# Max mention length. We won't look at subsentences longer than this.
max_mention_length = 0
for key in merged_genes_dict:
    length = len(key.split())
    if length > max_mention_length:
        max_mention_length = length
# doubling to take into account commas and who knows what
max_mention_length *= 2


# Supervise the candidates.
"abnormality", "affect", "apoptosis", "association", "cancer", "carcinoma", "case", "cell", "chemotherapy", "clinic", "clinical", "chromosome", "cronic", "deletion", "detection", "diagnose", "diagnosis", "disease", "drug", "family", "gene", "genome", "genomic", "genotype", "give", "grade", "group", "history", "infection", "inflammatory", "injury", "mutation", "pathway", "phenotype", "polymorphism", "prevalence", "protein", "risk", "severe", "stage", "symptom", "syndrome", "therapy", "therapeutic", "treat", "treatment", "variant" "viruses", "virus"]) PATIENT_KWS = frozenset( ["boy", "girl", "man", "woman", "men", "women", "patient", "patients"]) KEYWORDS = VAR_KWS | PATIENT_KWS # Load the dictionaries that we need english_dict = load_dict("english") stopwords_dict = load_dict("stopwords") inverted_hpoterms = load_dict("hpoterms_inverted") hponames_to_ids = load_dict("hponames_to_ids") genes_with_hpoterm = load_dict("genes_with_hpoterm") # hpodag = load_dict("hpoparents") stems = set() for hpo_name in inverted_hpoterms: stem_set = inverted_hpoterms[hpo_name] stems |= stem_set stems = frozenset(stems) # The keys of the following dictionary are sets of stems, and the values are # sets of hpoterms whose name, without stopwords, gives origin to the
#! /usr/bin/env python3
#
# Extract, add features to, and supervise mentions extracted from geneRifs.
#

import fileinput

from dstruct.Sentence import Sentence
from extract_gene_mentions import extract
from helper.easierlife import get_dict_from_TSVline, TSVstring2list, no_op
from helper.dictionaries import load_dict

if __name__ == "__main__":
    # Load the merged genes dictionary
    merged_genes_dict = load_dict("merged_genes")

    # Process the input
    with fileinput.input() as input_files:
        for line in input_files:
            # Parse the TSV line
            line_dict = get_dict_from_TSVline(
                line, ["doc_id", "sent_id", "wordidxs", "words", "gene"],
                [no_op, int, lambda x: TSVstring2list(x, int),
                 TSVstring2list, no_op])
            # Create the Sentence object
            null_list = [None, ] * len(line_dict["wordidxs"])
            sentence = Sentence(
                line_dict["doc_id"], line_dict["sent_id"],
                line_dict["wordidxs"], line_dict["words"], null_list,
                null_list, null_list, null_list, null_list, null_list)
            # This is the 'labelled' gene that we know is in the sentence
            gene = line_dict["gene"]
#! /usr/bin/env python3
#
# Canonicalize a dump using the HPO dag
#
# Use the output of filter_out_uncertain_genes.py

import sys

from helper.dictionaries import load_dict

if len(sys.argv) != 2:
    sys.stderr.write("USAGE: {} dump.tsv\n".format(sys.argv[0]))
    sys.exit(1)

hpoancestors = load_dict("hpoancestors")

with open(sys.argv[1], 'rt') as dump:
    for line in dump:
        tokens = line.strip().split("\t")
        relation_id = tokens[0]
        gene_entity = tokens[1]
        hpo_entity = tokens[3]
        if "|" not in hpo_entity:
            continue
        hpo_id = hpo_entity.split("|")[0]
        if hpo_id not in hpoancestors:
            continue
        print("{}\t{}\t{}".format(relation_id, gene_entity, hpo_entity))
        for ancestor in hpoancestors[hpo_id]:
            print("{}\t{}\t{}".format(relation_id, gene_entity, ancestor))
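# A toy illustration of the expansion performed above: each (relation, gene,
# HPO entity) row is printed once as-is and once per ancestor of its HPO id.
# The ancestor mapping below is a made-up stand-in for
# load_dict("hpoancestors"), and the row values are invented examples.
_toy_ancestors = {"HP:0003002": ["HP:0100013", "HP:0000118"]}
_relation_id, _gene_entity = "relation_1", "BRCA1"
_hpo_entity = "HP:0003002|Breast carcinoma"
_hpo_id = _hpo_entity.split("|")[0]
print("{}\t{}\t{}".format(_relation_id, _gene_entity, _hpo_entity))
for _ancestor in _toy_ancestors[_hpo_id]:
    print("{}\t{}\t{}".format(_relation_id, _gene_entity, _ancestor))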
threeh_suffixes = threeh_singles

full_singles = frozenset(("outcalls", "outcall", "overnite"))
full_suffixes = full_singles

two_grams_prefixes = frozenset((
    "1/2", "1/2", "half", "1.5", "hlf", "full", "an hour", "a hr", "whole",
    "multiple", "additional", "first", "multi", "add", "complete"))

two_grams_durations = frozenset((
    "quick visits", "quick visit", "quick fix", "short stay", "short visit",
    "quick stay", "short fix"))

singles = quick_singles | half_singles | hour_singles | fourtyfive_singles | \
    twoh_singles | threeh_singles

suffixes = quick_suffixes | half_suffixes | hour_suffixes | \
    fourtyfive_suffixes | twoh_suffixes | threeh_suffixes | min_suffixes

STOP_WORDS = load_dict("stopwords")
ALL_ENGLISH_WORDS = load_dict("english") - \
    (singles | suffixes | two_grams_prefixes | {"quick", })

sregex_1w_num_min = re.compile("^(15|20|30|45|60|90|120)/?-?min")
sregex_1w_num_hour = re.compile("^(1|2|3|one|two|three)/?-?h")
regex_min = re.compile("min$|mins|minut")
regex_half = re.compile("hlf|half")
regex_hh = re.compile("hh")
regex_slash = re.compile("/")
regex_hour = re.compile("hrs?$|hours?")
regex_hyphen = re.compile("-")
regex_common_minute_number = re.compile(
    r"15$|15\D+|30$|30\D+|45$|45\D+|60$|60\D+|90$|90\D+")
regex_common_hour_spelled = re.compile("one|two|three")
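# A few illustrative matches for three of the duration regexes above,
# repeated here so the snippet runs on its own; the token strings are
# made-up examples, not taken from any dataset.
import re

_sregex_1w_num_min = re.compile("^(15|20|30|45|60|90|120)/?-?min")
_regex_hour = re.compile("hrs?$|hours?")
_regex_common_minute_number = re.compile(
    r"15$|15\D+|30$|30\D+|45$|45\D+|60$|60\D+|90$|90\D+")

print(bool(_sregex_1w_num_min.search("30-min")))        # True
print(bool(_regex_hour.search("2hrs")))                 # True
print(bool(_regex_common_minute_number.search("45")))   # True
print(bool(_regex_common_minute_number.search("450")))  # False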