from typing import List, Tuple


def get_property_types(
    query: str,
    property_key: str,
    label_key: str
) -> List[Tuple]:
    """
    Return a list of (uri, label, num_references, lemmas, poss)
    tuples, one for every property matched by the query.
    """
    properties = []
    results = paged_query(query)
    num_results = len(results)
    for index, result in enumerate(results):
        uri = result[property_key]['value']
        label = result[label_key]['value']
        # Lemmatize and POS-tag the human-readable label.
        doc = nlp(label)
        lemmas = [token.lemma_ for token in doc]
        poss = [token.pos_ for token in doc]
        log.debug(
            f'Analyzing property {index} of {num_results}, {label} - {uri}')
        num_references = get_number_of_property_references(uri)
        properties.append((uri, label, num_references, lemmas, poss))

    return properties
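A minimal usage sketch, assuming paged_query returns SPARQL JSON-result bindings (each a dict of {'value': ...} entries) and that nlp, log, and get_number_of_property_references are already set up; the query text and binding names below are illustrative only:

QUERY = """
SELECT ?property ?propertyLabel WHERE {
    ?property a rdf:Property ;
              rdfs:label ?propertyLabel .
}
"""

rows = get_property_types(QUERY, property_key='property', label_key='propertyLabel')
for uri, label, num_references, lemmas, poss in rows:
    print(uri, label, num_references)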
Example #2
    def __init__(self, text):

        self.original = str(text)
        self.docs = nlp(text)

        self.person_entities = self.getPersonEntities()

        # self.coreferences = []

        # for r in self.docs._.coref_clusters :
        #     coref = Coreference(r.main, r.mentions)
        #     if self.isValid(coref) : # only take valid coreference
        #         template = self.generateTemplate(coref)
        #         self.templates, self.mutants, self.names, self.countries = self.generateMutant(coref, template)
        #         break

        self.person_coreferences = self.getPersonCoreferences()
        # Only generate templates/mutants when exactly one person coreference is found.
        if len(self.person_coreferences) == 1:
            coref = self.person_coreferences[0]
            if self.isValid(coref):
                template = self.generateTemplate(coref)
                self.templates, self.mutants, self.names, self.countries = self.generateMutant(
                    coref, template)
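A hedged usage sketch for Example #2; only __init__ is shown, so the class name PersonCorefCase below is hypothetical, and nlp is assumed to be a spaCy pipeline with a coreference extension that getPersonCoreferences relies on:

case = PersonCorefCase("Alice said she would arrive on Monday.")  # class name is an assumption
if len(case.person_coreferences) == 1 and hasattr(case, 'templates'):
    # templates/mutants/names/countries exist only when exactly one valid
    # person coreference was found.
    print(case.templates)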
Example #3
import itertools

import numpy as np


def NPs(caption):
    dataset = []
    doc = nlp(caption)
    for word in doc:
        if word.dep_ in ('xcomp', 'ccomp', 'pcomp', 'acomp'):
            subtree_span = doc[word.left_edge.i:word.right_edge.i + 1]
            dataset.append(' '.join([t.text for t in subtree_span]))
        elif word.dep_ == 'ROOT':
            left_subtree = [
                doc[w.left_edge.i:w.i + 1] for w in word.lefts
                if w.dep_ != 'aux'
            ]
            right_subtree = [doc[w.i:w.right_edge.i + 1] for w in word.rights]
            for l in itertools.product(left_subtree, right_subtree):
                dataset.append(' '.join([l[0].text, word.text, l[1].text]))
                dataset.append(' '.join([word.text, l[1].text]))
                dataset.append(' '.join([l[0].text, word.text]))
                dataset.append(' '.join([t.text
                                         for t in l[0].subtree] + [word.text]))
        # note: this is a failed attempt to extract local prepositional phrases
        # e.g. 'the dog with a frisbee in his mouth' -> 'the dog with a frisbee'
        # elif word.pos_ in ('ADP') and word.dep_ != 'prt':
        #     span = ([t.text for t in word.lefts]
        #             + [word.text] +
        #             [t.text for t in word.rights])
        #     dataset.append(' '.join([a.text for a in word.ancestors][:1] + span))
        #     dataset.append(' '.join([a.text for a in word.ancestors][:1] +
        # [a.text for a in word.subtree]))
    noun_chunks = [n.text for n in doc.noun_chunks if not n.root.is_stop] + \
                  [n.root.text for n in doc.noun_chunks if not n.root.is_stop]
    # Deduplicate and make sure the original caption itself is included.
    dataset = np.unique(dataset + [caption] + noun_chunks)
    return list(dataset)
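A quick check of NPs, assuming nlp is a loaded spaCy English pipeline (the model name here is an assumption):

import spacy

nlp = spacy.load('en_core_web_sm')  # model choice is an assumption
for phrase in NPs('a dog catching a frisbee in the park'):
    print(phrase)
# Output mixes the original caption, clause spans, and non-stopword noun chunks.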
Example #4
    def isContainGenderAssociatedWord(self):
        if not self.tokens:
            self.tokens = nlp(self.phrase)
        tokens = self.tokens
        for token in tokens:
            # Only the head noun of the phrase is checked against the lexicons.
            if token.pos_ == "NOUN" and token.dep_ == "ROOT":
                if isInMasculineGenderAssosiatedWord(token.text):
                    self.gender_associated_word = token.text
                    return True
                if isInFeminineGenderAssosiatedWord(token.text):
                    self.gender_associated_word = token.text
                    return True
        return False
Example #5
    def isHasSalutation(self):
        if not self.tokens:
            self.tokens = nlp(self.phrase)
        tokens = self.tokens
        # Masculine salutations are checked first; the first match wins.
        for token in tokens:
            if isInMasculineSalutation(token.text):
                self.salutation = token.text
                self.gender = "male"
                return True
        for token in tokens:
            if isInFeminineSalutation(token.text):
                self.salutation = token.text
                self.gender = "female"
                return True
        return False
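Examples #4 and #5 share the same lazy-tokenization pattern; a sketch of how the two predicates might be driven, where Phrase is a hypothetical owning class whose __init__ sets self.phrase and self.tokens = None:

p = Phrase("Mr. Smith greeted the nurse")  # 'Phrase' is an assumed class name
if p.isHasSalutation():
    print(p.salutation, p.gender)          # e.g. 'Mr.' 'male', if the salutation lexicon matches
elif p.isContainGenderAssociatedWord():
    print(p.gender_associated_word)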
Example #6
    def __init__(self, text):

        self.original = str(text)
        self.docs = nlp(text)

        self.person_entities = self.getPersonEntities()

        self.person_coreferences = self.getPersonCoreferences()
        if len(self.person_coreferences) == 1:
            coref = self.person_coreferences[0]
            if self.isValid(coref):
                template = self.generateTemplate(coref)
                self.templates, self.mutants, self.genders = self.generateMutant(
                    coref, template)
Example #7
    if "--model" in sys.argv:
        model_arg = sys.argv.index("--model")
        model_file = sys.argv[model_arg + 1]
    if "--features" in sys.argv:
        features_arg = sys.argv.index("--features")
        features_file = sys.argv[features_arg + 1]

    clf = load(model_file)
    features_obj = load(features_file)
    feature_set, feature_hasher = features_obj
    if train.check_file(input_file):
        print("file is in wrong format. expected raw and not processed file")

    data = {}
    for sen_id, sen in utils.read_lines(sys.argv[1]):
        data[sen_id] = utils.nlp(sen)

    lexicon_helper = Lexicon_helper()
    feature_extractor = FeatureExtractor(lexicon_helper, feature_hasher,
                                         feature_set)
    sen_entities_with_x = spacy_parser.get_x_data(feature_extractor, data)
    sen_entities_with_x = sorted(sen_entities_with_x, key=utils.get_senid_int)
    allx = np.array([x[3].toarray()[0] for x in sen_entities_with_x])
    predicted_entities_pairs = clf.predict(allx)
    extracted_ent_paris_svm = filter_ent_pairs(predicted_entities_pairs,
                                               sen_entities_with_x)

    #Rules extraction
    extracted_ents_rules = rules_extractor.predict(data, lexicon_helper)
    extracted_ents_rules = sorted(extracted_ents_rules,
                                  key=utils.get_senid_int)  # key is an assumption, mirroring the SVM sort above
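Note that model_file and features_file stay unbound if the corresponding flags are missing, so the load calls would raise NameError; a safer variant of the same argument handling, with the default paths being assumptions:

model_file = "model.joblib"        # assumed default; the original leaves this unset
features_file = "features.joblib"  # assumed default
if "--model" in sys.argv:
    model_file = sys.argv[sys.argv.index("--model") + 1]
if "--features" in sys.argv:
    features_file = sys.argv[sys.argv.index("--features") + 1]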
Example #8
    def getTokens(self):
        if not self.tokens:
            self.tokens = nlp(self.phrase)
        return self.tokens
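getTokens parses the phrase at most once and caches the result on the instance; the same idiom can be written with functools.cached_property (Python 3.8+), sketched here under the assumption that nlp is a module-level spaCy pipeline:

import functools

class Phrase:  # hypothetical minimal container
    def __init__(self, phrase):
        self.phrase = phrase

    @functools.cached_property
    def tokens(self):
        return nlp(self.phrase)  # parsed once, then cached on the instance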