Example #1
def pipeline1(text, r, t):
    extractedRelations = []
    with CoreNLPClient(
            annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'ner'],
            timeout=450000,
            memory='4G',
            endpoint="http://localhost:9000",
            threads=7) as pipeline1:
        print(
            "\tAnnotating the webpage using [tokenize, ssplit, pos, lemma, ner] annotators ..."
        )
        ann = pipeline1.annotate(text)
        sentenceNumber = len(ann.sentence)
        namedEntity = patterns[toRelation[r]]
        print(
            "\tExtracted %d sentences. Processing each sentence one by one to check for presence of right pair of named entity types; if so, will run the second pipeline ..."
            % sentenceNumber)

        # If a sentence contains both targeted named entity types,
        # add it to the list of sentences to run KBP extraction on
        processedSentence = []
        for i, sentence in enumerate(ann.sentence):
            # Check whether both named entity types required by the relation appear in this sentence
            firstEntity = False
            secondEntity = False
            for token in sentence.token:
                if toRelation[r] == relation[2]:
                    if token.ner == namedEntity[0]:
                        firstEntity = True
                    if token.ner in namedEntity[1]:
                        secondEntity = True
                else:
                    if token.ner == namedEntity[0]:
                        firstEntity = True
                    if token.ner == namedEntity[1]:
                        secondEntity = True

            # If both targeted named entity types appear, add the sentence to the list
            if firstEntity and secondEntity:
                processedSentence.append([i, to_text(sentence)])

        # Extract relations from the collected sentences via pipeline2
        extractedRelations += pipeline2(processedSentence, t)
        print("Extracted kbp annotations for %d out of total %d sentences" %
              (len(processedSentence), sentenceNumber))

    return extractedRelations
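pipeline1 relies on several module-level names that this snippet does not define: the CoreNLPClient and to_text imports, the relation / toRelation / patterns lookup tables, and the pipeline2 helper it forwards sentences to. The sketch below shows one plausible shape for that context; the import, the table values, and the pipeline2 stub are assumptions for illustration, not the original definitions.

# Hypothetical module-level context assumed by pipeline1 (values are illustrative only).
from corenlp import CoreNLPClient, to_text  # stanford-corenlp client; stanza.server also provides CoreNLPClient

relation = ["Schools_Attended", "Work_For", "Live_In", "Top_Member_Employees"]   # assumed relation names
toRelation = {"1": "Schools_Attended", "2": "Work_For",
              "3": "Live_In", "4": "Top_Member_Employees"}                       # assumed query-flag -> relation map
patterns = {
    "Schools_Attended": ["PERSON", "ORGANIZATION"],                              # assumed NER type pairs
    "Work_For": ["PERSON", "ORGANIZATION"],
    "Live_In": ["PERSON", ["CITY", "STATE_OR_PROVINCE", "COUNTRY"]],             # second slot is a list for relation[2]
    "Top_Member_Employees": ["ORGANIZATION", "PERSON"],
}

def pipeline2(sentences, t):
    """Assumed helper: run the kbp annotators over each pre-filtered sentence and
    return the (subject, relation, object, confidence) tuples above threshold t."""
    raise NotImplementedError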
Example #2
def checkKBPConfidence(ann_kbp, r, counterExtractedTuples):
    # Check each extracted KBP triple against the confidence threshold.
    # Note: `t` (the confidence threshold) and `extractedTuples` (a dict mapping
    # "subject,object" strings to their best confidence) are module-level globals.
    for sentence in ann_kbp.sentence:
        for kbp_triple in sentence.kbpTriple:
            if kbp_triple.relation == r:
                print("\t=== Extracted Relation ===")
                print("\tSentence: ", to_text(sentence))
                print(
                    f"\tConfidence: {kbp_triple.confidence}; Subject: {kbp_triple.subject}; Object: {kbp_triple.object}"
                )
                if kbp_triple.confidence > t:
                    # Update Confidence if possible (Higher ONLY)
                    if str(kbp_triple.subject) + "," + str(
                            kbp_triple.object) in extractedTuples:
                        if kbp_triple.confidence > extractedTuples[
                                str(kbp_triple.subject) + "," +
                                str(kbp_triple.object)]:
                            extractedTuples[
                                str(kbp_triple.subject) + "," +
                                str(kbp_triple.object)] = kbp_triple.confidence
                            print(
                                "The same relation is already present but with a lower confidence. Just updating the confidence value."
                            )
                        else:
                            print(
                                "The same relation is already present with higher (or equal) confidence. Ignoring this."
                            )

                    # Brand new relation
                    else:
                        extractedTuples[str(kbp_triple.subject) + "," + str(
                            kbp_triple.object)] = kbp_triple.confidence
                        print("\tAdding to set of extracted relations")
                        counterExtractedTuples += 1
                else:
                    print(
                        "\tConfidence is lower than threshold confidence. Ignoring this."
                    )
                print("\t==========")
    return counterExtractedTuples
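checkKBPConfidence reads the threshold t and the extractedTuples dictionary from module scope rather than taking them as arguments. A minimal sketch of that shared state and a call site, assuming the same string-keyed layout the function uses (the relation name and threshold value are illustrative):

# Assumed module-level state shared with checkKBPConfidence.
t = 0.7                  # confidence threshold, e.g. parsed from the command line
extractedTuples = {}     # maps "subject,object" -> best confidence seen so far

# Hypothetical call site, where ann_kbp is an annotation produced with the kbp annotators:
# counterExtractedTuples = checkKBPConfidence(ann_kbp, "per:employee_of", counterExtractedTuples)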
Example #3
def test_update():
    with corenlp.CoreNLPClient(annotators="tokenize ssplit".split()) as client:
        ann = client.annotate(TEXT)
        ann = client.update(ann)
        assert corenlp.to_text(ann.sentence[0]) == TEXT[:-1]
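The tests in Examples #3, #5, and #6 follow the stanford-corenlp Python client's test suite and assume a module-level TEXT constant whose trailing character is stripped by to_text (hence TEXT[:-1]). A plausible setup, with the exact sentence being an assumption:

import corenlp

# Assumed test fixture text: one simple sentence with a trailing newline,
# so that corenlp.to_text(ann.sentence[0]) == TEXT[:-1].
TEXT = "Chris wrote a simple sentence that he parsed with Stanford CoreNLP.\n"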
Example #4
def process_urls(res, relationdict, r, t):
    name_dict = {
        '1': ["ORGANIZATION", "PERSON"],
        '2': ["ORGANIZATION", "PERSON"],
        '3': ["PERSON", "CITY"],
        '4': ["ORGANIZATION", "PERSON"]
    }
    X = defaultdict(float)
    count = 1
    with CoreNLPClient(
            annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'ner'],
            timeout=30000,
            memory='4G',
            endpoint="http://localhost:9000") as pipeline_ner:
        with CoreNLPClient(annotators=[
                'tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'depparse',
                'coref', 'kbp'
        ],
                           timeout=30000,
                           memory='4G',
                           endpoint="http://localhost:9001") as pipeline_kbp:
            for results in res['items']:
                url = results['link']
                title = results['title']
                print("URL (", count, "/ 10):", url)
                print("\tFetching text from url ...")

                ## gary please fill in
                webpage_length = 0

                # print('result ' + str(count))
                # print('title: '+title)
                # print('url: '+url)
                count += 1
                # now get contents using tika
                try:
                    parsed = parser.from_file(url)
                except Exception:
                    print("Unable to fetch URL. Continuing.")
                    continue
                content = parsed["content"]
                strip_content = ' '.join(content.split())
                print(len(strip_content))
                # get first 20000 characters
                if len(strip_content) >= 20000:
                    strip_content = strip_content[:20000]
                print("\tWebpage length (num characters):", len(strip_content))
                print(
                    "\tAnnotating the webpage using [tokenize, ssplit, pos, lemma, ner] annotators ..."
                )
                # use package now

                # for j in range(10): # this number can be changed back to 10 if needed
                #     try:
                #print(f">>> Repeating {j}th time.")
                ann_ner = pipeline_ner.annotate(strip_content)
                countkbp = 0
                print(
                    "\tExtracted", len(ann_ner.sentence),
                    "sentences. Processing each sentence one by one to check for presence of right pair of named entity types; if so, will run the second pipeline ..."
                )
                countsentences = 0
                for sentence in ann_ner.sentence:
                    #print("matching ners...")
                    countsentences += 1
                    # Report progress every 5 sentences and once at the end.
                    if countsentences % 5 == 0 or countsentences == len(
                            ann_ner.sentence):
                        print("\tProcessed", countsentences, "/",
                              len(ann_ner.sentence), "sentences")
                    match = [False] * len(name_dict[r])
                    for token in sentence.token:
                        for i in range(len(name_dict[r])):
                            if token.ner == name_dict[r][i]:
                                match[i] = True
                    if all(match):
                        try:
                            ann = pipeline_kbp.annotate(to_text(sentence))
                        except Exception:
                            continue
                        countkbp += 1
                        for i in ann.sentence:
                            for kbp_triple in i.kbpTriple:
                                if kbp_triple.relation == relationdict[r]:

                                    print("\t\t=== Extracted Relation ===")
                                    print(
                                        "\t\tSentence:", to_text(i)
                                    )  ########## not sure if this is right, please check
                                    print("\t\tConfidence:",
                                          kbp_triple.confidence, "; Subject:",
                                          kbp_triple.subject, "; Object:",
                                          kbp_triple.object, ";")
                                    if kbp_triple.confidence > float(t):
                                        if X[(kbp_triple.subject,
                                              kbp_triple.relation, kbp_triple.
                                              object)] < kbp_triple.confidence:
                                            #update key value now
                                            X[(kbp_triple.subject,
                                               kbp_triple.relation,
                                               kbp_triple.object
                                               )] = kbp_triple.confidence
                                            print(
                                                "\t\tAdding to set of extracted relations"
                                            )
                                            #print((kbp_triple.subject,kbp_triple.relation,kbp_triple.object,kbp_triple.confidence))
                                        else:
                                            print(
                                                "\t\tDuplicate with lower confidence than existing record. Ignoring this."
                                            )
                                    else:
                                        print(
                                            "\t\tConfidence is lower than threshold confidence. Ignoring this."
                                        )
                                    print("\t\t==========")
                print("\tExtracted kbp annotations for ", countkbp,
                      " out of total ", len(ann_ner.sentence), " sentences")

    return X
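process_urls expects a Google Custom Search response dict (res['items']), the tika parser, and the CoreNLP client's to_text helper, none of which are defined in the snippet. A rough usage sketch under those assumptions; the relationdict values, keys, and API credentials are placeholders, and relationdict is assumed to map the query flag to a KBP relation name such as per:employee_of:

# Hypothetical driver for process_urls; imports and values are assumptions.
from collections import defaultdict
from googleapiclient.discovery import build
from tika import parser
from corenlp import CoreNLPClient, to_text

relationdict = {"1": "per:schools_attended", "2": "per:employee_of",
                "3": "per:cities_of_residence", "4": "org:top_members/employees"}  # assumed mapping

service = build("customsearch", "v1", developerKey="YOUR_API_KEY")
res = service.cse().list(q="bill gates microsoft", cx="YOUR_ENGINE_ID").execute()
X = process_urls(res, relationdict, r="2", t="0.7")  # returns {(subject, relation, object): confidence}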
Example #5
def test_update(corenlp_client):
    ann = corenlp_client.annotate(TEXT)
    ann = corenlp_client.update(ann)
    assert corenlp.to_text(ann.sentence[0]) == TEXT[:-1]
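This variant of test_update receives the client through a pytest fixture instead of a context manager. A sketch of what such a fixture could look like in a conftest.py alongside the test; the annotator list is an assumption:

# Hypothetical conftest.py fixture providing corenlp_client to the tests.
import pytest
import corenlp

@pytest.fixture
def corenlp_client():
    # Start a client for the test and shut it down afterwards.
    with corenlp.CoreNLPClient(annotators="tokenize ssplit".split()) as client:
        yield client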
Example #6
def test_context_manager():
    with corenlp.CoreNLPClient(annotators="tokenize,ssplit") as context_client:
        ann = context_client.annotate(TEXT)
        assert corenlp.to_text(ann.sentence[0]) == TEXT[:-1]
Example #7
def main(api_key, engine_id, r, t, q, k):
    queryIteration = 0
    pageNumberVisited = 0
    r = relations[r]
    counterExtractedTuples = 0
    visitedTuples.add(q)

    # Initial Print of Parameters
    print("\nParameters:")
    print("Client key   = ", api_key)
    print("Engine key   = ", engine_id)
    print("Relation     = ", r)
    print("Threshold    = ", t)
    print("Query        = ", q)
    print("# of Tuples  = ", k)
    print("Loading necessary libraries; This should take a minute or so ...")

    while (len(extractedTuples) < k) and (queryIteration < 9):
        print("=========== Iteration: %s - Query: %s ===========" %
              (queryIteration, q))

        # Google search API (returns top 10 pages)
        service = build("customsearch", "v1", developerKey=api_key)

        res = service.cse().list(
            q=q,
            cx=engine_id,
        ).execute()

        # Dissect the 10 pages
        for page in res['items']:

            # Print URL and mark as visited
            url = page['formattedUrl']
            if (url in visitedURLs):
                print("Already seen URL ... skipping")
                continue
            else:
                visitedURLs.add(url)
            pageNumberVisited += 1
            print(
                f"URL ({pageNumberVisited} / {(queryIteration+1)*10}): {url}")

            # Get the 20000 characters from page
            print("Fetching text from url ...")
            try:
                rawPage = requests.get(url)
            except Exception:
                print("Unable to fetch URL. Continuing.")
                continue
            contents = BeautifulSoup(rawPage.text, 'html.parser')
            pageText = contents.findAll(text=True)
            rawText = filter(tag_visible, pageText)
            rawText = u" ".join(t.strip() for t in rawText)

            if len(rawText) > 20000:
                print(
                    "Truncating webpage text from size (num characters) %s to 20000 ..."
                    % len(rawText))
                rawText = rawText[:20000]

            print("Webpage length (num characters):", len(rawText))

            # Do the annotation
            print(
                "Annotating the webpage using [tokenize, ssplit, pos, lemma, ner] annotators ..."
            )

            counterLastIterationExtractedTuples = counterExtractedTuples

            # TA PROVIDED CODE For KBP TRIPLE EXTRACTION
            try:
                with CoreNLPClient(timeout=30000, memory='4G',
                                   be_quiet=True) as pipeline:
                    ann_ner = pipeline.annotate(rawText,
                                                annotators=annotators_ner)
                    # print 330 lines

                    # check ner tags  ( CHECK THE RELATION HERE )
                    for sentence in ann_ner.sentence:
                        sentenceText = to_text(sentence)
                        #print("Sentence: ", sentenceText)

                        for token in sentence.token:
                            #print(f"****Token word:: {token.word};\t ner : {token.ner}; ")
                            if (token.ner not in sentenceNers):
                                if r == "per:cities_of_residence":
                                    if (token.ner in nersRelationtype2):
                                        sentenceNers.add(token.ner)
                                else:
                                    if (token.ner in nersRelationtype1):
                                        sentenceNers.add(token.ner)
                        #print("Sentence NERS: ", sentenceNers)

                        if r == "per:cities_of_residence":
                            if (nersRelationtype2[0] in sentenceNers) and (nersRelationtype2[1] in sentenceNers) \
                                or (nersRelationtype2[2] in sentenceNers) or (nersRelationtype2[3] in sentenceNers):
                                # KBP Annotate for more detailed analysis
                                ann_kbp = pipeline.annotate(
                                    sentenceText, annotators=annotators_kbp)
                                counterExtractedTuples = checkKBPConfidence(
                                    ann_kbp, r, counterExtractedTuples)
                            else:
                                #print("~~~~~~NO MATCHING NERS")
                                pass
                        else:
                            if nersRelationtype1[
                                    0] in sentenceNers and nersRelationtype1[
                                        1] in sentenceNers:
                                # KBP Annotate for more detailed analysis
                                ann_kbp = pipeline.annotate(
                                    sentenceText, annotators=annotators_kbp)
                                counterExtractedTuples = checkKBPConfidence(
                                    ann_kbp, r, counterExtractedTuples)
                            else:
                                #print("~~~~~~NO MATCHING NERS")
                                pass
                        sentenceNers.clear()
                # End of webPage
                print(
                    f"Relations extracted from this website {counterExtractedTuples - counterLastIterationExtractedTuples} (Overall: {counterExtractedTuples})"
                )
            except Exception:
                print("Timeout Stanford NLP Server --- Continuing")
                pass

        # Next iteration, need new query based off high confidence tuple
        queryIteration += 1
        sortedTuples = sortByConfidence(extractedTuples)
        for i in sortedTuples:
            newQuery = ' '.join(i[0].lower().split(','))
            if newQuery not in visitedTuples:
                q = newQuery
                visitedTuples.add(q)
                break

    # End results
    finalResultsPrint(sortedTuples)
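main also leans on module-level state and helpers that are not shown here: the relations map, the NER type lists, the visited/extracted collections, the BeautifulSoup tag_visible filter, and the sortByConfidence / finalResultsPrint helpers. The sketch below is one plausible shape for that context; all values and bodies are assumptions for illustration, not the original code.

# Assumed module-level context for main().
from bs4.element import Comment

relations = {"1": "per:schools_attended", "2": "per:employee_of",
             "3": "per:cities_of_residence", "4": "org:top_members/employees"}   # assumed mapping
nersRelationtype1 = ["ORGANIZATION", "PERSON"]                                   # assumed
nersRelationtype2 = ["PERSON", "CITY", "STATE_OR_PROVINCE", "COUNTRY"]           # assumed
annotators_ner = ['tokenize', 'ssplit', 'pos', 'lemma', 'ner']
annotators_kbp = ['tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'depparse', 'coref', 'kbp']
visitedURLs, visitedTuples, sentenceNers = set(), set(), set()
extractedTuples = {}        # "subject,object" -> best confidence

def tag_visible(element):
    # Keep only text nodes that a browser would actually render.
    if element.parent.name in ('style', 'script', 'head', 'title', 'meta', '[document]'):
        return False
    return not isinstance(element, Comment)

def sortByConfidence(tuples):
    # Highest-confidence tuples first, as (key, confidence) pairs.
    return sorted(tuples.items(), key=lambda kv: kv[1], reverse=True)

def finalResultsPrint(sortedTuples):
    for key, confidence in sortedTuples:
        print(f"Confidence: {confidence}\tRelation: {key}")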