Python Core.process 예제들, etk.core.Core.process Python 예제들

예제 #1

0

파일 보기

 def test_guards(self):
     """``event_date`` is in the knowledge graph for doc1 and absent for doc2."""
     core = Core(extraction_config=self.e_config)

     # doc1: the field must be present in the knowledge graph.
     result = core.process(self.doc1)
     self.assertIn("knowledge_graph", result)
     self.assertIn("event_date", result['knowledge_graph'])

     # doc2: either there is no knowledge graph at all, or it lacks
     # the event_date field.
     result = core.process(self.doc2)
     self.assertTrue("knowledge_graph" not in result
                     or "event_date" not in result['knowledge_graph'])

예제 #2

0

파일 보기

 def test_document_id(self):
     """The processed document carries the expected ``document_id`` value."""
     expected_id = '1A4A5FF5BD066309C72C8EEE6F7BCCCFD21B83245AFCDADDF014455BCF990A21'
     core = Core(extraction_config={'document_id': 'doc_id'})
     result = core.process(self.doc)
     self.assertIn('document_id', result)
     self.assertEqual(result['document_id'], expected_id)

예제 #3

0

파일 보기

    def test_ce_readability(self):
        """Run the readability extractor with a strict and a relaxed entry.

        Asserts that ``tld`` is set, that ``content_strict`` and
        ``content_relaxed`` are produced with the expected text, and that
        no ``title`` or ``inferlink_extractions`` segment is emitted.
        """
        e_config = {
            'content_extraction': {
                "input_path": "raw_content",
                "extractors": {
                    "readability": [{
                        "strict": "yes",
                        "extraction_policy": "keep_existing"
                    }, {
                        "strict": "no",
                        "extraction_policy": "keep_existing",
                        "field_name": "content_relaxed"
                    }]
                }
            }
        }
        c = Core(extraction_config=e_config)
        r = c.process(self.doc)
        self.assertIn('tld', r)
        self.assertEqual('eroticmugshots.com', r['tld'])
        self.assertIn("content_extraction", r)
        content = r["content_extraction"]
        self.assertIn("content_strict", content)
        self.assertIn("content_relaxed", content)
        self.assertNotIn("title", content)
        self.assertNotIn("inferlink_extractions", content)

        c_s = """\n \n \n \n \n \n smoothlegs24  28 \n \n \n chrissy391  27 \n \n \n My name is Helena height 160cms weight 55 kilos  contact me at [email protected]           jefferson ave         age: 23 HrumpMeNow  28 \n \n \n xxtradition  24 \n \n \n jumblyjumb  26 \n \n \n claudia77  26 \n \n \n gushinPuss  28 \n \n \n Littlexdit  25 \n \n \n PinkSweets2  28 \n \n \n withoutlimit  27 \n \n \n bothOfUs3  28 \n \n \n lovelylips  27 \n \n \n killerbod  27 \n \n \n Littlexdit  27 \n \n \n azneyes  23 \n \n \n \n \n \n Escort's Phone: \n \n \n323-452-2013  \n \n Escort's Location: \nLos Angeles, California  \n Escort's Age:   23   Date of Escort Post:   Jan 02nd 6:46am \n REVIEWS:   \n READ AND CREATE REVIEWS FOR THIS ESCORT   \n \n \n \n \n \nThere are  50  girls looking in  .\n VIEW GIRLS \n \nHey I'm luna 3234522013 Let's explore , embrace and indulge in your favorite fantasy  % independent. discreet no drama Firm Thighs and Sexy. My Soft skin & Tight Grip is exactly what you deserve Call or text   Fetish friendly   Fantasy friendly   Party friendly 140 Hr SPECIALS 3234522013. Call  323-452-2013 .  Me and my friends are on EZsex  soooo you can find us all on there if you want... skittlegirl \n   \n \n   \n \n   \n Call me on my cell at 323-452-2013. Date of ad: 2017-01-02 06:46:00 \n \n \n"""
        c_r = """\n \n \n \n \n \n \n smoothlegs24  28 \n \n \n chrissy391  27 \n \n \n My name is Helena height 160cms weight 55 kilos  contact me at [email protected]           jefferson ave         age: 23 HrumpMeNow  28 \n \n \n xxtradition  24 \n \n \n jumblyjumb  26 \n \n \n claudia77  26 \n \n \n gushinPuss  28 \n \n \n Littlexdit  25 \n \n \n PinkSweets2  28 \n \n \n withoutlimit  27 \n \n \n bothOfUs3  28 \n \n \n lovelylips  27 \n \n \n killerbod  27 \n \n \n Littlexdit  27 \n \n \n azneyes  23 \n \n \n \n \n \n Escort's Phone: \n \n \n323-452-2013  \n \n Escort's Location: \nLos Angeles, California  \n Escort's Age:   23   Date of Escort Post:   Jan 02nd 6:46am \n REVIEWS:   \n READ AND CREATE REVIEWS FOR THIS ESCORT   \n \n \n \n \n \nThere are  50  girls looking in  .\n VIEW GIRLS \n \nHey I'm luna 3234522013 Let's explore , embrace and indulge in your favorite fantasy  % independent. discreet no drama Firm Thighs and Sexy. My Soft skin & Tight Grip is exactly what you deserve Call or text   Fetish friendly   Fantasy friendly   Party friendly 140 Hr SPECIALS 3234522013. Call  323-452-2013 .  Me and my friends are on EZsex  soooo you can find us all on there if you want... skittlegirl \n   \n \n   \n \n   \n Call me on my cell at 323-452-2013. Date of ad: 2017-01-02 06:46:00 \n \n \n \n"""
        # Compare the JSON-encoded forms, as the original test does.
        self.assertEqual(
            json.dumps(content["content_strict"]["text"]),
            json.dumps(c_s))
        self.assertEqual(
            json.dumps(content["content_relaxed"]["text"]),
            json.dumps(c_r))

예제 #4

0

파일 보기

 def test_tld_extraction_from_doc(self):
     """The first ``website`` value must equal the document's own ``tld``."""
     website_field = {
         "extractors": {
             "extract_website_domain": {}
         }
     }
     extraction_step = {
         "input_path": "content_extraction.url.text.`parent`",
         "fields": {"website": website_field}
     }
     config = {
         "document_id": "uri",
         "content_extraction": {},
         "data_extraction": [extraction_step]
     }
     source_doc = {
         "url": "https://www.google.com/blah/this/part/doesnt/matter",
         'uri': "uri.1",
         "tld": "xyz.org"
     }

     processed = Core(extraction_config=config).process(source_doc)

     # Expected value is the "tld" supplied in the document, not the URL host.
     first_website = processed['knowledge_graph']['website'][0]
     self.assertEqual(first_website['value'], 'xyz.org')

예제 #5

0

파일 보기

파일: test_kg_enhancement.py 프로젝트: vlall/etk

    def test_add_constants(self):
        """``add_constant_kg`` puts two values into the ``type`` KG field."""
        constants = ["Type A", "Type B"]
        e_config = {
            "document_id": "doc_id",
            "kg_enhancement": {
                "fields": {
                    "type": {
                        "priority": 0,
                        "extractors": {
                            "add_constant_kg": {
                                "config": {
                                    "constants": constants
                                }
                            }
                        }
                    }
                },
                "input_path": "knowledge_graph.`parent`"
            }}

        c = Core(extraction_config=e_config)
        # The return value is not needed: every assertion below reads self.doc.
        c.process(self.doc)
        self.assertIn('knowledge_graph', self.doc)
        kg = self.doc['knowledge_graph']
        self.assertIn('type', kg)
        self.assertEqual(len(kg['type']), 2)
        self.assertIn(kg['type'][0]['value'], ["Type A", "Type B"])

예제 #6

0

파일 보기

파일: test_content_extractions.py 프로젝트: rahulrawat11/etk

 def test_document_id_not_present(self):
     """Processing raises ``KeyError`` when ``document_id`` is ``'blah'``."""
     core = Core(extraction_config={'document_id': 'blah'})
     with self.assertRaises(KeyError):
         # Only the exception matters; the result is deliberately discarded.
         core.process(self.doc)

예제 #7

0

파일 보기

파일: test_kg_enhancement.py 프로젝트: vlall/etk

 def test_filter_results(self):
     """After processing, ``self.doc`` has exactly one ``name`` with confidence 1.0."""
     core = Core(extraction_config=self.e_config)
     # The return value is not needed: every assertion below reads self.doc.
     core.process(self.doc)
     self.assertIn('knowledge_graph', self.doc)
     kg = self.doc['knowledge_graph']
     self.assertIn('name', kg)
     self.assertEqual(len(kg['name']), 1)
     self.assertEqual(kg['name'][0]['confidence'], 1.0)

예제 #8

0

파일 보기

파일: test_extractions_input_paths.py 프로젝트: vlall/etk

    def test_invalid_json_path(self):
        """Processing with the input path ``"actors["`` raises an exception."""
        doc = {
            "url": "http:www.hitman.org",
            "doc_id": "19B0EAB211CD1D3C63063FAB0B2937043EA1F07B5341014A80E7473BA7318D9E",
            "actors": {
                "name": "agent 47",
                "affiliation": "International Contract Agency"
            }
        }

        e_config = {
            "document_id": "doc_id",
            "data_extraction": [
                {
                    "input_path": [
                        "actors["
                    ],
                    "fields": {
                        "actors": {
                            "extractors": {
                                "create_kg_node_extractor": {
                                    "config": {
                                        "segment_name": "actor_information"
                                    }
                                }
                            }
                        }
                    }
                }
            ]
        }
        c = Core(extraction_config=e_config)

        # NOTE(review): the expected exception type is not visible here, so
        # the broad ``Exception`` check is kept as-is.
        with self.assertRaises(Exception):
            c.process(doc)

예제 #9

0

파일 보기

파일: test_extractions_using_spacy.py 프로젝트: vlall/etk

 def test_spacy_date(self):
     """``extract_using_spacy`` with ``parse_date`` yields an ISO timestamp."""
     doc = {
         "url": "http://date.test.com",
         "doc_id": "12344",
         "content_extraction": {
             "useful_text": {
                 "text": u"Alert: Tue, 2006-02-07"
             }
         }
     }
     e_config = {
         "document_id": "doc_id",
         'data_extraction': [{
             "fields": {
                 "event_date": {
                     "extractors": {
                         "extract_using_spacy": {
                             "config": {
                                 "post_filter": "parse_date"
                             }
                         }
                     }
                 }
             },
             "input_path": ["content_extraction.useful_text.text.`parent`"]
         }]
     }
     core = Core(extraction_config=e_config)
     r = core.process(doc)
     kg = r['knowledge_graph']
     self.assertIn('event_date', kg)
     self.assertEqual(kg['event_date'][0]['value'], '2006-02-07T00:00:00')

예제 #10

0

파일 보기

파일: test_extractions_using_regex.py 프로젝트: rahulrawat11/etk

 def test_extractor__no_regex(self):
     """``extract_using_regex`` without a ``regex`` entry raises ``KeyError``."""
     e_config = {
         "data_extraction": [{
             "input_path":
             ["content_extraction.content_strict.text.`parent`"],
             "fields": {
                 "name": {
                     "extractors": {
                         "extract_using_regex": {
                             # No "regex" key is given in this config.
                             "config": {
                                 "include_context": "true",
                                 "regex_options": ["IGNORECASE"],
                                 "pre_filter": [
                                     "x.replace('\\n', '')",
                                     "x.replace('\\r', '')"
                                 ]
                             },
                             "extraction_policy": "replace"
                         }
                     }
                 }
             }
         }]
     }
     c = Core(extraction_config=e_config)
     with self.assertRaises(KeyError):
         # Only the exception matters; the result is deliberately discarded.
         c.process(self.doc)

예제 #11

0

파일 보기

파일: test_extractions_using_regex.py 프로젝트: vinayraod/etk

 def test_extractor_regex(self):
     """Regex extraction on ``content_strict`` finds the name with context.

     Walks the result one level at a time, asserting each key exists
     before descending, then compares the ``extract_using_regex`` output
     with the expected single result.
     """
     e_config = {
         "data_extraction": [{
             "input_path":
             ["content_extraction.content_strict.text.`parent`"],
             "fields": {
                 "name": {
                     "extractors": {
                         "extract_using_regex": {
                             "config": {
                                 "include_context": "true",
                                 "regex":
                                 "(?:my[\\s]+name[\\s]+is[\\s]+([-a-z0-9@$!]+))",
                                 "regex_options": ["IGNORECASE"],
                                 "pre_filter": [
                                     "x.replace('\\n', '')",
                                     "x.replace('\\r', '')"
                                 ]
                             },
                             "extraction_policy": "replace"
                         }
                     }
                 }
             }
         }]
     }
     c = Core(extraction_config=e_config)
     r = c.process(self.doc)

     self.assertIn("content_extraction", r)
     self.assertIn("content_strict", r["content_extraction"])
     strict = r["content_extraction"]["content_strict"]
     self.assertIn("text", strict)
     self.assertIn("tokens", strict)
     self.assertIn("simple_tokens", strict)
     self.assertIn("data_extraction", strict)
     self.assertIn("name", strict["data_extraction"])
     self.assertIn("extract_using_regex", strict["data_extraction"]["name"])

     extraction = strict["data_extraction"]["name"]["extract_using_regex"]
     ex = {
         "results": [{
             "origin": {
                 "score": 1,
                 "segment": "readability_strict",
                 "method": "other_method"
             },
             "context": {
                 'text': u' 27 \n \n \n My name is Helena height 16',
                 "end": 73,
                 "start": 56
             },
             "value": "Helena"
         }]
     }
     self.assertEqual(extraction, ex)

예제 #12

0

파일 보기

파일: test_extractions_from_table.py 프로젝트: vlall/etk

    def test_table_extractor_empty_config(self):
        """Extracted tables equal ``self.table_ex`` after a JSON round trip."""
        c = Core(extraction_config=self.e_config)
        r = c.process(self.doc)

        self.assertIn("content_extraction", r)
        self.assertIn("table", r["content_extraction"])
        tables = r["content_extraction"]["table"]["tables"]
        # Encode and decode again before comparing, as the original test does.
        ex = json.loads(json.JSONEncoder().encode(tables))
        self.assertEqual(ex, self.table_ex)

예제 #13

0

파일 보기

 def test_decode_value_dictionary(self):
     """Dictionary extraction with ``decode_value`` returns ``'Not Margie'``."""
     resources_dir = os.path.dirname(__file__)
     women_name_file_path = os.path.join(
         resources_dir, "resources/case_sensitive_female_name.json")
     name_decoding_dict_path = os.path.join(
         resources_dir, "resources/name_decode.json")

     sample_text = ('My name is Margie and this is a test for extracting '
                    'this name using case sensitive '
                    'dictionary')
     input_doc = {
         'content_extraction': {
             'content_strict': {'text': sample_text}
         },
         'doc_id': 'id',
         'url': 'http://givemeabreak.com'
     }

     dictionary_extractor = {
         "config": {
             'case_sensitive': 'True',
             "dictionary": "women_name",
             "ngrams": 1,
             "joiner": " ",
             "pre_filter": ["x"],
             "post_filter": ["isinstance(x, basestring)"],
             "post_filter_s": "decode_value"
         },
         "extraction_policy": "keep_existing"
     }
     config = {
         "resources": {
             "dictionaries": {"women_name": women_name_file_path},
             "decoding_dictionary": {"name": name_decoding_dict_path}
         },
         "document_id": "doc_id",
         "data_extraction": [{
             "input_path":
             "content_extraction.content_strict.text.`parent`",
             "fields": {
                 "name": {
                     "extractors": {
                         "extract_using_dictionary": dictionary_extractor
                     }
                 }
             }
         }]
     }

     processed = Core(extraction_config=config).process(input_doc)
     first_name = processed['knowledge_graph']['name'][0]
     self.assertEqual(first_name['value'], 'Not Margie')

예제 #14

0

파일 보기

    def test_extract_as_is_post_filter_3(self):
        """``parse_number`` keeps ``'54'`` and ``'34.0'`` and drops ``'redme34'``.

        Three actor sizes go in; the knowledge graph is expected to hold
        only the two values asserted below.
        """
        doc = {
            "uri": "1",
            "event_actors": [{
                "description": "Non-State, Internal, No State Sanction",
                "id": "internalnononstatesanctionstate",
                "size": "54"
            }, {
                "description": "Noncombatant Status Asserted",
                "id":
                "assertedcontestednoncombatantnoncombatantnotstatusstatus",
                "size": "34.0"
            }, {
                "description": "Noncombatant Status Asserted",
                "id":
                "assertedcontestednoncombatantnoncombatantnotstatusstatus",
                "size": "redme34"
            }]
        }

        e_config = {
            "extraction_policy": "replace",
            "error_handling": "raise_error",
            "document_id": "uri",
            "content_extraction": {
                "json_content": [{
                    "input_path": "event_actors[*].size",
                    "segment_name": "actor_size"
                }]
            },
            "data_extraction": [{
                "input_path": "content_extraction.actor_size[*].text.`parent`",
                "fields": {
                    "actor_size": {
                        "extractors": {
                            "extract_as_is": {
                                "extraction_policy": "keep_existing",
                                "config": {
                                    "post_filter": ["parse_number"]
                                }
                            }
                        }
                    }
                }
            }]
        }
        c = Core(extraction_config=e_config)
        r = c.process(doc)
        sizes = r['knowledge_graph']['actor_size']
        self.assertEqual(len(sizes), 2)
        self.assertEqual(sizes[0]['value'], '54')
        self.assertEqual(sizes[1]['value'], '34.0')

예제 #15

0

파일 보기

파일: test_default_spacy.py 프로젝트: flydrt/etk

    def test_extraction_from_default_spacy(self):
        """Each input's knowledge graph matches the ground truth at the same index."""
        c = Core(extraction_config=self.e_config, load_spacy=True)
        for i, input_doc in enumerate(self.ground_truth_input):
            r = c.process(input_doc,
                          create_knowledge_graph=True,
                          html_description=False)

            # assertEqual replaces the deprecated assertEquals alias, which
            # was removed in Python 3.12.
            self.assertEqual(self.ground_truth_output[i]['knowledge_graph'],
                             r['knowledge_graph'])

예제 #16

0

파일 보기

파일: test_extractions_from_table.py 프로젝트: vlall/etk

    def test_table_extractor(self):
        """Extracted tables equal ``self.table_ex``; they are also dumped to disk.

        Writes the tables as JSON to ``table_out.jl`` in the current
        working directory.
        """
        c = Core(extraction_config=self.e_config)
        r = c.process(self.doc)

        # Assert the structure before indexing into it, so a missing key
        # fails as a readable assertion rather than a KeyError in the dump.
        self.assertIn("content_extraction", r)
        self.assertIn("table", r["content_extraction"])
        tables = r["content_extraction"]["table"]["tables"]

        with open("table_out.jl", "w") as f:
            f.write(json.dumps(tables))

        ex = json.loads(json.JSONEncoder().encode(tables))
        self.assertEqual(ex, self.table_ex)

예제 #17

0

파일 보기

    def test_create_kg_node(self):
        """``create_kg_node_extractor`` yields one KG actor and one nested doc.

        Checks the nested document's ``content_extraction`` and metadata
        keys, and that the actor's provenance ``timestamp_created``
        qualifier equals the nested document's ``@timestamp_created``.
        """
        doc = {
            "url": "http:www.hitman.org",
            "doc_id":
            "19B0EAB211CD1D3C63063FAB0B2937043EA1F07B5341014A80E7473BA7318D9E",
            "actors": {
                "name": "agent 47",
                "affiliation": "International Contract Agency"
            }
        }

        e_config = {
            "document_id": "doc_id",
            "data_extraction": [{
                "input_path": ["actors"],
                "fields": {
                    "actors": {
                        "extractors": {
                            "create_kg_node_extractor": {
                                "config": {
                                    "segment_name": "actor_information"
                                }
                            }
                        }
                    }
                }
            }]
        }
        c = Core(extraction_config=e_config)
        r = c.process(doc)

        # These assertions read the input ``doc`` rather than the result ``r``.
        self.assertIn('knowledge_graph', doc)
        self.assertIn('actors', doc['knowledge_graph'])
        self.assertEqual(len(doc['knowledge_graph']['actors']), 1)

        self.assertIn('nested_docs', r)
        self.assertEqual(len(r['nested_docs']), 1)
        nested_doc = r['nested_docs'][0]
        ce_expected = {
            "actor_information": {
                "affiliation": "International Contract Agency",
                "name": "agent 47"
            }
        }

        self.assertEqual(nested_doc['content_extraction'], ce_expected)
        # Disabled check kept from the original test:
        # self.assertTrue('parent_doc_id' in nested_doc)
        # self.assertEqual(nested_doc['parent_doc_id'],
        #                  '19B0EAB211CD1D3C63063FAB0B2937043EA1F07B5341014A80E7473BA7318D9E')
        self.assertIn('created_by', nested_doc)
        self.assertIn('@timestamp_created', nested_doc)
        self.assertIn('url', nested_doc)

        provenance = r['knowledge_graph']['actors'][0]['provenance'][0]
        self.assertEqual(provenance['qualifiers']['timestamp_created'],
                         nested_doc['@timestamp_created'])

예제 #18

0

파일 보기

    def test_negative_case_sensitive(self):
        """Lower-case ``margie`` with ``case_sensitive`` set yields no extraction.

        The token fields must be present on ``content_strict`` while
        ``data_extraction`` must be absent.
        """
        women_name_file_path = os.path.join(
            os.path.dirname(__file__),
            "resources/case_sensitive_female_name.json")
        doc = {
            'content_extraction': {
                'content_strict': {
                    'text':
                    'My name is margie and this is a test for extracting this name using case sensitive '
                    'dictionary'
                }
            },
            'doc_id': 'id',
            'url': 'http://givemeabreak.com'
        }
        e_config = {
            "resources": {
                "dictionaries": {
                    "women_name": women_name_file_path
                }
            },
            "document_id": "doc_id",
            "data_extraction": [{
                "input_path":
                "content_extraction.content_strict.text.`parent`",
                "fields": {
                    "name": {
                        "extractors": {
                            "extract_using_dictionary": {
                                "config": {
                                    'case_sensitive': 'trUe',
                                    "dictionary": "women_name",
                                    "ngrams": 1,
                                    "joiner": " ",
                                    "pre_filter": ["x"],
                                    "post_filter":
                                    ["isinstance(x, basestring)"]
                                },
                                "extraction_policy": "keep_existing"
                            }
                        }
                    }
                }
            }]
        }
        c = Core(extraction_config=e_config)
        r = c.process(doc)

        strict = r["content_extraction"]["content_strict"]
        self.assertIn("simple_tokens", strict)
        self.assertIn('simple_tokens_original_case', strict)
        self.assertNotIn("data_extraction", strict)

예제 #19

0

파일 보기

파일: test_kg_enhancement.py 프로젝트: vlall/etk

    def test_guard_field_regex_pass(self):
        """With a regex guard on ``fieldA``, ``self.doc`` ends up with one ``name``."""
        self.e_config['kg_enhancement']['fields']['name']['guard'] = [{
            "field": "fieldA",
            "regex": "ach"
        }]
        c = Core(extraction_config=self.e_config)
        # The return value is not needed: every assertion below reads self.doc.
        c.process(self.doc)

        self.assertIn('knowledge_graph', self.doc)
        kg = self.doc['knowledge_graph']
        self.assertIn('name', kg)
        self.assertEqual(len(kg['name']), 1)

예제 #20

0

파일 보기

파일: test_kg_enhancement.py 프로젝트: vlall/etk

    def test_guard_field_stop_value_fail(self):
        """With ``stop_value`` ``"SACHIN"`` on ``fieldA``, one ``name`` is present."""
        self.e_config['kg_enhancement']['fields']['name']['guard'] = [{
            "field": "fieldA",
            "stop_value": "SACHIN"
        }]
        c = Core(extraction_config=self.e_config)
        # The return value is not needed: every assertion below reads self.doc.
        c.process(self.doc)

        self.assertIn('knowledge_graph', self.doc)
        kg = self.doc['knowledge_graph']
        self.assertIn('name', kg)
        self.assertEqual(len(kg['name']), 1)

예제 #21

0

파일 보기

파일: test_kg_enhancement.py 프로젝트: vlall/etk

    def test_guard_url_fail(self):
        """With a ``url`` value guard, two ``name`` entries are present.

        The first entry is expected to have confidence 1.0.
        """
        self.e_config['kg_enhancement']['fields']['name']['guard'] = [{
            "field": "url",
            "value": "http://www.testffffffurl.com"
        }]
        c = Core(extraction_config=self.e_config)
        # The return value is not needed: every assertion below reads self.doc.
        c.process(self.doc)

        self.assertIn('knowledge_graph', self.doc)
        kg = self.doc['knowledge_graph']
        self.assertIn('name', kg)
        self.assertEqual(len(kg['name']), 2)
        self.assertEqual(kg['name'][0]['confidence'], 1.0)

예제 #22

0

파일 보기

파일: test_default_spacy.py 프로젝트: vlall/etk

    def test_extraction_from_default_spacy(self):
        """Each input's knowledge graph matches the ground truth at the same index."""
        c = Core(extraction_config=self.e_config, load_spacy=True)
        # The original opened a debug file ('temp') that was never written
        # to or closed; that leaked handle and its commented-out writes
        # have been removed.
        for i, input_doc in enumerate(self.ground_truth_input):
            r = c.process(input_doc,
                          create_knowledge_graph=True,
                          html_description=False)
            # assertEqual replaces the deprecated assertEquals alias, which
            # was removed in Python 3.12.
            self.assertEqual(self.ground_truth_output[i]['knowledge_graph'],
                             r['knowledge_graph'])

예제 #23

0

파일 보기

    def test_extract_as_is_post_filter(self):
        """``extract_as_is`` with ``x.upper()`` yields one upper-cased title.

        Two actors go in, one with an empty title; the knowledge graph is
        expected to hold a single ``actor_title`` value.
        """
        doc = {
            "uri": "1",
            "event_actors": [{
                "description": "Non-State, Internal, No State Sanction",
                "id": "internalnononstatesanctionstate",
                "title": ""
            }, {
                "description": "Noncombatant Status Asserted",
                "id":
                "assertedcontestednoncombatantnoncombatantnotstatusstatus",
                "title": "Noncombatant Status Not Contested"
            }]
        }

        e_config = {
            "extraction_policy": "replace",
            "error_handling": "raise_error",
            "document_id": "uri",
            "content_extraction": {
                "json_content": [{
                    "input_path": "event_actors[*].title",
                    "segment_name": "actor_title"
                }]
            },
            "data_extraction": [{
                "input_path":
                "content_extraction.actor_title[*].text.`parent`",
                "fields": {
                    "actor_title": {
                        "extractors": {
                            "extract_as_is": {
                                "extraction_policy": "keep_existing",
                                "config": {
                                    "post_filter": ["x.upper()"]
                                }
                            }
                        }
                    }
                }
            }]
        }
        c = Core(extraction_config=e_config)
        r = c.process(doc)
        self.assertIn('actor_title', r['knowledge_graph'])
        titles = r['knowledge_graph']['actor_title']
        self.assertEqual(len(titles), 1)
        self.assertEqual(titles[0]['value'],
                         'noncombatant status not contested'.upper())

예제 #24

0

파일 보기

파일: test_kg_enhancement.py 프로젝트: vlall/etk

    def test_guard_field_value_pass(self):
        """With value guard ``"sachin"`` on ``fieldA``, the one ``name`` is ``'Aname'``.

        The entry is also expected to have confidence 1.0.
        """
        self.e_config['kg_enhancement']['fields']['name']['guard'] = [{
            "field": "fieldA",
            "value": "sachin"
        }]
        c = Core(extraction_config=self.e_config)
        # The return value is not needed: every assertion below reads self.doc.
        c.process(self.doc)

        self.assertIn('knowledge_graph', self.doc)
        kg = self.doc['knowledge_graph']
        self.assertIn('name', kg)
        self.assertEqual(len(kg['name']), 1)
        self.assertEqual(kg['name'][0]['confidence'], 1.0)
        self.assertEqual(kg['name'][0]['value'], 'Aname')

예제 #25

0

파일 보기

파일: test_extraction_from_landmark.py 프로젝트: vlall/etk

    def test_extract_landmark_post_filter(self):
        """``extract_from_landmark`` with ``extract_phone`` yields the expected phone entry."""
        e_config = {
            "document_id": "doc_id",
            "data_extraction": [{
                "input_path": ["*.inferlink_extractions.*.text.`parent`"],
                "fields": {
                    "phone": {
                        "extractors": {
                            "extract_from_landmark": {
                                "config": {
                                    "fields": ["inferlink_phone"],
                                    "post_filter": ["extract_phone"]
                                },
                                "extraction_policy": "replace"
                            }
                        }
                    }
                }
            }]
        }
        c = Core(extraction_config=e_config)
        r = c.process(self.doc)
        self.assertIn("knowledge_graph", r)
        ex_phone = r["knowledge_graph"]["phone"]

        expected_phone = [{
            "confidence": 1,
            "provenance": [{
                "source": {
                    "extraction_metadata": {
                        "obfuscation": "False"
                    },
                    "segment": "html",
                    "document_id":
                    "1A4A5FF5BD066309C72C8EEE6F7BCCCFD21B83245AFCDADDF014455BCF990A21"
                },
                "confidence": {
                    "extraction": 1.0
                },
                "method": "inferlink",
                "extracted_value": "3234522013"
            }],
            "key": "3234522013",
            "value": "3234522013"
        }]
        self.assertEqual(ex_phone, expected_phone)

예제 #26

0

파일 보기

    def test_landmark_with_field_name(self):
        """The landmark extractor fills ``inferlink_extractions`` and nothing else.

        Asserts that no ``content_strict``, ``content_relaxed`` or
        ``title`` segment is produced and that the landmark extractions
        equal the expected dictionary.
        """
        rules_file_path = os.path.join(os.path.dirname(__file__),
                                       "resources/consolidated_rules.json")
        e_config = {
            "resources": {
                "landmark": [rules_file_path]
            },
            'content_extraction': {
                "input_path": "raw_content",
                "extractors": {
                    "landmark": {
                        "field_name": "inferlink_extractions",
                        "extraction_policy": "keep_existing",
                        "landmark_threshold": 0.5
                    }
                }
            }
        }
        c = Core(extraction_config=e_config)
        r = c.process(self.doc)
        self.assertIn("content_extraction", r)
        content = r["content_extraction"]
        self.assertIn("inferlink_extractions", content)
        self.assertGreater(len(content["inferlink_extractions"].keys()), 0)

        self.assertNotIn("content_strict", content)
        self.assertNotIn("content_relaxed", content)
        self.assertNotIn("title", content)

        ifl_extractions = {
            "inferlink_location": {
                "text": "Los Angeles, California"
            },
            "inferlink_age": {
                "text": "23"
            },
            "inferlink_phone": {
                "text": "323-452-2013"
            },
            "inferlink_description": {
                "text":
                "Hey I'm luna 3234522013 Let's explore , embrace and indulge in your favorite fantasy % independent. discreet no drama Firm Thighs and Sexy. My Soft skin & Tight Grip is exactly what you deserve Call or text Fetish friendly Fantasy friendly Party friendly 140 Hr SPECIALS 3234522013\n"
            },
            "inferlink_posting-date": {
                "text": "2017-01-02 06:46"
            }
        }

        self.assertEqual(content["inferlink_extractions"], ifl_extractions)

예제 #27

0

파일 보기

파일: test_content_extractions.py 프로젝트: rahulrawat11/etk

 def test_landmark_no_resources(self):
     """Check that ``process`` raises ``KeyError`` for a landmark extractor
     configured without any ``resources`` entry in the extraction config.
     """
     e_config = {
         'content_extraction': {
             "input_path": "raw_content",
             "extractors": {
                 "landmark": {
                     "field_name": "inferlink_extractions",
                     "extraction_policy": "keep_existing",
                     "landmark_threshold": 0.5
                 }
             }
         }
     }
     c = Core(extraction_config=e_config)
     with self.assertRaises(KeyError):
         # Only the raised exception matters; the return value is unused.
         c.process(self.doc)

예제 #28

0

파일 보기

    def test_table_extractor_empty_config(self):
        """Run the table extractor with an empty config dict and compare
        the extracted tables against the ``self.table_ex`` fixture.
        """
        e_config = {
            'content_extraction': {
                "input_path": "raw_content",
                "extractors": {
                    "table": {}
                }
            }
        }
        c = Core(extraction_config=e_config)
        r = c.process(self.doc)

        # assertIn reports the missing key and the container on failure.
        self.assertIn("content_extraction", r)
        self.assertIn("table", r["content_extraction"])
        # Round-trip through JSON before comparing; presumably this
        # normalizes the result to plain JSON types so it matches the
        # fixture -- TODO confirm how self.table_ex is loaded.
        ex = json.loads(json.dumps(r["content_extraction"]["table"]))
        self.assertEqual(ex, self.table_ex)

예제 #29

0

파일 보기

파일: test_extraction_from_landmark.py 프로젝트: rahulrawat11/etk

    def test_extract_landmark_post_filter(self):
        """Extract ``phone`` from the ``inferlink_phone`` landmark with the
        ``extract_phone`` post filter and check the stored result.

        NOTE(review): the config has no ``content_extraction`` section, so
        this assumes ``self.doc`` already carries
        ``content_extraction.inferlink_extractions`` -- confirm in setUp.
        """
        e_config = {
            "data_extraction": [{
                "input_path": ["*.inferlink_extractions.*.text.`parent`"],
                "fields": {
                    "phone": {
                        "extractors": {
                            "extract_from_landmark": {
                                "config": {
                                    "fields": ["inferlink_phone"],
                                    "post_filter": ["extract_phone"]
                                },
                                "extraction_policy": "replace"
                            }
                        }
                    }
                }
            }]
        }
        c = Core(extraction_config=e_config)
        r = c.process(self.doc)
        inferlink_extractions = r["content_extraction"][
            "inferlink_extractions"]

        # Walk the nesting one level at a time so each assertIn names the
        # exact key that is missing and the dict it was expected in.
        phone_landmark = inferlink_extractions["inferlink_phone"]
        self.assertIn("data_extraction", phone_landmark)
        data_extraction = phone_landmark["data_extraction"]
        self.assertIn("phone", data_extraction)
        phone_field = data_extraction["phone"]
        self.assertIn("extract_from_landmark", phone_field)
        ex_phone = phone_field["extract_from_landmark"]

        expected_phone = {
            'results': [{
                'origin': {
                    'score': 1.0,
                    'segment': 'html',
                    'method': 'inferlink'
                },
                # The expected value is the string 'False', not a boolean.
                'obfuscation': 'False',
                'value': '3234522013'
            }]
        }
        self.assertEqual(ex_phone, expected_phone)

예제 #30

0

파일 보기

    def test_extractor_dictionary_no_resources(self):
        """Expect ``KeyError`` from ``process`` when the dictionary extractor
        names the ``women_name`` dictionary while ``resources.dictionaries``
        is empty.
        """
        e_config = {
            "resources": {
                # Deliberately empty: no dictionary is registered here.
                "dictionaries": {
                }
            },
            "data_extraction": [
                {
                    "input_path": "content_extraction.content_strict.text.`parent`"
                    ,
                    "fields": {
                        "name": {
                            "extractors": {
                                "extract_using_dictionary": {
                                    "config": {
                                        "dictionary": "women_name",
                                        "ngrams": 1,
                                        "joiner": " ",
                                        "pre_process": [
                                            "x.lower()"
                                        ],
                                        "pre_filter": [
                                            "x"
                                        ],
                                        "post_filter": [
                                            "isinstance(x, basestring)"
                                        ]
                                    },
                                    "extraction_policy": "keep_existing"
                                }
                            }

                        }
                    }
                }
            ]
        }
        c = Core(extraction_config=e_config)
        with self.assertRaises(KeyError):
            r = c.process(self.doc)
            # NOTE(review): when process() raises as expected, control
            # leaves the with-block immediately and the assertions below
            # never run.
            self.assertTrue("content_extraction" in r)
            self.assertTrue("content_strict" in r["content_extraction"])
            self.assertTrue("text" in r["content_extraction"]["content_strict"])
            self.assertTrue("tokens" in r["content_extraction"]["content_strict"])
            self.assertTrue("simple_tokens" in r["content_extraction"]["content_strict"])