def test_gender_extractor(self):
        # Run the gender extractor over one free-text field and check the
        # value stored under the 'extracted' output field.
        source = {'content': 'female ts male gender', 'b': 'world'}

        gender = GenderExtractor().set_metadata({'extractor': 'gender'})
        processor = (
            ExtractorProcessor()
            .set_input_fields(['content'])
            .set_output_field('extracted')
            .set_extractor(gender)
        )
        result_doc = processor.extract(source)
        self.assertEqual(result_doc['extracted']['value'], 'transgender')
Exemplo n.º 2
0
 def test_multiple_list_inputs_matching_multiple_extractions_unioned(self):
     # Two processors extract from the list fields 'a' and 'b'; downstream
     # processors take those processors as (flat-mapped) inputs.
     source = {'a': ['hello', 'goodbye'], 'b': ['world', 'cup']}
     single = SampleSingleRenamedFieldExtractor()
     ep1 = (ExtractorProcessor()
            .set_input_fields('a')
            .set_output_fields('hp.word')
            .set_extractor(single)
            .set_name("h"))
     ep2 = (ExtractorProcessor()
            .set_input_fields('b')
            .set_output_fields('hr.word')
            .set_extractor(single)
            .set_name("w"))
     flat_single = SampleFlatMappedSingleRenamedFieldExtractor()
     ep3 = (ExtractorProcessor()
            .set_extractor_processor_inputs([[ep1, ep2]])
            .set_output_fields('hhhhh.word')
            .set_extractor(flat_single)
            .set_name("m")
            .set_flat_map_inputs(True))

     # Stage 1: populate hp.word / hr.word, then feed both into ep3.
     result_doc = execute_processor_chain(source, [ep1, ep2])
     result_doc = execute_processor_chain(result_doc, [ep3])
     first_union = result_doc['hhhhh']['word'][0]['result']['value']
     self.assertEqual(first_union, "hellogoodbyeworldcup")

     # Stage 2: ep4 takes two input groups, each pairing a base processor
     # with ep3.
     flat_multi = SampleFlatMappedMultipleRenamedFieldExtractor()
     ep4 = (ExtractorProcessor()
            .set_extractor_processor_inputs([[ep1, ep3], [ep2, ep3]])
            .set_output_fields('iiiii.word')
            .set_extractor(flat_multi)
            .set_name("m")
            .set_flat_map_inputs(True))
     result_doc = execute_processor_chain(result_doc, [ep4])
     second_union = result_doc['iiiii']['word'][0]['result']['value']
     self.assertEqual(
         second_union,
         "hellogoodbyehellogoodbyeworldcupworldcuphellogoodbyeworldcup")
    def test_ngrams_words_name_extractor(self):
        # Build a trie from the lower-cased names and sanity-check it:
        # 'barbara' maps to a string while the prefix 'bar' does not.
        names = self.load_file()
        t = populate_trie(map(lambda x: x.lower(), names))
        # NOTE: basestring only exists on Python 2.
        self.assertIsInstance(t.get('barbara'), basestring)
        self.assertNotIsInstance(t.get('bar'), basestring)

        doc = {
            "foo": [
                "at", "the", "market", "jean", "marie", "bought", "a", "loaf",
                "of", "bread"
            ]
        }
        # With n-grams of size 2 joined by a space, the expected results are
        # the single tokens 'jean' and 'marie' plus the bigram 'jean marie'.
        e = get_name_dictionary_extractor(t)
        e.set_ngrams(2)
        e.set_joiner(' ')
        ep = ExtractorProcessor().set_input_fields('foo').set_output_field(
            'names').set_extractor(e)

        updated_doc = ep.extract(doc)
        results = updated_doc['names'][0]['result']
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(results[0]['value'], 'jean')
        self.assertEqual(results[1]['value'], 'marie')
        self.assertEqual(results[2]['value'], 'jean marie')
Exemplo n.º 4
0
    def test_height_weight_extractor_separated(self):
        # One extractor writes to two output fields ('height' and 'weight');
        # each is checked against its expected structured value.
        content = "\n TS RUBI: THE NAME SAYS IT ALL!  \n INCALL $250 OUTCALL $350 \n \n \n \n \n \n Gender \n Age \n Ethnicity \n Hair Color \n Eye Color \n Height \n Weight \n Measurements \n Affiliation \n Availability \n Available To \n \n \n \n \n Transsexual \n 27 \n Latino/Hispanic \n Brown \n Hazel \n 5'5\" \n 130 lb \n 34C - 28\" - 34\" \n "
        source = {'content': content, 'b': 'world'}

        hw_extractor = HeightWeightExtractor().set_metadata(
            {'extractor': 'height_weight'})
        processor = (
            ExtractorProcessor()
            .set_input_fields(['content'])
            .set_output_fields(['height', 'weight'])
            .set_extractor(hw_extractor)
        )
        result_doc = processor.extract(source)

        expected_height = {
            'foot': ['5\'5"'],
            'raw': [{'foot': 5, 'inch': 5}],
            'centimeter': [165],
        }
        expected_weight = {
            'raw': [{'pound': 130}],
            'pound': [130],
            'kilogram': [58],
        }
        self.assertEqual(result_doc['height'][0]['result']['value'],
                         expected_height)
        self.assertEqual(result_doc['weight'][0]['result']['value'],
                         expected_weight)
    def test_drug_use_extractor_with_context(self):
        # With context enabled, every result carries 'start'/'end' offsets
        # in addition to its value.
        source = {'content': 'she is on drugs a dope w****s and a junkie who is on some heavy drugs and does drugs and an adict, smells bad', 'b': 'world'}

        drug_extractor = DrugUseExtractor().set_metadata({'extractor': 'drug'})
        drug_extractor.set_include_context(True)
        processor = (
            ExtractorProcessor()
            .set_input_fields(['content'])
            .set_output_field('extracted')
            .set_extractor(drug_extractor)
        )
        result_doc = processor.extract(source)
        result = result_doc['extracted'][0]['result']

        # (value, context start, context end), in the expected result order.
        expected = [
            ('dope w****s', 18, 29),
            ('junkie', 36, 42),
            ('adict', 92, 97),
            ('on drugs', 7, 15),
            ('on some heavy drugs', 50, 69),
            ('does drugs', 74, 84),
        ]
        for position, (value, start, end) in enumerate(expected):
            self.assertEqual(result[position]['value'], value)
            self.assertEqual(result[position]['context']['start'], start)
            self.assertEqual(result[position]['context']['end'], end)
Exemplo n.º 6
0
    def test_nationality_type_extractor_context(self):
        # Nationality extraction over a token list with context enabled;
        # the expected start/end values index into the token list.
        tokens = [
            'American', 'Antiguans', 'Panamanian', 'Hello', 'World',
            'East', 'Timorese', 'kittian', 'and', 'nevisian', 'End'
        ]
        source = {'content': tokens, 'b': 'world'}

        nationality = get_nationality_extractor()
        nationality = nationality.set_metadata({'extractor': 'nationality'})
        nationality = nationality.set_include_context(True)
        processor = (
            ExtractorProcessor()
            .set_input_fields('content')
            .set_output_field('extracted')
            .set_extractor(nationality)
        )
        result_doc = processor.extract(source)
        result = result_doc['extracted'][0]['result']

        # (value, context start, context end), in the expected result order.
        expected = [
            ('american', 0, 1),
            ('antiguans', 1, 2),
            ('panamanian', 2, 3),
            ('east timorese', 5, 7),
            ('kittian and nevisian', 7, 10),
        ]
        for position, (value, start, end) in enumerate(expected):
            self.assertEqual(result[position]['value'], value)
            self.assertEqual(result[position]['context']['start'], start)
            self.assertEqual(result[position]['context']['end'], end)
Exemplo n.º 7
0
    def test_context_extractor(self):
        # Stage 1: run a context-producing extractor over field 'b' and
        # check values, context offsets and that the inputs are untouched.
        doc = {'a': '', 'b': ['borscht', 'Bourne', 'barn', 'Block']}
        e1 = SampleContextExtractor()
        ep1 = ExtractorProcessor().set_input_fields('b')\
                                  .set_output_fields('f')\
                                  .set_extractor(e1).set_name("mo")
        updated_doc = execute_processor_chain(doc, [ep1])

        hits = updated_doc['f'][0]['result']
        self.assertEqual(hits[0]['value'], 'Bourne')
        self.assertEqual(hits[0]['context']['start'], 1)
        self.assertEqual(hits[0]['context']['end'], 2)
        self.assertEqual(hits[1]['value'], 'Block')
        self.assertEqual(hits[1]['context']['start'], 3)
        self.assertEqual(hits[1]['context']['end'], 4)
        self.assertEqual(updated_doc['a'], '')
        self.assertEqual(updated_doc['b'],
                         ['borscht', 'Bourne', 'barn', 'Block'])

        # Stage 2: a second processor consumes the output of the first.
        e2 = SampleSingleRenamedFieldExtractor()
        ep2 = ExtractorProcessor().set_extractor_processor_inputs(ep1)\
                                  .set_output_fields('g')\
                                  .set_extractor(e2).set_name("no")
        updated_doc2 = execute_processor_chain(updated_doc, [ep2])
        # Check the document returned by the second chain; the previous
        # assertion inspected the stage-1 document and left updated_doc2
        # unused, so it only passed if the chain mutated its input in place.
        self.assertEqual(updated_doc2['g'][0]['result']['value'], 'Bourne')
    def test_empty_price_extractor(self):
        # Text without any price must not create the output field at all.
        source = {'content': 'something unrelated', 'b': 'world'}

        price_extractor = PriceExtractor().set_metadata({'extractor': 'price'})
        processor = (
            ExtractorProcessor()
            .set_input_fields(['content'])
            .set_output_field('extracted')
            .set_extractor(price_extractor)
        )
        result_doc = processor.extract(source)
        self.assertNotIn('extracted', result_doc)
Exemplo n.º 9
0
 def test_state_extractor(self):
     doc = {'content': ["", "ID#182730", "-", "florida", "LegendaryDave", "-", "Salt", "Lake", "-", "Gay", "Escorts", "&", "Gay", "Massage", "", "LegendaryDave", "2120.5", "Miles", "Away", "THE", "LEGENDARY", "DAVE-", "1ST", "TIME", "IN", "SALT", "LAKE", "CITY", "Over", "200", "Reviews!", "***Voted", "one", "of", "the", "Top", "50", "Escorts", "in", "the", "USA,", "for", "the", "last", "5", "years", "in", "a", "row***", "Secure,", "Masculine,", "Adventurous.", "Top/Vers", "Stud.", "Teacher", "to", "the", "New/Curious.", "Good-looking,", "Bi,", "Sense", "of", "Humor,", "Fun.", "Great", "Body.", "Educated,", "Honest,", "Sane.", "Healthy/DDF/Neg.", "AM,PM,", "24/7,", "Overnights,", "Multi-Days.", "Very", "Skilled:", "Massage", "or", "Vanilla", "or", "Kink", "or", "Wild.", "Discreet", "&", "Professional.", "Incalls", "&", "Outcalls:", "USA", "&", "Abroad.", "VISIT", "MY", "WEBSITE", "FOR", "RATES,", "MORE", "HOT", "PICS,", "ALL", "MY", "REVIEWS,", "HIRING", "TIPS", "&", "MORE.", "his", "stats", "AGE:", "45", "Role:", "Versatile", "HEIGHT:", "5'11\"", "(180cm)", "WEIGHT:", "150", "-", "170", "lbs", "(68", "-", "77", "kg)", "Piercings:", "Not", "Specified", "RACE:", "White", "HAIR", "COLOR:", "Dark", "Brown", "EYE", "COLOR:", "Green", "Open", "To", "LTR", ":", "Yes", "Languages:", "English,", "Other", "BODY", "TYPE:", "Muscular/Buff", "BODY", "HAIR:", "Moderately", "hairy", "Tattoos:", "Not", "Specified", "Smoker:", "Not", "Specified", "reviews", "\r", "M4RN", "Reviews:", "8", "Most", "Recent", "M4RN", "Star", "Review:", "04/30/2016", "", "services", "provided", "Escort", "yes", "HOT", "Massage", "yes", "Massage", "yes", "Registered", "Therapist", "yes", "In", "Calls", "yes", "Out", "Calls", "yes", "US", "Travel", "yes", "Int'l", "Travel", "yes", "Advertiser", "Since", "Dec", "'07", "contact", "info", "Phone:", "PREFERS", "PHONE", "CONTACT", "location", "2120.5", "Miles", "area:", "Salt", "Lake", "City", "/", "Ogden", "Local", "City:", "Salt", "Lake", "Postal", "Code:", 
"84101", "availability", "", "", "s", "m", "t", "w", "t", "f", "s", "7am-11am", "", "11am-3pm", "", "3pm-7pm", "", "7pm-11pm", "", "11pm-3am", "", "3am-7am", "", "", "Elite", "&", "Platinum", "Advertisers", "ELITE", "Toronto", "Barcelona", "San", "Francisco", "/", "Oakland", "Cleveland", "/", "Lorain", "/", "Elyria", "Los", "Angeles", "/", "West", "Hollywood", "Find", "LegendaryDave,", "Rent", "Men", "and", "Male", "Massage", "in", "Salt", "Lake", "", ""]}
     states = populate_trie(map(lambda x: x.lower(), self.load_file("states.json")))
     stop_words = populate_trie(map(lambda x: x.lower(), self.load_file("stop_words.json")))
     extractor = get_city_dictionary_extractor(states, stop_words)
     extractor_processor = ExtractorProcessor().set_input_fields(['content']).set_output_field('state').set_extractor(extractor)
     updated_doc = extractor_processor.extract(doc)
     self.assertEqual(updated_doc['state'][0]['result'][0]['value'], u'florida')
    def test_age_extractor(self):
        # NOTE(review): despite its name, this test drives the extractor
        # returned by get_service_extractor().
        source = {'content': ['FOV', 'HELLO', 'WORLD'], 'b': 'world'}

        service = get_service_extractor().set_metadata({'extractor': 'service'})
        processor = (
            ExtractorProcessor()
            .set_input_fields('content')
            .set_output_field('extracted')
            .set_extractor(service)
        )
        result_doc = processor.extract(source)
        hits = result_doc['extracted'][0]['result']
        self.assertEqual(hits[0]['value'], 'fov')
Exemplo n.º 11
0
    def test_nested_field_filtered_extractor(self):
        # The input path filters list 'a' to entries whose 'c' is 'good'
        # and reads their 'b' value.
        source = {'a': [{'b': 'world', 'c': 'good'}, {'b': 'cup', 'c': 'bad'}]}
        sample = SampleSingleRenamedFieldExtractor()
        processor = (
            ExtractorProcessor()
            .set_input_fields('a[?c=good].b')
            .set_output_field('e')
            .set_extractor(sample)
        )
        result_doc = processor.extract(source)

        self.assertEqual(result_doc['e'][0]['result']['value'], 'world')
Exemplo n.º 12
0
    def test_city_extractor(self):
        # Dictionary-based city lookup over a long token list; the first
        # reported match is expected to be 'orlando'.
        doc = {'content': ["orlando", "Teacher", "to", "the", "New/Curious.", "Good-looking,", "Bi,", "Sense", "of", "Humor,", "Fun.", "Great", "Body.", "Educated,", "Honest,", "Sane.", "Healthy/DDF/Neg.", "AM,PM,", "24/7,", "Overnights,", "Multi-Days.", "Very", "Skilled:", "Massage", "or", "Vanilla", "or", "Kink", "or", "Wild.", "Discreet", "&", "Professional.", "Incalls", "&", "Outcalls:Abroad.", "VISIT", "MY", "WEBSITE", "FOR", "RATES,", "MORE", "HOT", "PICS,", "ALL", "MY", "REVIEWS,", "HIRING", "TIPS", "&", "MORE.", "", "", "", "", "", "his", "stats", "", "", "", "", "AGE:", "", "45", "", "Role:", "", "", "Versatile", "", "HEIGHT:", "", "5'11\"", "(180cm)", "", "WEIGHT:", "", "150", "-", "170", "lbs", "(68", "-", "77", "kg)", "", "Piercings:", "", "Not", "Specified", "", "", "", "RACE:", "", "White", "", "HAIR", "COLOR:", "", "Dark", "Brown", "", "EYE", "COLOR:", "", "Green", "", "Open", "To", "", "LTR", ":", "", "Yes", "", "Languages:", "", "English,", "Other", "", "", "", "BODY", "TYPE:", "", "Muscular/Buff", "", "BODY", "HAIR:", "", "Moderately", "hairy", "", "Tattoos:", "", "Not", "Specified", "", "Smoker:", "", "Not", "Specified", "", "", "", "", "", "", "", "reviews", "", "", "\r", "", "M4RN", "Reviews:", "", "8", "", "Most", "Recent", "M4RN", "Star", "Review:", "", "04/30/2016", "", "", "", "", "", "", "", "", "", "", "services", "provided", "", "", "", "Escort", "", "yes", "", "HOT", "Massage", "", "yes", "", "Massage", "", "yes", "", "Registered", "Therapi"]}

        # Both dictionaries are lower-cased before being loaded into tries.
        city_names = self.load_file("cities.json")
        city_trie = populate_trie(map(lambda x: x.lower(), city_names))
        stop_word_list = self.load_file("stop_words.json")
        stop_word_trie = populate_trie(map(lambda x: x.lower(), stop_word_list))

        city_extractor = get_city_dictionary_extractor(city_trie,
                                                       stop_word_trie)
        processor = (
            ExtractorProcessor()
            .set_input_fields(['content'])
            .set_output_field('cities')
            .set_extractor(city_extractor)
        )
        result_doc = processor.extract(doc)
        first_city = result_doc['cities'][0]['result'][0]['value']
        self.assertEqual(first_city, u'orlando')
Exemplo n.º 13
0
    def test_posting_extractor(self):
        # The posting-date extractor is expected to produce '2016-07-07'
        # from the "online: Jul 07, 00:44" text.
        source = {'content': 'online: Jul 07, 00:44 \n  ', 'b': 'world'}

        date_extractor = PostingDateExtractor().set_metadata(
            {'extractor': 'posting_date'})
        processor = (
            ExtractorProcessor()
            .set_input_fields(['content'])
            .set_output_field('extracted')
            .set_extractor(date_extractor)
        )
        result_doc = processor.extract(source)
        posting_date = result_doc['extracted'][0]['result']['value']
        self.assertEqual(posting_date, '2016-07-07')
Exemplo n.º 14
0
    def test_ad_classifier(self):
        # Classify one ad's readability text using the unigram embeddings
        # and check the predicted ad type.
        embeddings = self.load_embeddings(UNIGRAM_FILE)
        doc = {"readability_text": "Massage in London SW1 | victoriatantric.co.uk | Tantric I added my business 'Voluptas Tantric Massage' to CityLocal, the premier business directory in Westminster If you'...  http://t.co/QYKzbNYgOY Voluptas Tantric Massage in Victoria is a great place to spend and hour or two. Luxury Apartment for that special London massage. Tantric massage London SW1"}
        extractor = digAdsClassifier.ads_classifier.AdsClassifier()
        extractor.set_embeddings(embeddings)
        extractor_processor = ExtractorProcessor().set_input_fields('readability_text').set_output_field('ad_type').set_extractor(extractor)

        updated_doc = extractor_processor.extract(doc)

        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(updated_doc['ad_type'][0]['value'], 'massage-parlor')
    def test_drug_use_extractor(self):
        # Without context, the first result's value is compared against the
        # full list of expected drug-use phrases.
        source = {'content': 'she is on drugs a dope w****s and a junkie who is on some heavy drugs and does drugs and an adict, smells bad', 'b': 'world'}

        drug_extractor = DrugUseExtractor().set_metadata({'extractor': 'drug'})
        processor = (
            ExtractorProcessor()
            .set_input_fields(['content'])
            .set_output_field('extracted')
            .set_extractor(drug_extractor)
        )
        result_doc = processor.extract(source)

        expected_phrases = [
            'dope w****s',
            'junkie',
            'adict',
            'on drugs',
            'on some heavy drugs',
            'does drugs',
        ]
        first_value = result_doc['extracted'][0]['result'][0]['value']
        self.assertEqual(first_value, expected_phrases)
Exemplo n.º 16
0
    def test_single_renamed_field_missing_extractor(self):
        # The input field 'a' is absent from the document, so no output
        # field should be written and existing fields must be preserved.
        doc = {'b': 'world'}
        e = SampleSingleRenamedFieldExtractor()
        ep = ExtractorProcessor().set_input_fields('a')\
                                 .set_output_field('e')\
                                 .set_extractor(e)
        updated_doc = ep.extract(doc)

        # assertNotIn reports the offending container on failure, unlike
        # assertTrue('e' not in ...).
        self.assertNotIn('e', updated_doc)
        self.assertEqual(updated_doc['b'], 'world')
Exemplo n.º 17
0
    def test_multiple_renamed_field_extractor(self):
        # Two input fields feed one extractor; the expected output is their
        # concatenation, and the inputs must remain unchanged.
        source = {'a': 'hello', 'b': 'world'}
        sample = SampleMultipleRenamedFieldExtractor()
        processor = (
            ExtractorProcessor()
            .set_input_fields(['a', 'b'])
            .set_output_field('e')
            .set_extractor(sample)
        )
        result_doc = processor.extract(source)

        self.assertEqual(result_doc['e'][0]['result']['value'], 'helloworld')
        self.assertEqual(result_doc['a'], 'hello')
        self.assertEqual(result_doc['b'], 'world')
Exemplo n.º 18
0
    def test_country_predictory_extractor(self):
        # Predict the country from a list of cities using the
        # city-to-country mapping loaded from city_to_country.json.
        city_to_country = self.load_file("city_to_country.json")
        doc = {
            "cities": ["seattle", "san francisco", "bogota", "minneapolis"]
        }  # load ad as dictionary
        e = CountryPredictorExtractor().set_city_to_country(city_to_country)
        ep = ExtractorProcessor().set_input_fields('cities').set_output_field(
            'country').set_extractor(e)

        updated_doc = ep.extract(doc)
        # assertEqual replaces the deprecated assertEquals alias; the
        # expected value is a plain list literal (list([...]) was redundant).
        self.assertEqual(updated_doc['country'][0]['value'], ['usa'])
Exemplo n.º 19
0
 def test_readability_extractor(self):
     # Extract readable text from the dig.html fixture and compare it with
     # the expected text stored in dig.txt.
     dig_html = self.load_file("dig.html")
     dig_text = self.load_file("dig.txt")
     doc = {"foo": dig_html}
     e = ReadabilityExtractor()
     ep = ExtractorProcessor().set_input_fields('foo')\
                              .set_output_field('extracted')\
                              .set_extractor(e)
     updated_doc = ep.extract(doc)
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(updated_doc['extracted'][0]['result']['value'],
                      dig_text)
    def test_price_extractor(self):
        # Three "roses" prices appear in the text; the expected value lists
        # each with its time unit plus a derived price_per_hour.
        source = {'content': 'Good morning I\'m doing incalls only gentleman I\'m quick 60 roses ?Hhr 80 roses ?Hour 120 roses unrushed and f.service provided nonnegotiable donations  614-563-3342', 'b': 'world'}

        price_extractor = PriceExtractor().set_metadata({'extractor': 'price'})
        processor = (
            ExtractorProcessor()
            .set_input_fields(['content'])
            .set_output_field('extracted')
            .set_extractor(price_extractor)
        )
        result_doc = processor.extract(source)

        expected = {
            'price': [
                {'price': '60', 'price_unit': 'rose', 'time_unit': 'hhr'},
                {'price': '80', 'price_unit': 'rose', 'time_unit': 'hour'},
                {'price': '120', 'price_unit': 'rose', 'time_unit': ''},
            ],
            'price_per_hour': '80',
        }
        self.assertEqual(result_doc['extracted'][0]['result']['value'],
                         expected)
Exemplo n.º 21
0
    def test_empty_tokens(self):
        # An empty document has no 'tokens' field, so extraction is expected
        # to leave the document empty.
        doc = {}
        extractor = socialmedia_id_extractor.SocialMediaIdExtractor()
        extractor_processor = ExtractorProcessor().set_input_fields(
            'tokens').set_output_field('social_media_ids').set_extractor(
                extractor)

        updated_doc = extractor_processor.extract(doc)

        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(updated_doc, {})
    def test_age_regex_extractor(self):
        # Two ages appear in the text; both are expected, in order.
        source = {'content': "32years old ,im 23", 'b': 'world'}

        age_extractor = get_age_regex_extractor()
        processor = (
            ExtractorProcessor()
            .set_input_fields('content')
            .set_output_field('extracted')
            .set_extractor(age_extractor)
        )
        result_doc = processor.extract(source)

        first = result_doc['extracted'][0]['result'][0]
        second = result_doc['extracted'][0]['result'][1]
        self.assertEqual(first['value'], '32')
        self.assertEqual(second['value'], '23')
Exemplo n.º 23
0
    def test_height_weight_extractor_empty(self):
        # The content lists "Height"/"Weight" labels but no height or weight
        # values, so no output field should be written.
        content = "\n TS RUBI: THE NAME SAYS IT ALL!  \n INCALL $250 OUTCALL $350 \n \n \n \n \n \n Gender \n Age \n Ethnicity \n Hair Color \n Eye Color \n Height \n Weight \n Measurements \n Affiliation \n Availability \n Available To \n \n \n \n \n Transsexual \n 27 \n Latino/Hispanic \n Brown \n Hazel \n 34C - 28\" - 34\" \n "
        source = {'content': content, 'b': 'world'}

        hw_extractor = HeightWeightExtractor().set_metadata(
            {'extractor': 'height_weight'})
        processor = (
            ExtractorProcessor()
            .set_input_fields(['content'])
            .set_output_field('extracted')
            .set_extractor(hw_extractor)
        )
        result_doc = processor.extract(source)

        self.assertNotIn("extracted", result_doc)
Exemplo n.º 24
0
    def test_cve_extractor(self):
        # Two CVE identifiers are embedded in the text; both are expected,
        # in order of appearance.
        sample_text = (
            'Sample cves are CVE-1993-1344 ascasdsadfjskdvnjdvn CVE-2006-1232')
        source = {'text': sample_text}

        cve_extractor = CveExtractor().set_metadata({'extractor': 'cve'})
        processor = (
            ExtractorProcessor()
            .set_input_fields(['text'])
            .set_output_field('extracted')
            .set_extractor(cve_extractor)
        )
        result_doc = processor.extract(source)
        hits = result_doc['extracted'][0]['result']
        self.assertEqual(hits[0]['value'], 'CVE-1993-1344')
        self.assertEqual(hits[1]['value'], 'CVE-2006-1232')
    def test_name_extractor(self):
        # Build a trie from the lower-cased names and sanity-check it:
        # 'barbara' maps to a string while the prefix 'bar' does not.
        names = self.load_file()
        t = populate_trie(map(lambda x: x.lower(), names))
        # NOTE: basestring only exists on Python 2.
        self.assertIsInstance(t.get('barbara'), basestring)
        self.assertNotIsInstance(t.get('bar'), basestring)

        doc = {"foo": ['bar', 'Barbara']}
        e = get_name_dictionary_extractor(t)
        ep = ExtractorProcessor().set_input_fields('foo').set_output_field(
            'names').set_extractor(e)

        updated_doc = ep.extract(doc)
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(updated_doc['names'][0]['result'][0]['value'],
                         'barbara')
Exemplo n.º 26
0
 def test_single_input_matching_multiple_extractions(self):
     # ep1 runs twice in the chain, so 'hp.word' holds two extractions;
     # ep2 reads them all through a wildcard path with flat-mapped inputs.
     source = {'a': 'hello', 'b': 'world'}
     single = SampleSingleRenamedFieldExtractor()
     ep1 = (ExtractorProcessor()
            .set_input_fields('a')
            .set_output_fields('hp.word')
            .set_extractor(single)
            .set_name("h"))
     flat_single = SampleFlatMappedSingleRenamedFieldExtractor()
     ep2 = (ExtractorProcessor()
            .set_input_fields(['hp.word[*].result.value'])
            .set_output_fields('hhhhh.word')
            .set_extractor(flat_single)
            .set_name("m")
            .set_flat_map_inputs(True))
     result_doc = execute_processor_chain(source, [ep1, ep1, ep2])
     combined = result_doc['hhhhh']['word'][0]['result']['value']
     self.assertEqual(combined, "hellohello")
    def test_age_extractor_context(self):
        # NOTE(review): despite its name, this test drives the extractor
        # returned by get_service_extractor(), with context enabled.
        tokens = ['FOV', 'HELLO', 'WORLD', 'hot', 'towel', 'treatment', 'other']
        source = {'content': tokens, 'b': 'world'}

        service = get_service_extractor().set_metadata({'extractor': 'service'})
        service.set_include_context(True)
        processor = (
            ExtractorProcessor()
            .set_input_fields('content')
            .set_output_field('extracted')
            .set_extractor(service)
        )
        result_doc = processor.extract(source)
        result = result_doc['extracted'][0]['result']

        # (value, context start, context end), in the expected result order.
        expected = [
            ('fov', 0, 1),
            ('hot towel treatment', 3, 6),
        ]
        for position, (value, start, end) in enumerate(expected):
            self.assertEqual(result[position]['value'], value)
            self.assertEqual(result[position]['context']['start'], start)
            self.assertEqual(result[position]['context']['end'], end)
Exemplo n.º 28
0
    def test_nationality_type_extractor(self):
        # Nationality extraction without context: the first three tokens are
        # expected back in lower case, in order.
        tokens = ['American', 'Antiguans', 'Panamanian', 'Hello', 'World']
        source = {'content': tokens, 'b': 'world'}

        nationality = get_nationality_extractor().set_metadata(
            {'extractor': 'nationality'})
        processor = (
            ExtractorProcessor()
            .set_input_fields('content')
            .set_output_field('extracted')
            .set_extractor(nationality)
        )
        result_doc = processor.extract(source)
        result = result_doc['extracted'][0]['result']

        expected_values = ['american', 'antiguans', 'panamanian']
        for position, value in enumerate(expected_values):
            self.assertEqual(result[position]['value'], value)
def tokenize(text, method='dig'):
    """Split ``text`` into word tokens.

    :param text: the string to tokenize.
    :param method: 'nltk' runs sent_tokenize and then word_tokenize on each
        sentence; 'dig' runs a TokenizerExtractor through an
        ExtractorProcessor.
    :return: the accumulated word tokens for 'nltk', the value of the first
        extraction result for 'dig', or an empty list when ``method`` is not
        recognised.
    """
    # Initialised up front: the 'nltk' branch extends this list, and an
    # unrecognised method now returns it empty. Previously the name was
    # never bound on those paths, which raised UnboundLocalError.
    word_tokens = list()

    if method == 'nltk':
        for s in sent_tokenize(text):
            word_tokens += word_tokenize(s)

    elif method == 'dig':
        doc = {'string': text}
        e = TokenizerExtractor()
        ep = ExtractorProcessor().set_input_fields('string').set_output_field(
            'output').set_extractor(e)
        updated_doc = ep.extract(doc)
        word_tokens = updated_doc['output'][0]['result'][0]['value']

    return word_tokens
Exemplo n.º 30
0
    def test_single_renamed_field_multiple_nested_outputs_list_extractor(self):
        # ep1 writes one input to two nested outputs ('f.a' and 'g.a');
        # ep2 then consumes ep1's 'f.a' output and writes to 'j.a'.
        source = {'a': 'hello', 'b': 'world'}
        multi_output = SampleSingleRenamedFieldMultipleOutputsExtractor()
        single = SampleSingleRenamedFieldExtractor()
        ep1 = (ExtractorProcessor()
               .set_input_fields('a')
               .set_output_fields({'f': 'f.a', 'g': 'g.a'})
               .set_extractor(multi_output)
               .set_name("mo"))
        ep2 = (ExtractorProcessor()
               .set_extractor_processor_inputs(ep1, 'f.a')
               .set_output_fields('j.a')
               .set_extractor(single)
               .set_name("so"))
        result_doc = execute_processor_chain(source, [ep1, ep2])

        for top_level in ('f', 'g', 'j'):
            self.assertEqual(
                result_doc[top_level]['a'][0]['result']['value'], 'hello')
        # The original input fields must be left untouched.
        self.assertEqual(result_doc['a'], 'hello')
        self.assertEqual(result_doc['b'], 'world')
Exemplo n.º 31
0
    def test_height_weight_extractor(self):
        # Run the extractor over the first 20 ground-truth documents and
        # compare against whichever of 'height'/'weight' each one provides.
        hw_extractor = HeightWeightExtractor().set_metadata(
            {'extractor': 'height_weight'})
        processor = (
            ExtractorProcessor()
            .set_input_fields(['content'])
            .set_output_field('extracted')
            .set_extractor(hw_extractor)
        )

        for truth in self.groundtruth_data[:20]:
            result_doc = processor.extract(truth)
            self.assertIn('extracted', result_doc)
            self.assertTrue(len(result_doc['extracted']) > 0)
            extraction = result_doc['extracted'][0]['result']['value']

            for measure in ('height', 'weight'):
                if measure not in truth:
                    continue
                self.assertIn(measure, extraction)
                self.assertEqual(extraction[measure], truth[measure])
Exemplo n.º 32
0
    def _tokenize_field(obj, field, method='dig'):
        """
        At present, we'll deal with only one field (e.g. readability_text). The field could be a unicode
        or a list, so make sure to take both into account.

        We are not preprocessing the tokens in any way. For this, I'll write another function.
        :param obj: the adultservice json object
        :param field: e.g. 'readability_text'
        :return: A list of tokens.
        """

        word_tokens = list()

        if (method == 'nltk'):
            list_of_sentences = list()

            if field not in obj:
                return None
            elif type(obj[field]) == list:
                k = list()
                k.append(obj[field])
                list_of_sentences += k
            else:
                tmp = list()
                tmp.append(obj[field])
                k = list()
                k.append(tmp)
                # print k
                list_of_sentences += k  # we are assuming this is a unicode/string
            for sentences in list_of_sentences:
                # print sentences
                for sentence in sentences:
                    for s in sent_tokenize(sentence):
                        word_tokens += word_tokenize(s)

        elif (method == 'dig'):
            doc = {'string': obj[field]}
            e = TokenizerExtractor()
            ep = ExtractorProcessor().set_input_fields(
                'string').set_output_field('output').set_extractor(e)
            updated_doc = ep.extract(doc)
            word_tokens = updated_doc['output'][0]['result'][0]['value']

        return word_tokens
Exemplo n.º 33
0
    def test_age_extractor(self):
        # Dictionary-based age extraction over a token list; the first two
        # reported values are expected to be '26' and '25'.
        tokens = [
            "Poster's", "age", "26", "Location", "Orlando", "Post", "ID",
            "12295358", "Date", "June", "25", "2015"
        ]
        source = {'content': tokens, 'b': 'world'}

        age_extractor = get_age_dictionary_extractor()
        processor = (
            ExtractorProcessor()
            .set_input_fields('content')
            .set_output_field('extracted')
            .set_extractor(age_extractor)
        )
        result_doc = processor.extract(source)

        first = result_doc['extracted'][0]['result'][0]
        second = result_doc['extracted'][0]['result'][1]
        self.assertEqual(first['value'], '26')
        self.assertEqual(second['value'], '25')