def test_ngrams_words_name_extractor(self):
        """Check that the name dictionary extractor reports unigram and bigram hits.

        The extractor is configured with ngrams of 2 joined by a single space,
        and the test expects 'jean', 'marie' and 'jean marie' as the first
        three results, in that order.
        """
        names = self.load_file()
        t = populate_trie(map(lambda x: x.lower(), names))
        # 'barbara' is expected to resolve to a string in the trie, while the
        # bare prefix 'bar' is not.
        self.assertIsInstance(t.get('barbara'), basestring)
        self.assertNotIsInstance(t.get('bar'), basestring)

        doc = {
            "foo": [
                "at", "the", "market", "jean", "marie", "bought", "a", "loaf",
                "of", "bread"
            ]
        }
        e = get_name_dictionary_extractor(t)
        e.set_ngrams(2)
        e.set_joiner(' ')
        ep = ExtractorProcessor().set_input_fields('foo').set_output_field(
            'names').set_extractor(e)

        updated_doc = ep.extract(doc)
        result = updated_doc['names'][0]['result']
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(result[0]['value'], 'jean')
        self.assertEqual(result[1]['value'], 'marie')
        self.assertEqual(result[2]['value'], 'jean marie')
    def test_gender_extractor(self):
        """Run GenderExtractor on the 'content' field and check the extracted value."""
        doc = {'content': 'female ts male gender', 'b': 'world'}

        gender_extractor = GenderExtractor().set_metadata(
            {'extractor': 'gender'})
        processor = ExtractorProcessor()\
            .set_input_fields(['content'])\
            .set_output_field('extracted')\
            .set_extractor(gender_extractor)
        updated_doc = processor.extract(doc)
        # NOTE(review): 'extracted' is indexed directly by 'value' here, with no
        # list/'result' level -- confirm this matches ExtractorProcessor's output.
        extracted = updated_doc['extracted']
        self.assertEqual(extracted['value'], 'transgender')
    def test_drug_use_extractor_with_context(self):
        """Check drug-use matches and their context start/end values."""
        text = 'she is on drugs a dope w****s and a junkie who is on some heavy drugs and does drugs and an adict, smells bad'
        doc = {'content': text, 'b': 'world'}

        drug_extractor = DrugUseExtractor().set_metadata({'extractor': 'drug'})
        drug_extractor.set_include_context(True)
        processor = ExtractorProcessor()\
            .set_input_fields(['content'])\
            .set_output_field('extracted')\
            .set_extractor(drug_extractor)
        updated_doc = processor.extract(doc)
        matches = updated_doc['extracted'][0]['result']

        # (value, context start, context end) in the expected result order.
        expected = [
            ('dope w****s', 18, 29),
            ('junkie', 36, 42),
            ('adict', 92, 97),
            ('on drugs', 7, 15),
            ('on some heavy drugs', 50, 69),
            ('does drugs', 74, 84),
        ]
        for position, (value, start, end) in enumerate(expected):
            self.assertEqual(matches[position]['value'], value)
            self.assertEqual(matches[position]['context']['start'], start)
            self.assertEqual(matches[position]['context']['end'], end)
예제 #4
0
    def test_nationality_type_extractor_context(self):
        """Check nationality matches and their context start/end values."""
        tokens = [
            'American', 'Antiguans', 'Panamanian', 'Hello', 'World',
            'East', 'Timorese', 'kittian', 'and', 'nevisian', 'End',
        ]
        doc = {'content': tokens, 'b': 'world'}

        nationality_extractor = get_nationality_extractor()\
            .set_metadata({'extractor': 'nationality'})\
            .set_include_context(True)
        processor = ExtractorProcessor()\
            .set_input_fields('content')\
            .set_output_field('extracted')\
            .set_extractor(nationality_extractor)
        updated_doc = processor.extract(doc)
        matches = updated_doc['extracted'][0]['result']

        # (value, context start, context end) in the expected result order.
        expected = [
            ('american', 0, 1),
            ('antiguans', 1, 2),
            ('panamanian', 2, 3),
            ('east timorese', 5, 7),
            ('kittian and nevisian', 7, 10),
        ]
        for position, (value, start, end) in enumerate(expected):
            self.assertEqual(matches[position]['value'], value)
            self.assertEqual(matches[position]['context']['start'], start)
            self.assertEqual(matches[position]['context']['end'], end)
예제 #5
0
    def test_height_weight_extractor_separated(self):
        """Check that height and weight land in two separate output fields."""
        text = "\n TS RUBI: THE NAME SAYS IT ALL!  \n INCALL $250 OUTCALL $350 \n \n \n \n \n \n Gender \n Age \n Ethnicity \n Hair Color \n Eye Color \n Height \n Weight \n Measurements \n Affiliation \n Availability \n Available To \n \n \n \n \n Transsexual \n 27 \n Latino/Hispanic \n Brown \n Hazel \n 5'5\" \n 130 lb \n 34C - 28\" - 34\" \n "
        doc = {'content': text, 'b': 'world'}

        hw_extractor = HeightWeightExtractor().set_metadata(
            {'extractor': 'height_weight'})
        processor = ExtractorProcessor().set_input_fields(['content'])
        processor = processor.set_output_fields(['height', 'weight'])
        processor = processor.set_extractor(hw_extractor)
        updated_doc = processor.extract(doc)

        expected_height = {
            'foot': ['5\'5"'],
            'raw': [{'foot': 5, 'inch': 5}],
            'centimeter': [165],
        }
        expected_weight = {
            'raw': [{'pound': 130}],
            'pound': [130],
            'kilogram': [58],
        }
        self.assertEqual(updated_doc['height'][0]['result']['value'],
                         expected_height)
        self.assertEqual(updated_doc['weight'][0]['result']['value'],
                         expected_weight)
    def test_empty_price_extractor(self):
        """Check that no output field is added when the text has no price."""
        doc = {'content': 'something unrelated', 'b': 'world'}

        price_extractor = PriceExtractor().set_metadata({'extractor': 'price'})
        processor = ExtractorProcessor()\
            .set_input_fields(['content'])\
            .set_output_field('extracted')\
            .set_extractor(price_extractor)
        result_doc = processor.extract(doc)
        self.assertNotIn('extracted', result_doc)
예제 #7
0
 def test_state_extractor(self):
     """Check that the first state found in the token list is 'florida'."""
     tokens = ["", "ID#182730", "-", "florida", "LegendaryDave", "-", "Salt", "Lake", "-", "Gay", "Escorts", "&", "Gay", "Massage", "", "LegendaryDave", "2120.5", "Miles", "Away", "THE", "LEGENDARY", "DAVE-", "1ST", "TIME", "IN", "SALT", "LAKE", "CITY", "Over", "200", "Reviews!", "***Voted", "one", "of", "the", "Top", "50", "Escorts", "in", "the", "USA,", "for", "the", "last", "5", "years", "in", "a", "row***", "Secure,", "Masculine,", "Adventurous.", "Top/Vers", "Stud.", "Teacher", "to", "the", "New/Curious.", "Good-looking,", "Bi,", "Sense", "of", "Humor,", "Fun.", "Great", "Body.", "Educated,", "Honest,", "Sane.", "Healthy/DDF/Neg.", "AM,PM,", "24/7,", "Overnights,", "Multi-Days.", "Very", "Skilled:", "Massage", "or", "Vanilla", "or", "Kink", "or", "Wild.", "Discreet", "&", "Professional.", "Incalls", "&", "Outcalls:", "USA", "&", "Abroad.", "VISIT", "MY", "WEBSITE", "FOR", "RATES,", "MORE", "HOT", "PICS,", "ALL", "MY", "REVIEWS,", "HIRING", "TIPS", "&", "MORE.", "his", "stats", "AGE:", "45", "Role:", "Versatile", "HEIGHT:", "5'11\"", "(180cm)", "WEIGHT:", "150", "-", "170", "lbs", "(68", "-", "77", "kg)", "Piercings:", "Not", "Specified", "RACE:", "White", "HAIR", "COLOR:", "Dark", "Brown", "EYE", "COLOR:", "Green", "Open", "To", "LTR", ":", "Yes", "Languages:", "English,", "Other", "BODY", "TYPE:", "Muscular/Buff", "BODY", "HAIR:", "Moderately", "hairy", "Tattoos:", "Not", "Specified", "Smoker:", "Not", "Specified", "reviews", "\r", "M4RN", "Reviews:", "8", "Most", "Recent", "M4RN", "Star", "Review:", "04/30/2016", "", "services", "provided", "Escort", "yes", "HOT", "Massage", "yes", "Massage", "yes", "Registered", "Therapist", "yes", "In", "Calls", "yes", "Out", "Calls", "yes", "US", "Travel", "yes", "Int'l", "Travel", "yes", "Advertiser", "Since", "Dec", "'07", "contact", "info", "Phone:", "PREFERS", "PHONE", "CONTACT", "location", "2120.5", "Miles", "area:", "Salt", "Lake", "City", "/", "Ogden", "Local", "City:", "Salt", "Lake", "Postal", "Code:",
               "84101", "availability", "", "", "s", "m", "t", "w", "t", "f", "s", "7am-11am", "", "11am-3pm", "", "3pm-7pm", "", "7pm-11pm", "", "11pm-3am", "", "3am-7am", "", "", "Elite", "&", "Platinum", "Advertisers", "ELITE", "Toronto", "Barcelona", "San", "Francisco", "/", "Oakland", "Cleveland", "/", "Lorain", "/", "Elyria", "Los", "Angeles", "/", "West", "Hollywood", "Find", "LegendaryDave,", "Rent", "Men", "and", "Male", "Massage", "in", "Salt", "Lake", "", ""]
     doc = {'content': tokens}
     # Both dictionaries are lower-cased before being loaded into tries.
     state_trie = populate_trie(map(lambda s: s.lower(), self.load_file("states.json")))
     stop_word_trie = populate_trie(map(lambda s: s.lower(), self.load_file("stop_words.json")))
     state_extractor = get_city_dictionary_extractor(state_trie, stop_word_trie)
     processor = ExtractorProcessor()\
         .set_input_fields(['content'])\
         .set_output_field('state')\
         .set_extractor(state_extractor)
     updated_doc = processor.extract(doc)
     first_match = updated_doc['state'][0]['result'][0]
     self.assertEqual(first_match['value'], u'florida')
예제 #8
0
 def test_array_extractor(self):
     """Chain extractor processors, feeding one processor's output to others.

     NOTE(review): this test contains no assertions; as written it only
     exercises the calls.
     """
     doc = {'a': 'my name is foo', 'b': 'world'}
     renaming_extractor = SampleSingleRenamedFieldExtractor()
     first = ExtractorProcessor().set_input_fields('a')
     first = first.set_output_fields('g')
     first = first.set_extractor(renaming_extractor).set_name("no")
     updated_doc = first.extract(doc)

     # Second stage takes the first processor as its input.
     array_extractor = SampleSingleRenamedFieldExtractorArrayConverter()
     second = ExtractorProcessor().set_extractor_processor_inputs(first)
     second = second.set_output_field('e')
     second = second.set_extractor(array_extractor)
     second = second.set_name("oo")
     updated_doc = second.extract(updated_doc)

     # Third stage also reads from the first processor, but runs on `doc`.
     third = ExtractorProcessor().set_extractor_processor_inputs(first)
     third = third.set_output_fields('e')
     third = third.set_extractor(renaming_extractor).set_name("po")
     updated_doc = third.extract(doc)
    def test_age_extractor(self):
        """Check that the service extractor reports 'fov' as its first result.

        NOTE(review): despite the test name, the extractor under test comes
        from get_service_extractor().
        """
        doc = {'content': ['FOV', 'HELLO', 'WORLD'], 'b': 'world'}

        service_extractor = get_service_extractor().set_metadata(
            {'extractor': 'service'})
        processor = ExtractorProcessor()\
            .set_input_fields('content')\
            .set_output_field('extracted')\
            .set_extractor(service_extractor)
        updated_doc = processor.extract(doc)
        matches = updated_doc['extracted'][0]['result']
        self.assertEqual(matches[0]['value'], 'fov')
예제 #10
0
    def test_city_extractor(self):
        """Check that the first city found in the token list is 'orlando'."""
        tokens = ["orlando", "Teacher", "to", "the", "New/Curious.", "Good-looking,", "Bi,", "Sense", "of", "Humor,", "Fun.", "Great", "Body.", "Educated,", "Honest,", "Sane.", "Healthy/DDF/Neg.", "AM,PM,", "24/7,", "Overnights,", "Multi-Days.", "Very", "Skilled:", "Massage", "or", "Vanilla", "or", "Kink", "or", "Wild.", "Discreet", "&", "Professional.", "Incalls", "&", "Outcalls:Abroad.", "VISIT", "MY", "WEBSITE", "FOR", "RATES,", "MORE", "HOT", "PICS,", "ALL", "MY", "REVIEWS,", "HIRING", "TIPS", "&", "MORE.", "", "", "", "", "", "his", "stats", "", "", "", "", "AGE:", "", "45", "", "Role:", "", "", "Versatile", "", "HEIGHT:", "", "5'11\"", "(180cm)", "", "WEIGHT:", "", "150", "-", "170", "lbs", "(68", "-", "77", "kg)", "", "Piercings:", "", "Not", "Specified", "", "", "", "RACE:", "", "White", "", "HAIR", "COLOR:", "", "Dark", "Brown", "", "EYE", "COLOR:", "", "Green", "", "Open", "To", "", "LTR", ":", "", "Yes", "", "Languages:", "", "English,", "Other", "", "", "", "BODY", "TYPE:", "", "Muscular/Buff", "", "BODY", "HAIR:", "", "Moderately", "hairy", "", "Tattoos:", "", "Not", "Specified", "", "Smoker:", "", "Not", "Specified", "", "", "", "", "", "", "", "reviews", "", "", "\r", "", "M4RN", "Reviews:", "", "8", "", "Most", "Recent", "M4RN", "Star", "Review:", "", "04/30/2016", "", "", "", "", "", "", "", "", "", "", "services", "provided", "", "", "", "Escort", "", "yes", "", "HOT", "Massage", "", "yes", "", "Massage", "", "yes", "", "Registered", "Therapi"]
        doc = {'content': tokens}
        # Both dictionaries are lower-cased before being loaded into tries.
        city_trie = populate_trie(map(lambda c: c.lower(), self.load_file("cities.json")))
        stop_word_trie = populate_trie(map(lambda w: w.lower(), self.load_file("stop_words.json")))

        city_extractor = get_city_dictionary_extractor(city_trie, stop_word_trie)
        processor = ExtractorProcessor()\
            .set_input_fields(['content'])\
            .set_output_field('cities')\
            .set_extractor(city_extractor)
        updated_doc = processor.extract(doc)
        first_match = updated_doc['cities'][0]['result'][0]
        self.assertEqual(first_match['value'], u'orlando')
예제 #11
0
    def test_nested_field_filtered_extractor(self):
        """Check extraction from a nested field selected by a filter expression."""
        entries = [{'b': 'world', 'c': 'good'}, {'b': 'cup', 'c': 'bad'}]
        doc = {'a': entries}
        renaming_extractor = SampleSingleRenamedFieldExtractor()
        # The input path uses the filter 'a[?c=good].b'; 'world' is expected.
        processor = ExtractorProcessor().set_input_fields('a[?c=good].b')
        processor = processor.set_output_field('e')
        processor = processor.set_extractor(renaming_extractor)
        updated_doc = processor.extract(doc)

        first = updated_doc['e'][0]
        self.assertEqual(first['result']['value'], 'world')
예제 #12
0
    def test_ad_classifier(self):
        """Check that AdsClassifier labels the sample text as 'massage-parlor'."""
        embeddings = self.load_embeddings(UNIGRAM_FILE)
        doc = {"readability_text": "Massage in London SW1 | victoriatantric.co.uk | Tantric I added my business 'Voluptas Tantric Massage' to CityLocal, the premier business directory in Westminster If you'...  http://t.co/QYKzbNYgOY Voluptas Tantric Massage in Victoria is a great place to spend and hour or two. Luxury Apartment for that special London massage. Tantric massage London SW1"}
        extractor = digAdsClassifier.ads_classifier.AdsClassifier()
        extractor.set_embeddings(embeddings)
        extractor_processor = ExtractorProcessor().set_input_fields('readability_text').set_output_field('ad_type').set_extractor(extractor)

        updated_doc = extractor_processor.extract(doc)

        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(updated_doc['ad_type'][0]['value'], 'massage-parlor')
    def test_drug_use_extractor(self):
        """Check the list of drug-use phrases extracted without context."""
        text = 'she is on drugs a dope w****s and a junkie who is on some heavy drugs and does drugs and an adict, smells bad'
        doc = {'content': text, 'b': 'world'}

        drug_extractor = DrugUseExtractor().set_metadata({'extractor': 'drug'})
        processor = ExtractorProcessor()\
            .set_input_fields(['content'])\
            .set_output_field('extracted')\
            .set_extractor(drug_extractor)
        updated_doc = processor.extract(doc)

        expected_phrases = [
            'dope w****s',
            'junkie',
            'adict',
            'on drugs',
            'on some heavy drugs',
            'does drugs',
        ]
        self.assertEqual(updated_doc['extracted'][0]['result'][0]['value'],
                         expected_phrases)
예제 #14
0
    def test_posting_extractor(self):
        """Check that a posting date is extracted in YYYY-MM-DD form."""
        doc = {'content': 'online: Jul 07, 00:44 \n  ', 'b': 'world'}

        date_extractor = PostingDateExtractor().set_metadata(
            {'extractor': 'posting_date'})
        processor = ExtractorProcessor()\
            .set_input_fields(['content'])\
            .set_output_field('extracted')\
            .set_extractor(date_extractor)
        updated_doc = processor.extract(doc)
        # NOTE(review): the input text carries no year, yet 2016 is expected --
        # verify the result does not depend on the date the test is run.
        extracted_date = updated_doc['extracted'][0]['result']['value']
        self.assertEqual(extracted_date, '2016-07-07')
예제 #15
0
    def test_single_renamed_field_missing_extractor(self):
        """Check that nothing is written when the input field is absent.

        The document has no 'a' field, so the output field 'e' must not be
        created and the existing 'b' field must be left as it was.
        """
        doc = {'b': 'world'}
        e = SampleSingleRenamedFieldExtractor()
        ep = ExtractorProcessor().set_input_fields('a')\
                                 .set_output_field('e')\
                                 .set_extractor(e)
        updated_doc = ep.extract(doc)

        # assertNotIn reports the container contents on failure, unlike
        # assertTrue('e' not in ...).
        self.assertNotIn('e', updated_doc)
        self.assertEqual(updated_doc['b'], 'world')
    def test_price_extractor(self):
        """Check the prices, units and per-hour price extracted from an ad text."""
        text = 'Good morning I\'m doing incalls only gentleman I\'m quick 60 roses ?Hhr 80 roses ?Hour 120 roses unrushed and f.service provided nonnegotiable donations  614-563-3342'
        doc = {'content': text, 'b': 'world'}

        price_extractor = PriceExtractor().set_metadata({'extractor': 'price'})
        processor = ExtractorProcessor()\
            .set_input_fields(['content'])\
            .set_output_field('extracted')\
            .set_extractor(price_extractor)
        updated_doc = processor.extract(doc)

        expected = {
            'price': [
                {'price': '60', 'price_unit': 'rose', 'time_unit': 'hhr'},
                {'price': '80', 'price_unit': 'rose', 'time_unit': 'hour'},
                {'price': '120', 'price_unit': 'rose', 'time_unit': ''},
            ],
            'price_per_hour': '80',
        }
        self.assertEqual(updated_doc['extracted'][0]['result']['value'],
                         expected)
예제 #17
0
    def test_multiple_renamed_field_extractor(self):
        """Check that two input fields are combined into one output value."""
        doc = {'a': 'hello', 'b': 'world'}
        multi_extractor = SampleMultipleRenamedFieldExtractor()
        processor = ExtractorProcessor().set_input_fields(['a', 'b'])
        processor = processor.set_output_field('e')
        processor = processor.set_extractor(multi_extractor)
        updated_doc = processor.extract(doc)

        combined = updated_doc['e'][0]['result']['value']
        self.assertEqual(combined, 'helloworld')
        # The input fields must still hold their original values.
        self.assertEqual(updated_doc['a'], 'hello')
        self.assertEqual(updated_doc['b'], 'world')
예제 #18
0
    def test_country_predictory_extractor(self):
        """Check that the country predicted from a list of cities is ['usa']."""
        city_to_country = self.load_file("city_to_country.json")
        doc = {
            "cities": ["seattle", "san francisco", "bogota", "minneapolis"]
        }
        e = CountryPredictorExtractor().set_city_to_country(city_to_country)
        ep = ExtractorProcessor().set_input_fields('cities').set_output_field(
            'country').set_extractor(e)

        updated_doc = ep.extract(doc)
        # assertEqual replaces the deprecated assertEquals alias; the expected
        # value is written as a plain list literal.
        self.assertEqual(updated_doc['country'][0]['value'], ['usa'])
예제 #19
0
 def test_readability_extractor(self):
     """Check that ReadabilityExtractor turns dig.html into the text of dig.txt."""
     dig_html = self.load_file("dig.html")
     dig_text = self.load_file("dig.txt")
     doc = {"foo": dig_html}
     e = ReadabilityExtractor()
     ep = ExtractorProcessor().set_input_fields('foo')\
                              .set_output_field('extracted')\
                              .set_extractor(e)
     updated_doc = ep.extract(doc)
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(updated_doc['extracted'][0]['result']['value'],
                      dig_text)
예제 #20
0
    def test_empty_tokens(self):
        """Check that an empty document comes back empty from the processor.

        The input field 'tokens' is absent, so no 'social_media_ids' output
        is expected.
        """
        doc = {}
        extractor = socialmedia_id_extractor.SocialMediaIdExtractor()
        extractor_processor = ExtractorProcessor().set_input_fields(
            'tokens').set_output_field('social_media_ids').set_extractor(
                extractor)

        updated_doc = extractor_processor.extract(doc)

        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(updated_doc, {})
    def test_age_regex_extractor(self):
        """Check that the regex age extractor finds both ages, in order."""
        doc = {'content': "32years old ,im 23", 'b': 'world'}

        age_extractor = get_age_regex_extractor()
        processor = ExtractorProcessor()\
            .set_input_fields('content')\
            .set_output_field('extracted')\
            .set_extractor(age_extractor)
        updated_doc = processor.extract(doc)

        first = updated_doc['extracted'][0]['result'][0]
        second = updated_doc['extracted'][0]['result'][1]
        self.assertEqual(first['value'], '32')
        self.assertEqual(second['value'], '23')
    def test_name_extractor(self):
        """Check that the name dictionary extractor matches 'Barbara' but not 'bar'."""
        names = self.load_file()
        t = populate_trie(map(lambda x: x.lower(), names))
        # 'barbara' is expected to resolve to a string in the trie, while the
        # bare prefix 'bar' is not.
        self.assertIsInstance(t.get('barbara'), basestring)
        self.assertNotIsInstance(t.get('bar'), basestring)

        doc = {"foo": ['bar', 'Barbara']}
        e = get_name_dictionary_extractor(t)
        ep = ExtractorProcessor().set_input_fields('foo').set_output_field(
            'names').set_extractor(e)

        updated_doc = ep.extract(doc)
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(updated_doc['names'][0]['result'][0]['value'],
                         'barbara')
예제 #23
0
    def test_cve_extractor(self):
        """Check that both CVE identifiers are extracted from the text, in order."""
        text = 'Sample cves are CVE-1993-1344 ascasdsadfjskdvnjdvn CVE-2006-1232'
        doc = {'text': text}

        cve_extractor = CveExtractor().set_metadata({'extractor': 'cve'})
        processor = ExtractorProcessor().set_input_fields(['text'])
        processor = processor.set_output_field('extracted')
        processor = processor.set_extractor(cve_extractor)
        updated_doc = processor.extract(doc)

        matches = updated_doc['extracted'][0]['result']
        for position, cve_id in enumerate(['CVE-1993-1344', 'CVE-2006-1232']):
            self.assertEqual(matches[position]['value'], cve_id)
예제 #24
0
    def test_height_weight_extractor_empty(self):
        """Check that no output field is added when no height/weight is found."""
        text = "\n TS RUBI: THE NAME SAYS IT ALL!  \n INCALL $250 OUTCALL $350 \n \n \n \n \n \n Gender \n Age \n Ethnicity \n Hair Color \n Eye Color \n Height \n Weight \n Measurements \n Affiliation \n Availability \n Available To \n \n \n \n \n Transsexual \n 27 \n Latino/Hispanic \n Brown \n Hazel \n 34C - 28\" - 34\" \n "
        doc = {'content': text, 'b': 'world'}

        hw_extractor = HeightWeightExtractor().set_metadata(
            {'extractor': 'height_weight'})
        processor = ExtractorProcessor()\
            .set_input_fields(['content'])\
            .set_output_field('extracted')\
            .set_extractor(hw_extractor)
        result_doc = processor.extract(doc)

        self.assertNotIn("extracted", result_doc)
    def test_age_extractor_context(self):
        """Check service matches and their context start/end values.

        NOTE(review): despite the test name, the extractor under test comes
        from get_service_extractor().
        """
        tokens = ['FOV', 'HELLO', 'WORLD', 'hot', 'towel', 'treatment', 'other']
        doc = {'content': tokens, 'b': 'world'}

        service_extractor = get_service_extractor().set_metadata(
            {'extractor': 'service'})
        service_extractor.set_include_context(True)
        processor = ExtractorProcessor()\
            .set_input_fields('content')\
            .set_output_field('extracted')\
            .set_extractor(service_extractor)
        updated_doc = processor.extract(doc)
        matches = updated_doc['extracted'][0]['result']

        # (value, context start, context end) in the expected result order.
        expected = [
            ('fov', 0, 1),
            ('hot towel treatment', 3, 6),
        ]
        for position, (value, start, end) in enumerate(expected):
            self.assertEqual(matches[position]['value'], value)
            self.assertEqual(matches[position]['context']['start'], start)
            self.assertEqual(matches[position]['context']['end'], end)
예제 #26
0
    def test_nationality_type_extractor(self):
        """Check that three nationalities are extracted, lower-cased, in order."""
        tokens = ['American', 'Antiguans', 'Panamanian', 'Hello', 'World']
        doc = {'content': tokens, 'b': 'world'}

        nationality_extractor = get_nationality_extractor().set_metadata(
            {'extractor': 'nationality'})
        processor = ExtractorProcessor()\
            .set_input_fields('content')\
            .set_output_field('extracted')\
            .set_extractor(nationality_extractor)
        updated_doc = processor.extract(doc)

        matches = updated_doc['extracted'][0]['result']
        expected_values = ['american', 'antiguans', 'panamanian']
        for position, value in enumerate(expected_values):
            self.assertEqual(matches[position]['value'], value)
def tokenize(text, method='dig'):
    """Split text into word tokens.

    :param text: the string to tokenize
    :param method: 'nltk' to split with sent_tokenize/word_tokenize, or
        'dig' (default) to run a TokenizerExtractor through an
        ExtractorProcessor
    :return: the word tokens; an empty list when method is not recognised
    """
    # The accumulator must carry the name that is extended and returned
    # below; otherwise the 'nltk' branch and any unknown method raise
    # UnboundLocalError.
    word_tokens = list()

    if method == 'nltk':
        for sentence in sent_tokenize(text):
            word_tokens.extend(word_tokenize(sentence))

    elif method == 'dig':
        doc = {'string': text}
        e = TokenizerExtractor()
        ep = ExtractorProcessor().set_input_fields('string').set_output_field(
            'output').set_extractor(e)
        updated_doc = ep.extract(doc)
        word_tokens = updated_doc['output'][0]['result'][0]['value']

    return word_tokens
예제 #28
0
    def test_height_weight_extractor(self):
        """Compare extracted height/weight with the first 20 ground-truth docs."""
        hw_extractor = HeightWeightExtractor().set_metadata(
            {'extractor': 'height_weight'})
        processor = ExtractorProcessor()\
            .set_input_fields(['content'])\
            .set_output_field('extracted')\
            .set_extractor(hw_extractor)

        for sample in self.groundtruth_data[:20]:
            updated_doc = processor.extract(sample)
            self.assertIn('extracted', updated_doc)
            self.assertTrue(len(updated_doc['extracted']) > 0)
            extraction = updated_doc['extracted'][0]['result']['value']

            # Only measurements present in the ground-truth doc are compared.
            for measure in ('height', 'weight'):
                if measure in sample:
                    self.assertIn(measure, extraction)
                    self.assertEqual(extraction[measure], sample[measure])
예제 #29
0
    def _tokenize_field(obj, field, method='dig'):
        """
        Tokenize the contents of one field of obj.

        At present, we deal with only one field (e.g. readability_text). The
        field could be a unicode or a list, so both are taken into account.

        The tokens are not preprocessed in any way.
        :param obj: the adultservice json object
        :param field: e.g. 'readability_text'
        :param method: 'nltk' to split with sent_tokenize/word_tokenize, or
            'dig' (default) to run a TokenizerExtractor through an
            ExtractorProcessor
        :return: A list of tokens, or None when field is missing from obj for
            either known method. An unrecognised method yields an empty list.
        """

        word_tokens = list()

        if method == 'nltk':
            if field not in obj:
                return None
            value = obj[field]
            # A list is treated as a list of strings; anything else is assumed
            # to be a single unicode/string and wrapped so both share one loop.
            sentences = value if isinstance(value, list) else [value]
            for sentence in sentences:
                for s in sent_tokenize(sentence):
                    word_tokens += word_tokenize(s)

        elif method == 'dig':
            # Mirror the 'nltk' branch: a missing field yields None instead
            # of a KeyError.
            if field not in obj:
                return None
            doc = {'string': obj[field]}
            e = TokenizerExtractor()
            ep = ExtractorProcessor().set_input_fields(
                'string').set_output_field('output').set_extractor(e)
            updated_doc = ep.extract(doc)
            word_tokens = updated_doc['output'][0]['result'][0]['value']

        return word_tokens
예제 #30
0
    def test_age_extractor(self):
        """Check the first two values reported by the age dictionary extractor."""
        tokens = [
            "Poster's", "age", "26", "Location", "Orlando", "Post", "ID",
            "12295358", "Date", "June", "25", "2015",
        ]
        doc = {'content': tokens, 'b': 'world'}

        age_extractor = get_age_dictionary_extractor()
        processor = ExtractorProcessor()\
            .set_input_fields('content')\
            .set_output_field('extracted')\
            .set_extractor(age_extractor)
        updated_doc = processor.extract(doc)

        first = updated_doc['extracted'][0]['result'][0]
        second = updated_doc['extracted'][0]['result'][1]
        self.assertEqual(first['value'], '26')
        self.assertEqual(second['value'], '25')
    def test_business_type_extractor(self):
        """Check that the first business type extracted from the ad is 'escort'."""
        # Adjacent string literals are concatenated into one text.
        text = (
            "  DOWNTOWN NEW HOT ASIAN ANGEL 100% REAL PICS Sexy Face Young  - 19      - Click to save or unsave                                                    Posted:  4 months ago                                        Age:  19                                   Category:   Phoenix Escorts                    Hi, Guys,   My Name is Lydia, and I new to the area,   I am a Korean and Spanish mix college student.only paritime, I am 20yrs, 5'4, 34D-23-35.   You will enjoy our time together guaranteed. 100% me.. *I'm "
            "the REAL deal"
            "* I'm a Sweet, FUN playmate that knows how to have a good time!   Never Say No!! Never Rush!!  Call or Txt: 929-272-7898, TXT: 647-687-7096, Wechat: aa5854660383                        (929) 272-7898                      |  929.272.7898                      |  929-272-7898                      |  (929)272-7898                      |  9292727898     Flag this ad     Hi, Guys, My Name is Lydia, and I new to the area, I am a Korean and Spanish mix college student.only paritime, I am 20yrs, 5'4, 34D-23-35. You will enjoy our time together guaranteed. 100% me.. *I'm "
            "the REAL deal"
            "* I'm a Sweet, FUN playmate that knows how to have a good time! Never Say No!! Never Rush!! Call or Txt: 929-272-7898, TXT: 647-687-7096, Wechat: aa5854660383 DOWNTOWN NEW HOT ASIAN ANGEL 100% REAL PICS Sexy Face Young  - 19 DOWNTOWN NEW HOT ASIAN ANGEL 100% REAL PICS Sexy Face Young  - 19 - A Sexy Service.com"
        )
        doc = {'content': text, 'b': 'world'}

        business_extractor = BusinessTypeExtractor().set_metadata(
            {'extractor': 'business_type'})
        processor = ExtractorProcessor().set_input_fields(['content'])
        processor = processor.set_output_field('extracted')
        processor = processor.set_extractor(business_extractor)
        updated_doc = processor.extract(doc)

        matches = updated_doc['extracted'][0]['result']
        self.assertEqual(matches[0]['value'], 'escort')
예제 #32
0
    def test_socialmedia_id_extractor(self):
        """Check that a Twitter handle is extracted, lower-cased, from tokens."""
        doc = {
            "tokens": [
                "adair", "location", ":", "escorts", "missouri", "escorts",
                "kansas", "city", "escorts", "adair", "my", "information",
                "follow", "me", "on", "twitter", "@", "DiamondSquirt",
                "location", ":", "kansas", "city", "escorts", "type", ":"
            ]
        }

        extractor = socialmedia_id_extractor.SocialMediaIdExtractor()
        extractor_processor = ExtractorProcessor().set_input_fields(
            'tokens').set_output_field('social_media_ids').set_extractor(
                extractor)

        updated_doc = extractor_processor.extract(doc)

        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(
            updated_doc['social_media_ids'][0]['result']['value'],
            {'twitter': 'diamondsquirt'})
예제 #33
0
    def test_multiple_renamed_field_with_multiple_values_extractor(self):
        """Check that a scalar field is combined with each value of a list field."""
        places = [{'c': 'world'}, {'c': 'brooklyn'}, {'c': 'new york'}]
        doc = {'a': 'hello', 'b': places}
        multi_extractor = SampleMultipleRenamedFieldExtractor()
        # 'b[*].c' selects the 'c' value of every entry under 'b'.
        processor = ExtractorProcessor().set_input_fields(['a', 'b[*].c'])
        processor = processor.set_output_field('e')
        processor = processor.set_extractor(multi_extractor)
        updated_doc = processor.extract(doc)

        expected_values = ['helloworld', 'hellobrooklyn', 'hellonew york']
        for position, combined in enumerate(expected_values):
            self.assertEqual(updated_doc['e'][position]['result']['value'],
                             combined)
        self.assertEqual(updated_doc['a'], 'hello')