Exemplo n.º 1
0
def get_search_results(query, exact_match=False):
    """Search the module-level ``data`` rows and return the hits as JSON.

    The query is lower-cased and passed through ``join_bigrams.bigramar``.
    With ``exact_match`` the whole normalized query is a single search term;
    otherwise it is split on whitespace and every term is also tried in its
    pluralized and singularized forms.

    For each row, a term's level is the numerically smallest level returned
    by ``get_match_level`` over its variants, and the row's level is the
    largest level over all terms. Rows whose level equals
    ``consts.NOT_FOUND`` are dropped; the rest are returned ordered by level
    (0 first, then 1, then 2).

    :param query: the raw search string.
    :param exact_match: when true, match the normalized query as one term and
        skip the plural/singular variants.
    :return: a JSON string (``json.dumps``) of shallow row copies with the
        ``<column>_processed`` entries removed.
    """
    normalized = join_bigrams.bigramar(query.lower(), bigrams)
    terms = [normalized] if exact_match else normalized.split()

    # The variants depend only on the query, so build them once instead of
    # re-inflecting every term for every row.
    if exact_match:
        variants_per_term = [[term] for term in terms]
    else:
        variants_per_term = [
            [term, Inflector.pluralize(term), Inflector.singularize(term)]
            for term in terms
        ]

    buckets = [[], [], []]
    for row in data:
        # NOTE(review): with no terms (empty/whitespace query) this stays 0,
        # so every row lands in bucket 0 -- confirm that is intended.
        row_level = 0
        for variants in variants_per_term:
            term_level = min(
                [consts.NOT_FOUND]
                + [get_match_level(variant, row) for variant in variants]
            )
            row_level = max(row_level, term_level)
        if row_level != consts.NOT_FOUND:
            # Shallow copy so removing keys below does not touch ``data``.
            buckets[row_level].append(copy.copy(row))

    merged_results = buckets[0] + buckets[1] + buckets[2]

    # Helper entries to strip from the output; identical for every result.
    processed_keys = [
        column + '_processed'
        for level in range(3)
        for column in consts.SEARCH_COLUMNS[level]
    ]
    for result in merged_results:
        for key in processed_keys:
            result.pop(key)
    return json.dumps(merged_results)
Exemplo n.º 2
0
def process_spanish_owned():
    from inflector import Inflector, Spanish
    inflector = Inflector(Spanish)

    from nltk.stem import SnowballStemmer
    stemmer = SnowballStemmer("spanish")

    file_valid = open('valid_words.txt', "r")
    lines = file_valid.readlines()
    valid_words = lines[0].split(' ')
    print len(valid_words)
    file_valid.close()
    #valid_words = set(valid_words)
    owned_words = ['cúster', 'custer', 'cústers', 'custers', 'combi', 'combis', 'susana', 'villaran', 'villarán', 'castañeda']

    file = open("raw_words.txt", 'r')
    fileout = open("spanish_words_owned.txt", 'w')
    fout_sing = open("spanish_words_sing.txt", 'w')
    fout_stem = open("spanish_words_stem.txt", 'w')
    nline = 0

    for line in file:
        nline += 1
        words = line.split(' ')
        processed = []
        ini_line = True
        for word in words:
            if (word != '') & (word != '\n') & (word != 'servicio') & (word != 'servicio\n'):
                word = word.replace('\n', '')
                if (word in valid_words) | (word in owned_words):
                    processed.append(word)
                    if word != 'bus':
                        word_singular = inflector.singularize(word)
                        #word_singular = word_singular.replace(u'\xF3'.encode('utf-8'), 'o')
                    else:
                        word_singular = word
                    word_stemmed = stemmer.stem(word.decode('utf-8')).encode('utf-8')
                    if ini_line:
                        fileout.write(word)
                        fout_sing.write(word_singular)
                        fout_stem.write(word_stemmed)
                        ini_line = False
                    else:
                        fileout.write(' ' + word)
                        fout_sing.write(' ' + word_singular)
                        fout_stem.write(' ' + word_stemmed)
                    print nline, word, word_singular, word_stemmed
        fileout.write('\n')
        fout_sing.write('\n')
        fout_stem.write('\n')
    file.close()
    fileout.close()
    fout_sing.close()
    fout_stem.close()
Exemplo n.º 3
0
def count_noun(tagged_tokens):
    """
    Extract the nouns and count them, converting plurals to singular.

    A token is treated as a noun when its tag (index 1) contains "NN". The
    token text (index 0) is lower-cased and singularized before counting.
    Nouns that are bare punctuation/operators, that contain '.', ',' or '$',
    or that are empty are skipped.

    :param tagged_tokens: iterable of (token, tag) style items
    :return: plain dict mapping each singularized noun to its count
    """
    skipped_nouns = [".", ",", "$", "[", "]", ">", "<", "/*", "*/", "*", "+", "-", "=", "%"]
    # presumably characters that are problematic in MongoDB keys, going by
    # the original variable name -- confirm against the caller
    forbidden_chars = ['.', ',', '$']
    singularizer = Inflector(English)

    counts = {}
    for token in tagged_tokens:
        if "NN" not in token[1]:
            continue

        noun = singularizer.singularize(token[0].lower())

        if noun in skipped_nouns:
            continue
        if any(char in noun for char in forbidden_chars):
            continue
        if not noun:
            continue

        counts[noun] = counts.get(noun, 0) + 1

    return counts
Exemplo n.º 4
0
class EnglishInflectorTestCase(unittest.TestCase):
    """Check the English inflector in both directions against a fixed table."""

    # Expected singular -> plural pairs. The same table drives both tests, so
    # every pair must hold for pluralize() and, reversed, for singularize().
    singular_to_plural = {
        "search": "searches",
        "switch": "switches",
        "fix": "fixes",
        "box": "boxes",
        "process": "processes",
        "address": "addresses",
        "case": "cases",
        "stack": "stacks",
        "wish": "wishes",
        "fish": "fish",

        "category": "categories",
        "query": "queries",
        "ability": "abilities",
        "agency": "agencies",
        "movie": "movies",

        "archive": "archives",

        "index": "indices",

        "wife": "wives",
        "safe": "saves",
        "half": "halves",

        "move": "moves",

        "salesperson": "salespeople",
        "person": "people",

        "spokesman": "spokesmen",
        "man": "men",
        "woman": "women",

        "basis": "bases",
        "diagnosis": "diagnoses",

        "datum": "data",
        "medium": "media",
        "analysis": "analyses",

        "node_child": "node_children",
        "child": "children",

        "experience": "experiences",
        "day": "days",

        "comment": "comments",
        "foobar": "foobars",
        "newsletter": "newsletters",

        "old_news": "old_news",
        "news": "news",

        "series": "series",
        "species": "species",

        "quiz": "quizzes",

        "perspective": "perspectives",

        "ox": "oxen",
        "photo": "photos",
        "buffalo": "buffaloes",
        "tomato": "tomatoes",
        "dwarf": "dwarves",
        "elf": "elves",
        "information": "information",
        "equipment": "equipment",
        "bus": "buses",
        "status": "statuses",
        "mouse": "mice",

        "louse": "lice",
        "house": "houses",
        "octopus": "octopi",
        "virus": "viri",
        "alias": "aliases",
        "portfolio": "portfolios",

        "vertex": "vertices",
        "matrix": "matrices",

        "axis": "axes",
        "testis": "testes",
        "crisis": "crises",

        "rice": "rice",
        "shoe": "shoes",

        "horse": "horses",
        "prize": "prizes",
        "edge": "edges",
    }

    def setUp(self):
        """Create a fresh English inflector for each test."""
        self.inflector = Inflector(English)

    def tearDown(self):
        """Drop the inflector created in setUp."""
        self.inflector = None

    def test_pluralize(self):
        """pluralize() must map every singular in the table to its plural."""
        for singular, plural in self.singular_to_plural.items():
            actual = self.inflector.pluralize(singular)
            # assertEqual rather than a bare assert, so the check still runs
            # when Python is started with -O.
            self.assertEqual(
                actual, plural,
                'English Inflector pluralize(%s) should produce "%s" and NOT "%s"' % (
                    singular, plural, actual))

    def test_singularize(self):
        """singularize() must map every plural in the table to its singular."""
        for singular, plural in self.singular_to_plural.items():
            actual = self.inflector.singularize(plural)
            self.assertEqual(
                actual, singular,
                'English Inflector singularize(%s) should produce "%s" and NOT "%s"' % (
                    plural, singular, actual))
Exemplo n.º 5
0
class EnglishInflectorTestCase(unittest.TestCase):
    """Check the English inflector in both directions against a fixed table."""

    # Expected singular -> plural pairs. The same table drives both tests, so
    # every pair must hold for pluralize() and, reversed, for singularize().
    singular_to_plural = {
        "search": "searches",
        "switch": "switches",
        "fix": "fixes",
        "box": "boxes",
        "process": "processes",
        "address": "addresses",
        "case": "cases",
        "stack": "stacks",
        "wish": "wishes",
        "fish": "fish",
        "category": "categories",
        "query": "queries",
        "ability": "abilities",
        "agency": "agencies",
        "movie": "movies",
        "archive": "archives",
        "index": "indices",
        "wife": "wives",
        "safe": "saves",
        "half": "halves",
        "move": "moves",
        "salesperson": "salespeople",
        "person": "people",
        "spokesman": "spokesmen",
        "man": "men",
        "woman": "women",
        "basis": "bases",
        "diagnosis": "diagnoses",
        "datum": "data",
        "medium": "media",
        "analysis": "analyses",
        "node_child": "node_children",
        "child": "children",
        "experience": "experiences",
        "day": "days",
        "comment": "comments",
        "foobar": "foobars",
        "newsletter": "newsletters",
        "old_news": "old_news",
        "news": "news",
        "series": "series",
        "species": "species",
        "quiz": "quizzes",
        "perspective": "perspectives",
        "ox": "oxen",
        "photo": "photos",
        "buffalo": "buffaloes",
        "tomato": "tomatoes",
        "dwarf": "dwarves",
        "elf": "elves",
        "information": "information",
        "equipment": "equipment",
        "bus": "buses",
        "status": "statuses",
        "mouse": "mice",
        "louse": "lice",
        "house": "houses",
        "octopus": "octopi",
        "virus": "viri",
        "alias": "aliases",
        "portfolio": "portfolios",
        "vertex": "vertices",
        "matrix": "matrices",
        "axis": "axes",
        "testis": "testes",
        "crisis": "crises",
        "rice": "rice",
        "shoe": "shoes",
        "horse": "horses",
        "prize": "prizes",
        "edge": "edges",
    }

    def setUp(self):
        """Create a fresh English inflector for each test."""
        self.inflector = Inflector(English)

    def tearDown(self):
        """Drop the inflector created in setUp."""
        self.inflector = None

    def test_pluralize(self):
        """pluralize() must map every singular in the table to its plural."""
        for singular, plural in self.singular_to_plural.items():
            actual = self.inflector.pluralize(singular)
            # assertEqual rather than a bare assert, so the check still runs
            # when Python is started with -O.
            self.assertEqual(
                actual, plural,
                'English Inflector pluralize(%s) should produce "%s" and NOT "%s"' % (
                    singular, plural, actual))

    def test_singularize(self):
        """singularize() must map every plural in the table to its singular."""
        for singular, plural in self.singular_to_plural.items():
            actual = self.inflector.singularize(plural)
            self.assertEqual(
                actual, singular,
                'English Inflector singularize(%s) should produce "%s" and NOT "%s"' % (
                    plural, singular, actual))
Exemplo n.º 6
0
def get_question_type(q_word, question):
    """Guess the expected answer entity type(s) for a question.

    Both arguments are lower-cased and the question is passed through
    ``Inflector().singularize`` before matching. Matching is plain substring
    search on the resulting question text:

    * ``what`` / ``which``: look for "<what|which> <noun>" phrases. All
      "what ..." phrases are tried before any "which ..." phrase, and within
      each the categories are tried in the order LOCATION, DATE, PERCENT,
      ORGANIZATION, PERSON. The first hit wins.
    * ``how``: "how much", then "how long"/"how old", then
      "how many"/"how far".
    * ``where`` / ``when`` / ``who``: a fixed answer per question word.

    :param q_word: the question word (e.g. "what", "how").
    :param question: the full question text.
    :return: a new set of entity-type labels; ``{'O'}`` when nothing matches.
    """
    q_word = q_word.lower()
    question = question.lower()
    singularizer = Inflector()
    question = singularizer.singularize(question)

    # (nouns, labels) in priority order; shared by "what" and "which".
    noun_categories = (
        (('country', 'state', 'continental', 'place', 'city', 'province',
          'river', 'region', 'area', 'nationality', 'town', 'borough',
          'location'), ['LOCATION']),
        (('year', 'month', 'day', 'date'), ['DATE']),
        (('percentage', 'percent'), ['PERCENT']),
        (('company', 'group', 'organization', 'university', 'school',
          'team', 'program', 'party'), ['ORGANIZATION']),
        (('artist', 'actor', 'actress', 'doctor', 'president', 'person'),
         ['PERSON']),
    )
    # (phrases, labels) in priority order for "how" questions.
    how_categories = (
        (('how much',), ['MONEY', 'NUMBER']),
        (('how long', 'how old'), ['TIME', 'DURATION']),
        (('how many', 'how far'), ['NUMBER']),
    )
    # Question words answered without looking at the question text.
    fixed_answers = {
        'where': ['LOCATION', 'ORGANIZATION'],
        'when': ['DATE', 'TIME', 'DURATION'],
        'who': ['PERSON'],
    }

    if q_word in ('what', 'which'):
        # Both determiners are searched regardless of which one q_word is.
        for determiner in ('what', 'which'):
            for nouns, labels in noun_categories:
                if any(determiner + ' ' + noun in question for noun in nouns):
                    return set(labels)
    elif q_word == 'how':
        for phrases, labels in how_categories:
            if any(phrase in question for phrase in phrases):
                return set(labels)
    elif q_word in fixed_answers:
        return set(fixed_answers[q_word])

    return set(['O'])
Exemplo n.º 7
0
class SpanishInflectorTestCase(unittest.TestCase):
    """Check the Spanish inflector in both directions against a fixed table."""

    # Expected singular -> plural pairs. The same table drives both tests, so
    # every pair must hold for pluralize() and, reversed, for singularize().
    singular_to_plural = {
        "álbum": "álbumes",
        "almacén": "almacenes",
        "androide": "androides",
        "antifaz": "antifaces",
        "árbol": "árboles",
        "atlas": "atlas",
        "autobús": "autobuses",
        "base": "bases",
        "bebé": "bebés",
        "camión": "camiones",
        "casa": "casas",
        "ceutí": "ceutíes",
        "chimpancé": "chimpancés",
        "clan": "clanes",
        "compás": "compases",
        "convoy": "convoyes",
        "coxis": "coxis",
        "crisis": "crisis",
        "déficit": "déficits",
        "eje": "ejes",
        "espíritu": "espíritus",
        "flash": "flashes",
        "frac": "fracs",
        "gafas": "gafas",
        "hipótesis": "hipótesis",
        "inglés": "ingleses",
        "lápiz": "lápices",
        "luz": "luces",
        "montaje": "montajes",
        "no": "noes",
        "otitis": "otitis",
        "padre": "padres",
        "país": "países",
        "papá": "papás",
        "parking": "parkings",
        "portaequipaje": "portaequipajes",
        "radiocasete": "radiocasetes",
        "show": "shows",
        "si": "sis",
        "sí": "síes",
        "tabú": "tabúes",
        "tamiz": "tamices",
        "tanque": "tanques",
        "taxi": "taxis",
        "tijeras": "tijeras",
        "tren": "trenes",
        "virus": "virus",
    }

    def setUp(self):
        """Create a fresh Spanish inflector for each test."""
        self.inflector = Inflector(Spanish)

    def tearDown(self):
        """Drop the inflector created in setUp."""
        self.inflector = None

    def test_pluralize(self):
        """pluralize() must map every singular in the table to its plural."""
        # items() rather than iteritems(): it exists on Python 2 and 3.
        for singular, plural in self.singular_to_plural.items():
            inflector_pluralize = self.inflector.pluralize(singular)
            # assertEqual rather than a bare assert, so the check still runs
            # when Python is started with -O.
            self.assertEqual(
                inflector_pluralize, plural,
                'Spanish Inflector pluralize(%s) should produce "%s" and NOT "%s"' % (
                    singular, plural, inflector_pluralize))

    def test_singularize(self):
        """singularize() must map every plural in the table to its singular."""
        for singular, plural in self.singular_to_plural.items():
            inflector_singularize = self.inflector.singularize(plural)
            self.assertEqual(
                inflector_singularize, singular,
                'Spanish Inflector singularize(%s) should produce "%s" and NOT "%s"' % (
                    plural, singular, inflector_singularize))
Exemplo n.º 8
0
class SpanishInflectorTestCase(unittest.TestCase):
    """Check the Spanish inflector in both directions against a fixed table."""

    # Expected singular -> plural pairs. The same table drives both tests, so
    # every pair must hold for pluralize() and, reversed, for singularize().
    singular_to_plural = {
        "álbum": "álbumes",
        "almacén": "almacenes",
        "androide": "androides",
        "antifaz": "antifaces",
        "árbol": "árboles",
        "atlas": "atlas",
        "autobús": "autobuses",
        "base": "bases",
        "bebé": "bebés",
        "camión": "camiones",
        "casa": "casas",
        "ceutí": "ceutíes",
        "chimpancé": "chimpancés",
        "clan": "clanes",
        "compás": "compases",
        "convoy": "convoyes",
        "coxis": "coxis",
        "crisis": "crisis",
        "déficit": "déficits",
        "eje": "ejes",
        "espíritu": "espíritus",
        "flash": "flashes",
        "frac": "fracs",
        "gafas": "gafas",
        "hipótesis": "hipótesis",
        "inglés": "ingleses",
        "lápiz": "lápices",
        "luz": "luces",
        "montaje": "montajes",
        "no": "noes",
        "otitis": "otitis",
        "padre": "padres",
        "país": "países",
        "papá": "papás",
        "parking": "parkings",
        "portaequipaje": "portaequipajes",
        "radiocasete": "radiocasetes",
        "show": "shows",
        "si": "sis",
        "sí": "síes",
        "tabú": "tabúes",
        "tamiz": "tamices",
        "tanque": "tanques",
        "taxi": "taxis",
        "tijeras": "tijeras",
        "tren": "trenes",
        "virus": "virus",
    }

    def setUp(self):
        """Create a fresh Spanish inflector for each test."""
        self.inflector = Inflector(Spanish)

    def tearDown(self):
        """Drop the inflector created in setUp."""
        self.inflector = None

    def test_pluralize(self):
        """pluralize() must map every singular in the table to its plural."""
        # items() rather than iteritems(): it exists on Python 2 and 3.
        for singular, plural in self.singular_to_plural.items():
            inflector_pluralize = self.inflector.pluralize(singular)
            # assertEqual rather than a bare assert, so the check still runs
            # when Python is started with -O.
            self.assertEqual(
                inflector_pluralize, plural,
                'Spanish Inflector pluralize(%s) should produce "%s" and NOT "%s"' % (
                    singular, plural, inflector_pluralize))

    def test_singularize(self):
        """singularize() must map every plural in the table to its singular."""
        for singular, plural in self.singular_to_plural.items():
            inflector_singularize = self.inflector.singularize(plural)
            self.assertEqual(
                inflector_singularize, singular,
                'Spanish Inflector singularize(%s) should produce "%s" and NOT "%s"' % (
                    plural, singular, inflector_singularize))