def get_all_words(example_result_tuples):
    """
    Merge the stemmed models of all example sentences into one model.

    Each item of ``example_result_tuples`` is indexed with the key
    'sentence'; that text is wrapped in a Sentence (stopwords removed)
    and its ``stemmed_model`` is added to a single accumulating Model.
    The accumulated Model is normalized and returned.
    """
    combined = Model(True, need_stem=True)

    for entry in example_result_tuples:
        sentence = Sentence(entry['sentence'], remove_stopwords=True)
        combined += sentence.stemmed_model

    combined.normalize()
    return combined
Пример #2
0
def get_all_words(example_result_tuples):
    """
    Build one normalized Model from the sentences of every result tuple.

    For each tuple the value under 'sentence' is turned into a Sentence
    with stopwords removed, and that Sentence's ``stemmed_model`` is
    added into the running Model. The result is normalized before it is
    returned.
    """
    merged_model = Model(True, need_stem=True)

    for result in example_result_tuples:
        text = result['sentence']
        merged_model += Sentence(text, remove_stopwords=True).stemmed_model

    merged_model.normalize()
    return merged_model
def get_single_model(candidate_dir):
    """
    Load a normalized Model for every entry of every JSON file in a directory.

    Only the files directly inside ``candidate_dir`` are read;
    sub-directories are not descended into.

    candidate_dir: path of the directory that holds the JSON files. Each
        file is expected to contain a JSON object; every value is passed
        to Model as ``text_dict``.

    Returns a dict ``{file_name: {key: Model}}`` with one normalized
    Model per top-level key of the file's JSON object.

    Raises StopIteration if ``candidate_dir`` cannot be walked (for
    example when it does not exist), as the original code did.
    """
    candidate_models = {}
    # The first triple produced by os.walk describes candidate_dir itself;
    # index 2 is the list of plain files in it. next() works on both
    # Python 2 and Python 3, unlike the generator's .next() method.
    files = next(os.walk(candidate_dir))[2]
    for a_file in files:
        # Close the file deterministically instead of leaking the handle.
        with open(os.path.join(candidate_dir, a_file)) as handle:
            temp_model = json.load(handle)

        per_file = {}
        # Keys of a JSON object are already unique, so no membership test is
        # needed. The previous test looked in the outer dict (keyed by file
        # name) and therefore dropped any key that matched a file name.
        for w in temp_model:
            temp = Model(True,
                         text_dict=temp_model[w],
                         need_stem=True,
                         input_stemmed=True)
            temp.normalize()
            per_file[w] = temp
        candidate_models[a_file] = per_file

    return candidate_models
Пример #4
0
def get_all_words(tuple_results):
    """
    Sum per-identifier normalized word weights into one dict.

    For every identifier in ``tuple_results`` a Model is built from the
    stemmed models of that identifier's sentences (stopwords removed)
    and normalized. The values of each such model are then added up per
    word across all identifiers.

    Returns a dict mapping word -> summed value.
    """
    totals = {}
    for identifier in tuple_results:
        per_id_model = Model(True, need_stem=True)
        for entry in tuple_results[identifier]:
            sentence = Sentence(entry['sentence'], remove_stopwords=True)
            per_id_model += sentence.stemmed_model
        per_id_model.normalize()

        for word, weight in per_id_model.model.items():
            totals[word] = totals.get(word, 0) + weight
    return totals
Пример #5
0
def get_verbs(tuple_result):
    """
    Sum per-identifier normalized verb weights into one dict.

    For every identifier a Model is filled with the 'verb' value of each
    of its tuples, skipping verbs contained in NO_NEED, and then
    normalized. The values of these models are added up per verb across
    all identifiers.

    Returns a dict mapping verb -> summed value.
    """
    totals = {}
    for identifier in tuple_result:
        per_id_model = Model(True)
        for entry in tuple_result[identifier]:
            candidate = entry['verb']
            if candidate in NO_NEED:
                continue
            per_id_model.update(text_list=[candidate])
        per_id_model.normalize()

        for verb, weight in per_id_model.model.items():
            totals[verb] = totals.get(verb, 0) + weight
    return totals
Пример #6
0
def get_model_for_entities(source_dir):
    """
    Accumulate one Model per entity type from the JSON files of a directory.

    Only the files directly inside ``source_dir`` are read. Each file is
    expected to hold a JSON object of the form
    ``{entity_type: {entity: text_dict}}``. For every entity a normalized
    Model is built from its ``text_dict`` and added to the Model of its
    entity type. The accumulated per-type Model is not normalized again
    after the additions.

    Returns a dict mapping entity_type -> accumulated Model.

    Raises StopIteration if ``source_dir`` cannot be walked (for example
    when it does not exist), as the original code did.
    """
    models = {}
    # next() is valid on Python 2 and 3; the generator's .next() method
    # only exists on Python 2. Index 2 of the first os.walk triple is the
    # list of plain files in source_dir.
    for instance in next(os.walk(source_dir))[2]:
        # Close the file deterministically instead of leaking the handle.
        with open(os.path.join(source_dir, instance)) as handle:
            data = json.load(handle)

        for entity_type in data:
            if entity_type not in models:
                # Start each entity type from an empty (normalized) Model.
                temp = Model(True, need_stem=True)
                temp.normalize()
                models[entity_type] = temp
            for entity in data[entity_type]:
                temp = Model(True,
                             text_dict=data[entity_type][entity],
                             need_stem=True,
                             input_stemmed=True)
                temp.normalize()
                models[entity_type] += temp
    return models
Пример #7
0
def get_models(results,index_dir):
    """
    Build a normalized term model for every (qid, day) pair.

    ``results`` is indexed as ``results[qid][day]`` and yields ids; the
    text of each id is fetched with ``get_text(index_dir, tid)`` and,
    when it is truthy, fed into the day's Model. After normalization the
    Model's ``model`` attribute is stored.

    Returns a dict ``{qid: {day: model}}``.
    """
    models = {}
    for qid in results:
        per_day = models.setdefault(qid, {})
        days = results[qid]
        for day in days:
            day_model = Model(True, need_stem=True)
            for tid in days[day]:
                text = get_text(index_dir, tid)
                if not text:
                    continue
                day_model.update(text_string=text)
            day_model.normalize()
            per_day[day] = day_model.model

    return models
Пример #8
0
def get_single_model(candidate_dir):
    """
    Build ``{file_name: {key: Model}}`` from the JSON files of a directory.

    Only the files directly inside ``candidate_dir`` are read;
    sub-directories are ignored. Each file is expected to contain a JSON
    object whose values are passed to Model as ``text_dict``; every
    resulting Model is normalized.

    Raises StopIteration if ``candidate_dir`` cannot be walked (for
    example when it does not exist), as the original code did.
    """
    candidate_models = {}
    # Built-in next() runs on Python 2 and 3 (generator.next() is
    # Python 2 only). The first os.walk triple is candidate_dir itself and
    # its third element lists the plain files.
    files = next(os.walk(candidate_dir))[2]
    for a_file in files:
        # Use a context manager so the file handle is always closed.
        with open(os.path.join(candidate_dir, a_file)) as handle:
            temp_model = json.load(handle)

        file_models = {}
        # No membership test here: JSON object keys are unique already, and
        # the previous test against the outer dict (keyed by file name)
        # silently dropped any key that happened to equal a file name.
        for w in temp_model:
            temp = Model(True,
                         text_dict=temp_model[w],
                         need_stem=True,
                         input_stemmed=True)
            temp.normalize()
            file_models[w] = temp
        candidate_models[a_file] = file_models

    return candidate_models
Пример #9
0
def get_model_for_entities(source_dir):
    """
    Build one accumulated Model per entity type from a directory of JSON.

    Only the files directly inside ``source_dir`` are read. Each file is
    expected to hold ``{entity_type: {entity: text_dict}}``. Every
    entity's ``text_dict`` becomes a normalized Model that is added to
    the Model of its entity type; the per-type sum itself is not
    normalized again.

    Returns a dict mapping entity_type -> accumulated Model.

    Raises StopIteration if ``source_dir`` cannot be walked (for example
    when it does not exist), as the original code did.
    """
    models = {}
    # Built-in next() runs on Python 2 and 3; generator.next() is
    # Python 2 only. Element 2 of the first os.walk triple is the list of
    # plain files in source_dir.
    file_names = next(os.walk(source_dir))[2]
    for instance in file_names:
        # Use a context manager so the file handle is always closed.
        with open(os.path.join(source_dir, instance)) as handle:
            data = json.load(handle)

        for entity_type in data:
            if entity_type not in models:
                # Seed the entity type with an empty (normalized) Model.
                temp = Model(True, need_stem=True)
                temp.normalize()
                models[entity_type] = temp
            for entity in data[entity_type]:
                temp = Model(True,
                             text_dict=data[entity_type][entity],
                             need_stem=True,
                             input_stemmed=True)
                temp.normalize()
                models[entity_type] += temp
    return models
Пример #10
0
def get_all_verbs(example_result_tuples):
    """
    Build a normalized Model over the verbs of the given tuples.

    Each tuple is indexed with 'verb' and 'verb_label'. A verb whose
    label is not 'VB' is first lemmatized as a verb with
    WordNetLemmatizer; the resulting word is added to the Model as a
    string.

    If Model.update raises TypeError the offending word, its type and
    the tuple are printed and the process exits with status 1.

    Returns the normalized Model.
    """
    verb_model = Model(True, need_stem=True)
    # Created on first use and then reused, instead of building a new
    # lemmatizer for every non-'VB' tuple.
    lemmatizer = None

    for single_tuple in example_result_tuples:
        word = single_tuple['verb']
        if single_tuple['verb_label'] != 'VB':
            if lemmatizer is None:
                lemmatizer = WordNetLemmatizer()
            word = lemmatizer.lemmatize(word, 'v')
        try:
            verb_model.update(text_list=[str(word)])
        except TypeError:
            # Single-argument print(...) gives the same output on
            # Python 2 and Python 3.
            print("Wrong Word!")
            print(word)
            print(type(word))
            print(single_tuple)
            # Non-zero status: this is a failure, not a clean shutdown.
            sys.exit(1)
    verb_model.normalize()

    return verb_model
def get_all_verbs(example_result_tuples):
    """
    Collect the verbs of all tuples into one normalized Model.

    For every tuple the 'verb' value is taken; unless its 'verb_label'
    is 'VB' it is lemmatized as a verb with WordNetLemmatizer. The word
    is converted with str() and added to the Model.

    If Model.update raises TypeError, diagnostic details are printed and
    the process exits with status 1.

    Returns the normalized Model.
    """
    verb_model = Model(True, need_stem=True)
    # One lemmatizer, created lazily, is shared by all tuples rather than
    # instantiating a fresh one per tuple.
    lemmatizer = None

    for single_tuple in example_result_tuples:
        word = single_tuple['verb']
        if single_tuple['verb_label'] != 'VB':
            if lemmatizer is None:
                lemmatizer = WordNetLemmatizer()
            word = lemmatizer.lemmatize(word, 'v')
        try:
            verb_model.update(text_list=[str(word)])
        except TypeError:
            # print(...) with one argument behaves identically on
            # Python 2 and Python 3.
            print("Wrong Word!")
            print(word)
            print(type(word))
            print(single_tuple)
            # Exit status 0 would signal success; report the failure.
            sys.exit(1)
    verb_model.normalize()

    return verb_model
def get_sub_features(model,size):
    """
    get top terms as features

    Every ``model[instance][w]`` is added into one Model, which is then
    normalized. Its terms are sorted by value in descending order and
    the first ``size`` of them are kept.

    model: nested mapping, indexed as ``model[instance][w]``, whose
        values can be added to a Model with ``+=``.
    size: number of top terms to keep. Selection only stops when the
        running count equals ``size``; if that never happens (for
        example ``size`` is 0, negative, or larger than the number of
        terms) all terms are returned.

    Returns a dict mapping term -> normalized value. Progress messages
    are printed to stdout.
    """
    data = Model(True, need_stem=True, input_stemmed=True)
    for instance in model:
        for w in model[instance]:
            data += model[instance][w]
    data.normalize()

    sorted_terms = sorted(data.model.items(), key=lambda x: x[1], reverse=True)
    features = {}
    # enumerate(..., 1) replaces the hand-maintained counter.
    for i, (w, v) in enumerate(sorted_terms, 1):
        features[w] = v
        if i == size:
            # Single-argument print(...) emits the same text on Python 2
            # and Python 3.
            print("break when i is %d" % i)
            break
    print("get %d features" % len(features))
    return features
Пример #13
0
def get_sub_features(model, size):
    """
    get top terms as features

    All ``model[instance][w]`` values are summed into a single Model and
    normalized; the terms are ranked by value (highest first) and the
    top ``size`` terms are returned.

    model: nested mapping, indexed as ``model[instance][w]``, whose
        values can be added to a Model with ``+=``.
    size: number of top terms to keep. The loop only stops when the
        running count equals ``size``; otherwise (``size`` of 0,
        negative, or above the number of terms) every term is kept.

    Returns a dict mapping term -> normalized value. Progress messages
    are printed to stdout.
    """
    data = Model(True, need_stem=True, input_stemmed=True)
    for instance in model:
        for w in model[instance]:
            data += model[instance][w]
    data.normalize()

    ranked = sorted(data.model.items(), key=lambda x: x[1], reverse=True)
    features = {}
    # Count from 1 with enumerate instead of a manual counter.
    for i, (w, v) in enumerate(ranked, 1):
        features[w] = v
        if i == size:
            # print(...) with one pre-formatted argument is valid and
            # identical on Python 2 and Python 3.
            print("break when i is %d" % i)
            break
    print("get %d features" % len(features))
    return features