def eval_seq_train(gold, pre, labels, hc=None, features=None):
    """ evaluate a sequence labeler """
    n = len(gold)
    tp = 0
    fp = 0
    fn = 0
    for i in range(n):
        (x, y, z) = eval_ner(gold[i], pre[i], labels)
        tp += x
        fp += y
        fn += z
        if hc is not None:
            if y + z > 0:
                sen = hc.sentences[i]
                print "sen: ", util.get_words(sen, features) + " OOV = " + str(has_oov(sen))
                print "true labels: ", util.get_lab_name(gold[i], labels)
                print "predicted: ", util.get_lab_name(pre[i], labels)
    try:
        pre = tp * 1.0 / (tp + fp)
        rec = tp * 1.0 / (tp + fn)
        f = 2.0 * pre * rec / (pre + rec)
        print pre, rec, f
    except ZeroDivisionError:
        print "DIV BY 0 ", tp, fp, fn
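# eval_seq_train (and eval_hc_test below) rely on an eval_ner helper that is
# not shown in these snippets. A minimal token-level sketch of such a helper,
# assuming labels maps tag names to the ids used in gold/pre and that the
# "outside" tag is named 'O' (both assumptions, not the project's actual code):
def eval_ner_sketch(gold_seq, pred_seq, labels):
    """Return (tp, fp, fn) for one sentence, counted per token."""
    other = labels.get('O')
    tp = fp = fn = 0
    for g, p in zip(gold_seq, pred_seq):
        if g != other and p == g:
            tp += 1
        else:
            if p != other:
                fp += 1  # predicted an entity tag that is not correct
            if g != other:
                fn += 1  # missed (or mis-tagged) a gold entity tag
    return tp, fp, fn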
def correct_query(raw_query, lemmatizer, local_cache):
    raw_query = raw_query.lower().strip()
    if raw_query in local_cache:
        return local_cache[raw_query]
    words = get_words(raw_query)
    new_words = list()
    for w in words:
        split = get_split(w)
        if isinstance(split, tuple):
            new_words.extend(list(split))
        else:
            new_words.append(split)
    new_query = ''
    for w in new_words:
        lemma = lemmatizer.lemmatize(w)
        if len(lemma) >= 4 and not lemma.isdigit() and not lemma.isalpha():
            split = split_word_num(w)
            if isinstance(split, tuple):
                w, num = split
                lemma = ' '.join([w, num])
        new_query += lemma + ' '
    new_query = new_query[0:-1]
    local_cache[raw_query] = new_query
    # print("New query: {}".format(new_query.encode("utf-8")))
    return new_query
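# correct_query only needs an object with a lemmatize(word) method; NLTK's
# WordNetLemmatizer has that interface (the wordnet corpus must be downloaded
# first). get_split and split_word_num are project helpers not shown here, so
# this usage sketch assumes they are importable alongside correct_query:
from nltk.stem import WordNetLemmatizer

query_cache = {}
cleaned = correct_query("Nikon Cameras 55mm", WordNetLemmatizer(), query_cache)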
def make_predictions(st_line, ed_line, out_file, pname, models):
    (cat_count, item_count, item_sort, alpha, beta, item_word, bigram_item_word,
     time_cat_item_dict, cat_word, hot_words) = models[0]
    reader = readfile(new_test_file)
    writer = writefile(out_file)
    line_idx = 0
    for (user, category, raw_query, click_time) in reader:
        line_idx += 1
        if line_idx < st_line:
            continue
        if line_idx > ed_line:
            break
        if line_idx % TEST_STEP == 0:
            print '%s--%d' % (pname, line_idx / TEST_STEP)
        time_block = get_time_feature(click_time)
        try:
            bound = cat_count[category][PREDICT_HOT_SIZE]
            hots = [x[0] for x in item_sort[category][0:bound]]
        except:
            # category missing from the count tables: emit a placeholder prediction
            writer.writerow(["0"])
            continue
        try:
            bigram = get_bigram_word(raw_query, hot_words, category)
            words = get_words(raw_query)
            query_size = sum([cat_word[category][w] for w in words])
            if query_size >= 100 and len(bigram) > 0:
                # only queries that are hot enough and can generate bigram
                # features are predicted by the boosting model
                rank = [[sku, boosting_bayes(bigram, words, category, sku, alpha, beta,
                                             item_word, bigram_item_word, item_count,
                                             cat_count, time_cat_item_dict, time_block)]
                        for sku in hots]
            elif query_size >= 100 and len(bigram) == 0:
                # hot enough but no bigram features: use naive Bayes with time information
                rank = [[sku, time_bayes_query_prediction(words, category, sku, alpha, beta,
                                                          item_word, item_count, cat_count,
                                                          time_cat_item_dict, time_block)]
                        for sku in hots]
            else:
                # otherwise use plain naive Bayes
                rank = [[sku, plain_bayes_query_prediction(words, category, sku, alpha, beta,
                                                           item_word, item_count, cat_count)]
                        for sku in hots]
            rank = sorted(rank, key=lambda x: x[1], reverse=True)
            guesses = [i[0] for i in rank[0:5]]
            guesses = rerank_guess(guesses, user, raw_query)
            writer.writerow([" ".join(guesses)])
        except (TypeError, KeyError):
            # a category we haven't seen before
            writer.writerow([" ".join(hots[0:5])])
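# The boosting_bayes / time_bayes_query_prediction / plain_bayes_query_prediction
# scorers used above are not included in these snippets. As a rough illustration
# of the underlying idea only (an assumption, not the project's actual scorer),
# a smoothed naive-Bayes log-likelihood of a query given one item's word counts
# could look like this:
import math

def naive_bayes_query_score_sketch(words, item_word_counts, item_total, vocab_size, alpha):
    """log P(query words | item) with add-alpha smoothing."""
    denom = item_total + alpha * vocab_size
    return sum(math.log((item_word_counts.get(w, 0) + alpha) / denom) for w in words)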
def get_title_words(self, dedupe=True):
    """Get the words (lowercase) from the title.

    Args:
        dedupe: Flag indicating if only unique words should be returned. If True,
            only unique words will be returned (in no particular order). If False,
            all words found will be returned in original order with duplicates.

    Returns:
        Iterable over strings representing the words found in the title.
    """
    return util.get_words(self.get_title(), dedupe=dedupe)
def song_search(request):
    query = request.path.split("/")[-1]
    res = song.search(sort='song_hotttnesss-desc', combined=query, results=1,
                      buckets=['id:lyricfind-US'], limit=True)[0]
    lid = res.get_foreign_id('lyricfind-US').split(":")[-1]
    lyrics = util.get_lyrics(lid)
    words = util.clean_lyrics(lyrics)
    words = util.get_words(words)
    words = util.remove_common_verbs(words)
    sorted_lyric_data = sorted(words.iteritems(), key=itemgetter(1), reverse=True)[0:10]
    sorted_lyric_counts = [x[1] for x in sorted_lyric_data]
    sorted_lyrics = [x[0] for x in sorted_lyric_data]
    response = {'artist': res.artist_name,
                'verbs': lyrics,
                'answer': res.title,
                'sorted_verbs': sorted_lyrics,
                'sorted_verb_counts': sorted_lyric_counts}
    return render_to_response("song_results.html", response)
def query():
    """Query for prototypical articles within a topic (using "search" url param).

    Returns:
        JSON listing of prototypical records for the given topic.
    """
    query_string = flask.request.args.get('search')
    keywords = util.get_words(query_string)
    report_maybe('query', query_string)
    records = records_keep.query(keywords)
    records_serial = list(
        sorted(map(model.serialize_record_to_dict, records),
               key=lambda x: x['source']))
    return json.dumps({'records': records_serial})
def get_unigram_model(item_sort, cat_count):
    reader = readfile(new_train_file)
    item_word = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    cat_word = defaultdict(lambda: defaultdict(int))
    idx = 0
    for (__user, sku, category, raw_query, ___click_time) in reader:
        idx += 1
        bound = cat_count[category][HOT_SIZE]
        popular = [i[0] for i in item_sort[category][0:bound]]
        if sku in popular:
            words = get_words(raw_query)
            for w in words:
                item_word[category][sku][w] += magic_num
                cat_word[category][w] += magic_num
    return item_word, cat_word
def api_song_search(request):
    query = request.path.split("/")[-1]
    try:
        res = song.search(combined=query, results=1,
                          buckets=['id:lyricfind-US'], limit=True)[0]
        lid = res.get_foreign_id('lyricfind-US').split(":")[-1]
        lyrics = util.get_lyrics(lid)
        words = util.clean_lyrics(lyrics)
        words = util.get_words(words)
        words = util.remove_common_words(words)
        sorted_lyric_data = sorted(words.iteritems(), key=itemgetter(1), reverse=False)[-20:]
        sorted_lyric_counts = [x[1] for x in sorted_lyric_data]
        sorted_lyrics = [x[0] for x in sorted_lyric_data]
        sorted_lyric_combined = [[x[1], x[0]] for x in sorted_lyric_data]
        response = {'artist': res.artist_name,
                    'title': res.title,
                    'sorted_words': sorted_lyric_combined}
    except Exception, e:
        response = {'error': str(e)}
def eval_hc_test(hc, features, labels, print_err=False, decoder='hc'):
    """ evaluate on the test set (testa)
    :param hc:
    :param labels:
    :return:
    """
    tp = 0
    fp = 0
    fn = 0
    dirname = "testa"
    input = []
    for file in os.listdir(dirname):
        # print file
        if file.endswith(".txt"):
            f = open(os.path.join(dirname, file))
            l = list(f)
            input.extend(l)
            f.close()
    # return input
    sentences = util.extract(input, features, labels, keep_word=True)
    # return sentences
    for sen in sentences:
        if True:  # if not has_oov(sen):
            # predicted = hc.decode(util.get_obs(sen))
            predicted = get_tag(hc, sen, features, decoder)
            (x, y, z) = eval_ner(util.get_lab(sen), predicted, labels)
            tp += x
            fp += y
            fn += z
            if print_err:
                if y + z > 0:
                    print "sen: ", util.get_words(sen, features) + " OOV = " + str(has_oov(sen))
                    print "true labels: ", util.get_lab_name(util.get_lab(sen), labels)
                    print "predicted: ", util.get_lab_name(predicted, labels)
    try:
        pre = tp * 1.0 / (tp + fp)
        rec = tp * 1.0 / (tp + fn)
        f = 2.0 * pre * rec / (pre + rec)
        print pre, rec, f
    except ZeroDivisionError:
        print "DIV BY 0 ", tp, fp, fn
def make_predictions(st_line, ed_line, out_file, pname, models):
    (cat_count, item_count, item_sort, alpha, beta, item_word, bigram_item_word,
     time_cat_item_dict, cat_word, hot_words) = models[0]
    reader = readfile(new_test_file)
    writer = writefile(out_file)
    line_idx = 0
    for (user, category, raw_query, click_time) in reader:
        line_idx += 1
        if line_idx < st_line:
            continue
        if line_idx > ed_line:
            break
        if line_idx % TEST_STEP == 0:
            print '%s--%d' % (pname, line_idx / TEST_STEP)
        time_block = get_time_feature(click_time)
        try:
            bound = cat_count[category][PREDICT_HOT_SIZE]
            hots = [x[0] for x in item_sort[category][0:bound]]
        except:
            # category missing from the count tables: emit a placeholder prediction
            writer.writerow(["0"])
            continue
        try:
            bigram = get_bigram_word(raw_query, hot_words, category)
            words = get_words(raw_query)
            query_size = sum([cat_word[category][w] for w in words])
            if query_size >= 100 and len(bigram) > 0:
                # only queries that are hot enough and can generate bigram
                # features are predicted by the boosting model
                rank = [[sku, boosting_bayes(bigram, words, category, sku, alpha, beta,
                                             item_word, bigram_item_word, item_count,
                                             cat_count, time_cat_item_dict, time_block)]
                        for sku in hots]
            elif query_size >= 100 and len(bigram) == 0:
                # hot enough but no bigram features: use naive Bayes with time information
                rank = [[sku, time_bayes_query_prediction(words, category, sku, alpha, beta,
                                                          item_word, item_count, cat_count,
                                                          time_cat_item_dict, time_block)]
                        for sku in hots]
            else:
                # otherwise use plain naive Bayes
                rank = [[sku, plain_bayes_query_prediction(words, category, sku, alpha, beta,
                                                           item_word, item_count, cat_count)]
                        for sku in hots]
            rank = sorted(rank, key=lambda x: x[1], reverse=True)
            guesses = [i[0] for i in rank[0:5]]
            guesses = rerank_guess(guesses, user, raw_query)
            writer.writerow([" ".join(guesses)])
        except (TypeError, KeyError):
            # a category we haven't seen before
            writer.writerow([" ".join(hots[0:5])])
def correct_query(raw_query, lemmatizer, local_cache):
    raw_query = raw_query.lower().strip()
    if raw_query in local_cache:
        return local_cache[raw_query]
    words = get_words(raw_query)
    new_words = list()
    for w in words:
        split = get_split(w)
        if isinstance(split, tuple):
            new_words.extend(list(split))
        else:
            new_words.append(split)
    new_query = ''
    for w in new_words:
        lemma = lemmatizer.lemmatize(w)
        if len(lemma) >= 4 and not lemma.isdigit() and not lemma.isalpha():
            split = split_word_num(w)
            if isinstance(split, tuple):
                w, num = split
                lemma = ' '.join([w, num])
        new_query += lemma + ' '
    new_query = new_query[0:-1]
    local_cache[raw_query] = new_query
    return new_query
# reserve the last two one-hot positions for the 'unk' and empty-string answers
answer_one_hot = [0 for i in range(largest_num_of_words_any_paragraph + 2)]
answer_one_hot[largest_num_of_words_any_paragraph] = 1
answer_lookup_dict['unk'] = answer_one_hot

answer_one_hot = [0 for i in range(largest_num_of_words_any_paragraph + 2)]
answer_one_hot[largest_num_of_words_any_paragraph + 1] = 1
answer_lookup_dict[''] = answer_one_hot

# run the answer network on the current question/paragraph and decode the words
feed_dict = {question: questions_words, text: paragraphs_sentences}
classification = sess.run(answer_softmax, feed_dict)
print util.get_words(classification, answer_lookup_dict, largest_num_of_words_in_answer)

while True:
    proceed = raw_input(
        "Do you want to ask another question (q), enter new paragraph (p) or exit (e): ")
    if proceed not in ['p', 'q', 'e']:
        print "Invalid input"
    else:
        break
        LEFT JOIN tweet_properties ON id = tweet_id
        WHERE TO_TIMESTAMP(%s) <= time AND time < TO_TIMESTAMP(%s)
        ORDER BY time ASC
    """, (last_runtime, current_time))

last_runtime = current_time

if current_time - last_traintime >= 30:
    pericog.update()
    last_traintime = current_time

ids = []
X = []
for id, timestamp, geolocation, exact, user, text in db_tweets_cursor.fetchall():
    if not get_words(text):
        continue
    ids.append(id)
    X.append(text)
    db_tweets_cursor.execute(
        """
        INSERT INTO tweet_votes (tweet_id, user_ip, disaster)
        VALUES (%s, '0.0.0.0', False)
        """, (id, ))

if X:
    Y = pericog.predict(X)
def get_bigram_word(raw_query, hot_words, cat):
    words = get_words(raw_query)
    words = [w for w in words if w in hot_words[cat]]
    words.sort()
    bigram = get_pair(words)
    return bigram
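# get_pair is not defined in these snippets. A plausible sketch (assumption) is
# that it turns the sorted, filtered word list into unordered word pairs:
from itertools import combinations

def get_pair_sketch(words):
    """Return every two-word combination of an already-sorted word list."""
    return [' '.join(pair) for pair in combinations(words, 2)]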
def test_get_words(self):
    words = util.get_words('test sentence 1.', False)
    self.assertEqual(len(words), 3)
    self.assertEqual(words[0], 'test')
    self.assertEqual(words[1], 'sentence')
    self.assertEqual(words[2], '1')
def test_get_words_special_chars(self):
    words = util.get_words('test sentence-1.', False)
    self.assertEqual(len(words), 2)
    self.assertEqual(words[0], 'test')
    self.assertEqual(words[1], 'sentence-1')
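# The two tests above pin down the observable behaviour of util.get_words:
# lowercase whitespace tokens, surrounding punctuation stripped, hyphenated
# tokens kept whole, and a dedupe flag (see get_title_words above). A sketch
# consistent with those tests, not the project's actual implementation:
import string

def get_words_sketch(text, dedupe=True):
    words = [w.strip(string.punctuation) for w in text.lower().split()]
    words = [w for w in words if w]
    if dedupe:
        return set(words)
    return words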
def searchable_words(self):
    return get_words(self.search_text())