Example #1
def eval_seq_train(gold, pre, labels, hc = None, features = None):
    """
    evaluate a sequence labeler
    """
    n = len(gold)
    tp = 0
    fp = 0
    fn = 0

    for i in range(n):
        (x, y, z) = eval_ner(gold[i], pre[i], labels)
        tp += x
        fp += y
        fn += z
        if hc is not None and y + z > 0:
            sen = hc.sentences[i]
            print "sen: ", util.get_words(sen, features) + " OOV = " + str(has_oov(sen))
            print "true labels: ", util.get_lab_name(gold[i], labels)
            print "predicted: ", util.get_lab_name(pre[i], labels)

    try:
        prec = tp * 1.0 / (tp + fp)  # precision (renamed so it does not shadow the `pre` argument)
        rec = tp * 1.0 / (tp + fn)   # recall
        f = 2.0 * prec * rec / (prec + rec)  # F1
        print prec, rec, f
    except ZeroDivisionError:
        print "DIV BY 0 ", tp, fp, fn
Example #2
def correct_query(raw_query, lemmatizer, local_cache):
    raw_query = raw_query.lower().strip()
    if raw_query in local_cache:
        return local_cache[raw_query]
    words = get_words(raw_query)
    new_words = list()
    for w in words:
        split = get_split(w)
        if isinstance(split, tuple):
            new_words.extend(split)
        else:
            new_words.append(split)
    lemmas = []
    for w in new_words:
        lemma = lemmatizer.lemmatize(w)
        if len(lemma) >= 4 and not lemma.isdigit() and not lemma.isalpha():
            # mixed alphanumeric token: try to split it into word and number parts
            split = split_word_num(w)
            if isinstance(split, tuple):
                w, num = split
                lemma = ' '.join([w, num])
        lemmas.append(lemma)
    new_query = ' '.join(lemmas)
    local_cache[raw_query] = new_query
    #print("New query: {}".format(new_query.encode("utf-8")))
    return new_query
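A minimal driver for the memoized normalization above, assuming NLTK's WordNetLemmatizer and hypothetical stand-ins for the project helpers get_words, get_split and split_word_num (none of which are shown in the snippet):

from nltk.stem import WordNetLemmatizer  # assumption: any object with a .lemmatize(word) method

# hypothetical stand-ins for the project helpers used by correct_query
def get_words(q):
    return q.split()

def get_split(w):
    return w  # no compound splitting in this sketch

def split_word_num(w):
    # split a trailing number off a token, e.g. 'ipad2' -> ('ipad', '2')
    head = w.rstrip('0123456789')
    return (head, w[len(head):]) if head and head != w else w

cache = {}
print correct_query("iPad2 Case", WordNetLemmatizer(), cache)  # 'ipad 2 case'
print correct_query("ipad2 case", WordNetLemmatizer(), cache)  # served from the cache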
Example #3
def make_predictions(st_line, ed_line, out_file, pname, models):
    cat_count, item_count, item_sort, alpha, beta, item_word, bigram_item_word, time_cat_item_dict, cat_word, hot_words = models[0]
    reader = readfile(new_test_file)
    writer = writefile(out_file)
    line_idx = 0
    for (user, category, raw_query, click_time) in reader:
        line_idx += 1
        if line_idx < st_line:
            continue
        if line_idx > ed_line:
            break
        if line_idx % TEST_STEP == 0:
            print '%s--%d' % (pname, line_idx / TEST_STEP)
        time_block = get_time_feature(click_time)
        try:
            bound = cat_count[category][PREDICT_HOT_SIZE]
            hots = [x[0] for x in item_sort[category][0:bound]]
        except KeyError:  # unseen category: nothing to rank
            writer.writerow(["0"])
            continue
        try:
            bigram = get_bigram_word(raw_query, hot_words, category)
            words = get_words(raw_query)
            query_size = sum([cat_word[category][w] for w in words])
            if query_size >= 100 and len(bigram) > 0:
                # only queries that are hot enough and can generate bigram features are predicted by the boosting model
                rank = [[
                    sku,
                    boosting_bayes(bigram, words, category, sku, alpha, beta,
                                   item_word, bigram_item_word, item_count,
                                   cat_count, time_cat_item_dict, time_block)
                ] for sku in hots]
            elif query_size >= 100 and len(bigram) == 0:
                # if hot enough but unable to generate bigram features, use naive Bayes with time information
                rank = [[
                    sku,
                    time_bayes_query_prediction(words, category, sku, alpha,
                                                beta, item_word, item_count,
                                                cat_count, time_cat_item_dict,
                                                time_block)
                ] for sku in hots]
            else:
                # otherwise use plain naive Bayes
                rank = [[
                    sku,
                    plain_bayes_query_prediction(words, category, sku, alpha,
                                                 beta, item_word, item_count,
                                                 cat_count)
                ] for sku in hots]
            rank = sorted(rank, key=lambda x: x[1], reverse=True)
            guesses = [i[0] for i in rank[0:5]]
            guesses = rerank_guess(guesses, user, raw_query)

            writer.writerow([" ".join(guesses)])
        except (TypeError, KeyError):  # a category we haven't seen before
            writer.writerow([" ".join(hots[0:5])])
Example #4
    def get_title_words(self, dedupe=True):
        """Get the words (lowercase) from the title.

        Args:
            dedupe: Flag indicating if only unique words should be returned. If True, only unique
                words will be returned (in no particular order). If False, all words found will
                be returned in original order with duplicates.
        Returns:
            Iterable over strings representing the words found in the title.
        """
        return util.get_words(self.get_title(), dedupe=dedupe)
Example #5
def song_search(request):
    query = request.path.split("/")[-1]
    res = song.search(sort='song_hotttnesss-desc', combined=query, results=1,
                      buckets=['id:lyricfind-US'], limit=True)[0]
    lid = res.get_foreign_id('lyricfind-US').split(":")[-1]
    lyrics = util.get_lyrics(lid)
    words = util.clean_lyrics(lyrics)
    words = util.get_words(words)
    words = util.remove_common_verbs(words)
    sorted_lyric_data = sorted(words.iteritems(), key=itemgetter(1), reverse=True)[0:10]
    sorted_lyric_counts = [x[1] for x in sorted_lyric_data]
    sorted_lyrics = [x[0] for x in sorted_lyric_data]
    response = {'artist': res.artist_name, 'verbs': lyrics, 'answer': res.title,
                'sorted_verbs': sorted_lyrics, 'sorted_verb_counts': sorted_lyric_counts}
    return render_to_response("song_results.html",response)
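The top-10 selection in song_search is the classic count-then-sort pattern; a minimal equivalent sketch using collections.Counter (the word counts here are hypothetical):

from collections import Counter

counts = Counter({'love': 12, 'baby': 9, 'night': 7})
top = counts.most_common(10)             # [('love', 12), ('baby', 9), ('night', 7)]
sorted_lyrics = [w for w, _ in top]
sorted_lyric_counts = [c for _, c in top]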
Example #6
    def query():
        """Query for prototypical articles within a topic (using "search" url param).

        Returns:
            JSON listing of prototypical records for the given topic.
        """
        query_string = flask.request.args.get('search')
        keywords = util.get_words(query_string)
        report_maybe('query', query_string)
        records = records_keep.query(keywords)
        records_serial = list(
            sorted(map(model.serialize_record_to_dict, records),
                   key=lambda x: x['source']))
        return json.dumps({'records': records_serial})
Example #7
def get_unigram_model(item_sort, cat_count):
    reader = readfile(new_train_file)
    item_word = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    cat_word = defaultdict(lambda: defaultdict(int))
    idx = 0
    for (__user, sku, category, raw_query, ___click_time) in reader:
        idx += 1
        bound = cat_count[category][HOT_SIZE]
        popular = [i[0] for i in item_sort[category][0:bound]]
        if sku in popular:
            words = get_words(raw_query)
            for w in words:
                item_word[category][sku][w] += magic_num
                cat_word[category][w] += magic_num
    return item_word, cat_word
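A minimal sketch of the nested-defaultdict counting pattern used above (the category, sku and word values are hypothetical):

from collections import defaultdict

item_word = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
item_word['cat8']['sku123']['battery'] += 1   # no KeyError on first access
print item_word['cat8']['sku123']['battery']  # 1
print item_word['cat8']['sku999']['battery']  # 0 (created on demand)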
Example #9
def api_song_search(request):
    query = request.path.split("/")[-1]
    try:
        res = song.search(combined=query,results=1,buckets=['id:lyricfind-US'], limit=True)[0]
        lid = res.get_foreign_id('lyricfind-US').split(":")[-1]
        lyrics = util.get_lyrics(lid)
        words = util.clean_lyrics(lyrics)
        words = util.get_words(words)
        words = util.remove_common_words(words)
        sorted_lyric_data = sorted(words.iteritems(), key=itemgetter(1), reverse=False)[-20:]
        sorted_lyric_counts = [x[1] for x in sorted_lyric_data]
        sorted_lyrics = [x[0] for x in sorted_lyric_data]
        sorted_lyric_combined = [[x[1],x[0]] for x in sorted_lyric_data]
        response = {'artist' : res.artist_name, 'title' : res.title, 'sorted_words' : sorted_lyric_combined}
    except Exception as e:
        response = {'error' : str(e)}
Example #10
def eval_hc_test(hc, features, labels, print_err=False, decoder='hc'):
    """
    evaluate in the train set
    :param hc:
    :param labels:
    :return:
    """
    tp = 0
    fp = 0
    fn = 0

    dirname = "testa"
    input = []
    for file in os.listdir(dirname):
        # print file
        if file.endswith(".txt"):
            f = open(os.path.join(dirname, file))
            l = list(f)
            input.extend(l)
            f.close()
    # return input
    sentences = util.extract(input, features, labels, keep_word = True)

    # return sentences

    for sen in sentences:
        predicted = get_tag(hc, sen, features, decoder)
        (x, y, z) = eval_ner(util.get_lab(sen), predicted, labels)
        tp += x
        fp += y
        fn += z
        if print_err and y + z > 0:
            print "sen: ", util.get_words(sen, features) + " OOV = " + str(has_oov(sen))
            print "true labels: ", util.get_lab_name(util.get_lab(sen), labels)
            print "predicted: ", util.get_lab_name(predicted, labels)

    try:
        prec = tp * 1.0 / (tp + fp)  # precision
        rec = tp * 1.0 / (tp + fn)   # recall
        f = 2.0 * prec * rec / (prec + rec)  # F1
        print prec, rec, f
    except ZeroDivisionError:
        print "DIV BY 0 ", tp, fp, fn
Example #13
        # reserve the last two one-hot positions for 'unk' and the empty string
        answer_one_hot = [0] * (largest_num_of_words_any_paragraph + 2)
        answer_one_hot[largest_num_of_words_any_paragraph] = 1
        answer_lookup_dict['unk'] = answer_one_hot
        answer_one_hot = [0] * (largest_num_of_words_any_paragraph + 2)
        answer_one_hot[largest_num_of_words_any_paragraph + 1] = 1
        answer_lookup_dict[''] = answer_one_hot
        feed_dict = {question: questions_words, text: paragraphs_sentences}
        classification = sess.run(answer_softmax, feed_dict)
        print util.get_words(classification, answer_lookup_dict,
                             largest_num_of_words_in_answer)

    while True:
        proceed = raw_input(
            "Do you want to ask another question (q), enter new paragraph (p) or exit (e): "
        )
        if proceed not in ['p', 'q', 'e']:
            print "Invalid input"
        else:
            break
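The 'unk' and empty-string entries above reserve the last two positions of each one-hot vector; a minimal sketch of that layout, with vocab_size standing in for largest_num_of_words_any_paragraph (hypothetical values):

def one_hot(index, size):
    v = [0] * size
    v[index] = 1
    return v

vocab_size = 4
lookup = {'unk': one_hot(vocab_size, vocab_size + 2),     # [0, 0, 0, 0, 1, 0]
          '': one_hot(vocab_size + 1, vocab_size + 2)}    # [0, 0, 0, 0, 0, 1]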
Example #14
			LEFT JOIN tweet_properties ON id = tweet_id
			WHERE TO_TIMESTAMP(%s) <= time AND time < TO_TIMESTAMP(%s)
			ORDER BY time ASC
		""", (last_runtime, current_time))

    last_runtime = current_time

    if current_time - last_traintime >= 30:
        pericog.update()
        last_traintime = current_time

    ids = []
    X = []
    for id, timestamp, geolocation, exact, user, text in db_tweets_cursor.fetchall():
        if not get_words(text):
            continue

        ids.append(id)
        X.append(text)

        db_tweets_cursor.execute(
            """
				INSERT INTO tweet_votes
					(tweet_id, user_ip, disaster)
				VALUES
					(%s, '0.0.0.0', False)
			""", (id, ))

    if X:
        Y = pericog.predict(X)
Example #15
def get_bigram_word(raw_query, hot_words, cat):
    words = get_words(raw_query)
    words = [w for w in words if w in hot_words[cat]]
    words.sort()
    bigram = get_pair(words)
    return bigram
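get_pair is not shown here; given that the words are sorted first, a plausible reading is all unordered word pairs. A hypothetical sketch:

from itertools import combinations

def get_pair(words):
    # all unordered pairs of the (sorted) query words
    return [a + '_' + b for a, b in combinations(words, 2)]

print get_pair(['case', 'ipad', 'mini'])
# ['case_ipad', 'case_mini', 'ipad_mini']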
Example #16
    def test_get_words(self):
        words = util.get_words('test sentence 1.', False)
        self.assertEqual(len(words), 3)
        self.assertEqual(words[0], 'test')
        self.assertEqual(words[1], 'sentence')
        self.assertEqual(words[2], '1')
Example #17
    def test_get_words_special_chars(self):
        words = util.get_words('test sentence-1.', False)
        self.assertEqual(len(words), 2)
        self.assertEqual(words[0], 'test')
        self.assertEqual(words[1], 'sentence-1')
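Taken together, these two tests pin down the tokenizer: lowercase words, punctuation dropped, hyphenated tokens kept whole. A minimal implementation consistent with them, assuming regex tokenization (the real util.get_words may differ):

import re

def get_words(text, dedupe=True):
    words = re.findall(r'[\w-]+', text.lower())
    return list(set(words)) if dedupe else words

print get_words('test sentence 1.', False)   # ['test', 'sentence', '1']
print get_words('test sentence-1.', False)   # ['test', 'sentence-1']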
Example #19
    def searchable_words(self):
        return get_words(self.search_text())