def eval_seq_train(gold, pre, labels, hc=None, features=None):
    """ evaluate a sequence labeler """
    n = len(gold)
    tp = 0
    fp = 0
    fn = 0
    for i in range(n):
        (x, y, z) = eval_ner(gold[i], pre[i], labels)
        tp += x
        fp += y
        fn += z
        if hc is not None:
            if y + z > 0:
                sen = hc.sentences[i]
                print "sen: ", util.get_words(sen, features) + " OOV = " + str(has_oov(sen))
                print "true labels: ", util.get_lab_name(gold[i], labels)
                print "predicted: ", util.get_lab_name(pre[i], labels)
    try:
        pre = tp * 1.0 / (tp + fp)
        rec = tp * 1.0 / (tp + fn)
        f = 2.0 * pre * rec / (pre + rec)
        print pre, rec, f
    except ZeroDivisionError:
        print "DIV BY 0 ", tp, fp, fn
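# eval_seq_train (and eval_hc_test below) rely on an eval_ner helper that is
# not shown in these snippets. A minimal token-level sketch of such a helper,
# assuming labels maps tag names to the ids used in gold/pre and that the
# "outside" tag is named 'O' (both assumptions, not the project's actual code):
def eval_ner_sketch(gold_seq, pred_seq, labels):
    """Return (tp, fp, fn) for one sentence, counted per token."""
    other = labels.get('O')
    tp = fp = fn = 0
    for g, p in zip(gold_seq, pred_seq):
        if g != other and p == g:
            tp += 1
        else:
            if p != other:
                fp += 1  # predicted an entity tag that is not correct
            if g != other:
                fn += 1  # missed (or mis-tagged) a gold entity tag
    return tp, fp, fn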
def correct_query(raw_query, lemmatizer, local_cache):
    raw_query = raw_query.lower().strip()
    if raw_query in local_cache:
        return local_cache[raw_query]
    words = get_words(raw_query)
    new_words = list()
    for w in words:
        split = get_split(w)
        if isinstance(split, tuple):
            new_words.extend(list(split))
        else:
            new_words.append(split)
    new_query = ''
    for w in new_words:
        lemma = lemmatizer.lemmatize(w)
        if len(lemma) >= 4 and not lemma.isdigit() and not lemma.isalpha():
            split = split_word_num(w)
            if isinstance(split, tuple):
                w, num = split
                lemma = ' '.join([w, num])
        new_query += lemma + ' '
    new_query = new_query[0:-1]
    local_cache[raw_query] = new_query
    # print("New query: {}".format(new_query.encode("utf-8")))
    return new_query
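# correct_query only needs an object with a lemmatize(word) method; NLTK's
# WordNetLemmatizer has that interface (the wordnet corpus must be downloaded
# first). get_split and split_word_num are project helpers not shown here, so
# this usage sketch assumes they are importable alongside correct_query:
from nltk.stem import WordNetLemmatizer

query_cache = {}
cleaned = correct_query("Nikon Cameras 55mm", WordNetLemmatizer(), query_cache)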
def make_predictions(st_line, ed_line, out_file, pname, models):
    (cat_count, item_count, item_sort, alpha, beta, item_word, bigram_item_word,
     time_cat_item_dict, cat_word, hot_words) = models[0]
    reader = readfile(new_test_file)
    writer = writefile(out_file)
    line_idx = 0
    for (user, category, raw_query, click_time) in reader:
        line_idx += 1
        if line_idx < st_line:
            continue
        if line_idx > ed_line:
            break
        if line_idx % TEST_STEP == 0:
            print '%s--%d' % (pname, line_idx / TEST_STEP)
        time_block = get_time_feature(click_time)
        try:
            bound = cat_count[category][PREDICT_HOT_SIZE]
            hots = [x[0] for x in item_sort[category][0:bound]]
        except:
            # category missing from the count tables: emit a placeholder prediction
            writer.writerow(["0"])
            continue
        try:
            bigram = get_bigram_word(raw_query, hot_words, category)
            words = get_words(raw_query)
            query_size = sum([cat_word[category][w] for w in words])
            if query_size >= 100 and len(bigram) > 0:
                # only queries that are hot enough and can generate bigram
                # features are predicted by the boosting model
                rank = [[sku, boosting_bayes(bigram, words, category, sku, alpha, beta,
                                             item_word, bigram_item_word, item_count,
                                             cat_count, time_cat_item_dict, time_block)]
                        for sku in hots]
            elif query_size >= 100 and len(bigram) == 0:
                # hot enough but no bigram features: use naive Bayes with time information
                rank = [[sku, time_bayes_query_prediction(words, category, sku, alpha, beta,
                                                          item_word, item_count, cat_count,
                                                          time_cat_item_dict, time_block)]
                        for sku in hots]
            else:
                # otherwise use plain naive Bayes
                rank = [[sku, plain_bayes_query_prediction(words, category, sku, alpha, beta,
                                                           item_word, item_count, cat_count)]
                        for sku in hots]
            rank = sorted(rank, key=lambda x: x[1], reverse=True)
            guesses = [i[0] for i in rank[0:5]]
            guesses = rerank_guess(guesses, user, raw_query)
            writer.writerow([" ".join(guesses)])
        except (TypeError, KeyError):
            # a category we haven't seen before
            writer.writerow([" ".join(hots[0:5])])
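# The boosting_bayes / time_bayes_query_prediction / plain_bayes_query_prediction
# scorers used above are not included in these snippets. As a rough illustration
# of the underlying idea only (an assumption, not the project's actual scorer),
# a smoothed naive-Bayes log-likelihood of a query given one item's word counts
# could look like this:
import math

def naive_bayes_query_score_sketch(words, item_word_counts, item_total, vocab_size, alpha):
    """log P(query words | item) with add-alpha smoothing."""
    denom = item_total + alpha * vocab_size
    return sum(math.log((item_word_counts.get(w, 0) + alpha) / denom) for w in words)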
def get_title_words(self, dedupe=True):
    """Get the words (lowercase) from the title.

    Args:
        dedupe: Flag indicating if only unique words should be returned. If True,
            only unique words will be returned (in no particular order). If False,
            all words found will be returned in original order with duplicates.

    Returns:
        Iterable over strings representing the words found in the title.
    """
    return util.get_words(self.get_title(), dedupe=dedupe)
def song_search(request):
    query = request.path.split("/")[-1]
    res = song.search(sort='song_hotttnesss-desc', combined=query, results=1,
                      buckets=['id:lyricfind-US'], limit=True)[0]
    lid = res.get_foreign_id('lyricfind-US').split(":")[-1]
    lyrics = util.get_lyrics(lid)
    words = util.clean_lyrics(lyrics)
    words = util.get_words(words)
    words = util.remove_common_verbs(words)
    sorted_lyric_data = sorted(words.iteritems(), key=itemgetter(1), reverse=True)[0:10]
    sorted_lyric_counts = [x[1] for x in sorted_lyric_data]
    sorted_lyrics = [x[0] for x in sorted_lyric_data]
    response = {'artist': res.artist_name,
                'verbs': lyrics,
                'answer': res.title,
                'sorted_verbs': sorted_lyrics,
                'sorted_verb_counts': sorted_lyric_counts}
    return render_to_response("song_results.html", response)
def query():
    """Query for prototypical articles within a topic (using "search" url param).

    Returns:
        JSON listing of prototypical records for the given topic.
    """
    query_string = flask.request.args.get('search')
    keywords = util.get_words(query_string)
    report_maybe('query', query_string)
    records = records_keep.query(keywords)
    records_serial = list(
        sorted(map(model.serialize_record_to_dict, records),
               key=lambda x: x['source']))
    return json.dumps({'records': records_serial})
def get_unigram_model(item_sort, cat_count):
    reader = readfile(new_train_file)
    item_word = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    cat_word = defaultdict(lambda: defaultdict(int))
    idx = 0
    for (__user, sku, category, raw_query, ___click_time) in reader:
        idx += 1
        bound = cat_count[category][HOT_SIZE]
        popular = [i[0] for i in item_sort[category][0:bound]]
        if sku in popular:
            words = get_words(raw_query)
            for w in words:
                item_word[category][sku][w] += magic_num
                cat_word[category][w] += magic_num
    return item_word, cat_word
def api_song_search(request):
    query = request.path.split("/")[-1]
    try:
        res = song.search(combined=query, results=1,
                          buckets=['id:lyricfind-US'], limit=True)[0]
        lid = res.get_foreign_id('lyricfind-US').split(":")[-1]
        lyrics = util.get_lyrics(lid)
        words = util.clean_lyrics(lyrics)
        words = util.get_words(words)
        words = util.remove_common_words(words)
        sorted_lyric_data = sorted(words.iteritems(), key=itemgetter(1), reverse=False)[-20:]
        sorted_lyric_counts = [x[1] for x in sorted_lyric_data]
        sorted_lyrics = [x[0] for x in sorted_lyric_data]
        sorted_lyric_combined = [[x[1], x[0]] for x in sorted_lyric_data]
        response = {'artist': res.artist_name,
                    'title': res.title,
                    'sorted_words': sorted_lyric_combined}
    except Exception, e:
        response = {'error': str(e)}
def eval_hc_test(hc, features, labels, print_err=False, decoder='hc'):
    """ evaluate on the test set (testa)
    :param hc:
    :param labels:
    :return:
    """
    tp = 0
    fp = 0
    fn = 0
    dirname = "testa"
    input = []
    for file in os.listdir(dirname):
        # print file
        if file.endswith(".txt"):
            f = open(os.path.join(dirname, file))
            l = list(f)
            input.extend(l)
            f.close()
    # return input
    sentences = util.extract(input, features, labels, keep_word=True)
    # return sentences
    for sen in sentences:
        if True:  # if not has_oov(sen):
            # predicted = hc.decode(util.get_obs(sen))
            predicted = get_tag(hc, sen, features, decoder)
            (x, y, z) = eval_ner(util.get_lab(sen), predicted, labels)
            tp += x
            fp += y
            fn += z
            if print_err:
                if y + z > 0:
                    print "sen: ", util.get_words(sen, features) + " OOV = " + str(has_oov(sen))
                    print "true labels: ", util.get_lab_name(util.get_lab(sen), labels)
                    print "predicted: ", util.get_lab_name(predicted, labels)
    try:
        pre = tp * 1.0 / (tp + fp)
        rec = tp * 1.0 / (tp + fn)
        f = 2.0 * pre * rec / (pre + rec)
        print pre, rec, f
    except ZeroDivisionError:
        print "DIV BY 0 ", tp, fp, fn
def make_predictions(st_line, ed_line, out_file, pname, models):
    (cat_count, item_count, item_sort, alpha, beta, item_word, bigram_item_word,
     time_cat_item_dict, cat_word, hot_words) = models[0]
    reader = readfile(new_test_file)
    writer = writefile(out_file)
    line_idx = 0
    for (user, category, raw_query, click_time) in reader:
        line_idx += 1
        if line_idx < st_line:
            continue
        if line_idx > ed_line:
            break
        if line_idx % TEST_STEP == 0:
            print '%s--%d' % (pname, line_idx / TEST_STEP)
        time_block = get_time_feature(click_time)
        try:
            bound = cat_count[category][PREDICT_HOT_SIZE]
            hots = [x[0] for x in item_sort[category][0:bound]]
        except:
            # category missing from the count tables: emit a placeholder prediction
            writer.writerow(["0"])
            continue
        try:
            bigram = get_bigram_word(raw_query, hot_words, category)
            words = get_words(raw_query)
            query_size = sum([cat_word[category][w] for w in words])
            if query_size >= 100 and len(bigram) > 0:
                # only queries that are hot enough and can generate bigram
                # features are predicted by the boosting model
                rank = [[sku, boosting_bayes(bigram, words, category, sku, alpha, beta,
                                             item_word, bigram_item_word, item_count,
                                             cat_count, time_cat_item_dict, time_block)]
                        for sku in hots]
            elif query_size >= 100 and len(bigram) == 0:
                # hot enough but no bigram features: use naive Bayes with time information
                rank = [[sku, time_bayes_query_prediction(words, category, sku, alpha, beta,
                                                          item_word, item_count, cat_count,
                                                          time_cat_item_dict, time_block)]
                        for sku in hots]
            else:
                # otherwise use plain naive Bayes
                rank = [[sku, plain_bayes_query_prediction(words, category, sku, alpha, beta,
                                                           item_word, item_count, cat_count)]
                        for sku in hots]
            rank = sorted(rank, key=lambda x: x[1], reverse=True)
            guesses = [i[0] for i in rank[0:5]]
            guesses = rerank_guess(guesses, user, raw_query)
            writer.writerow([" ".join(guesses)])
        except (TypeError, KeyError):
            # a category we haven't seen before
            writer.writerow([" ".join(hots[0:5])])
def correct_query(raw_query, lemmatizer, local_cache):
    raw_query = raw_query.lower().strip()
    if raw_query in local_cache:
        return local_cache[raw_query]
    words = get_words(raw_query)
    new_words = list()
    for w in words:
        split = get_split(w)
        if isinstance(split, tuple):
            new_words.extend(list(split))
        else:
            new_words.append(split)
    new_query = ''
    for w in new_words:
        lemma = lemmatizer.lemmatize(w)
        if len(lemma) >= 4 and not lemma.isdigit() and not lemma.isalpha():
            split = split_word_num(w)
            if isinstance(split, tuple):
                w, num = split
                lemma = ' '.join([w, num])
        new_query += lemma + ' '
    new_query = new_query[0:-1]
    local_cache[raw_query] = new_query
    return new_query
# reserve the last two one-hot positions for the 'unk' and empty-string answers
answer_one_hot = [0 for i in range(largest_num_of_words_any_paragraph + 2)]
answer_one_hot[largest_num_of_words_any_paragraph] = 1
answer_lookup_dict['unk'] = answer_one_hot

answer_one_hot = [0 for i in range(largest_num_of_words_any_paragraph + 2)]
answer_one_hot[largest_num_of_words_any_paragraph + 1] = 1
answer_lookup_dict[''] = answer_one_hot

# run the answer network on the current question/paragraph and decode the words
feed_dict = {question: questions_words, text: paragraphs_sentences}
classification = sess.run(answer_softmax, feed_dict)
print util.get_words(classification, answer_lookup_dict, largest_num_of_words_in_answer)

while True:
    proceed = raw_input(
        "Do you want to ask another question (q), enter new paragraph (p) or exit (e): ")
    if proceed not in ['p', 'q', 'e']:
        print "Invalid input"
    else:
        break
        LEFT JOIN tweet_properties ON id = tweet_id
        WHERE TO_TIMESTAMP(%s) <= time AND time < TO_TIMESTAMP(%s)
        ORDER BY time ASC
    """, (last_runtime, current_time))

last_runtime = current_time

if current_time - last_traintime >= 30:
    pericog.update()
    last_traintime = current_time

ids = []
X = []
for id, timestamp, geolocation, exact, user, text in db_tweets_cursor.fetchall():
    if not get_words(text):
        continue
    ids.append(id)
    X.append(text)
    db_tweets_cursor.execute(
        """
        INSERT INTO tweet_votes (tweet_id, user_ip, disaster)
        VALUES (%s, '0.0.0.0', False)
        """, (id, ))

if X:
    Y = pericog.predict(X)
def get_bigram_word(raw_query, hot_words, cat):
    words = get_words(raw_query)
    words = [w for w in words if w in hot_words[cat]]
    words.sort()
    bigram = get_pair(words)
    return bigram
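# get_pair is not defined in these snippets. A plausible sketch (assumption) is
# that it turns the sorted, filtered word list into unordered word pairs:
from itertools import combinations

def get_pair_sketch(words):
    """Return every two-word combination of an already-sorted word list."""
    return [' '.join(pair) for pair in combinations(words, 2)]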
def test_get_words(self):
    words = util.get_words('test sentence 1.', False)
    self.assertEqual(len(words), 3)
    self.assertEqual(words[0], 'test')
    self.assertEqual(words[1], 'sentence')
    self.assertEqual(words[2], '1')
def test_get_words_special_chars(self):
    words = util.get_words('test sentence-1.', False)
    self.assertEqual(len(words), 2)
    self.assertEqual(words[0], 'test')
    self.assertEqual(words[1], 'sentence-1')
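# The two tests above pin down the observable behaviour of util.get_words:
# lowercase whitespace tokens, surrounding punctuation stripped, hyphenated
# tokens kept whole, and a dedupe flag (see get_title_words above). A sketch
# consistent with those tests, not the project's actual implementation:
import string

def get_words_sketch(text, dedupe=True):
    words = [w.strip(string.punctuation) for w in text.lower().split()]
    words = [w for w in words if w]
    if dedupe:
        return set(words)
    return words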
def searchable_words(self):
    return get_words(self.search_text())