def find_missing_word(model, line):
    words = tokenize_words(line)
    if len(words) <= 2:
        return 1
    scores = list(p for p, _, _ in model.full_scores(line))
    # missing word cannot be the first or last, per rules
    idx = np.argmin(scores[1:-2]) + 1
    return idx
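# Hedged usage sketch, not part of the original source. It assumes `model` is
# a kenlm-style language model whose full_scores() yields one
# (log10 probability, ngram length, oov flag) tuple per word plus one for the
# end-of-sentence token; the model path below is a placeholder.
def _demo_find_missing_word_lm():
    import kenlm  # assumption: kenlm is the LM package in use
    model = kenlm.Model('lm/train.arpa')  # hypothetical LM file
    # The returned index points at the interior word with the lowest
    # per-word log-probability, i.e. the most suspicious gap location.
    return find_missing_word(model, 'the cat sat on mat')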
def missing_word_index(sentence, ref_sentence, lo=0, hi=None):
    '''
    Use bisection search to find the location of the missing word in
    @sentence with respect to @ref_sentence. Return the index of the
    missing word in @ref_sentence.
    '''
    words = tokenize_words(sentence)
    ref_words = tokenize_words(ref_sentence)
    assert len(words) == len(ref_words) - 1
    lo = lo if lo is not None else 0
    hi = hi if hi is not None else len(ref_words)
    i = (lo + hi) / 2
    while lo + 1 < hi:
        if words[i] == ref_words[i]:
            lo = i
        else:
            hi = i
        i = (lo + hi) / 2
    if i < len(words) and words[i] == ref_words[i]:
        i += 1
    assert words[i - 1] == ref_words[i - 1]
    # Check the boundary case first so words[i] is never indexed when i has
    # been pushed to len(words).
    assert i == len(words) or words[i] != ref_words[i]
    return i
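# Hedged usage sketch, not part of the original source; it assumes
# tokenize_words splits on whitespace so the two sentences align word for word.
def _demo_missing_word_index():
    ref = "the black cat sat"   # full reference sentence
    sample = "the cat sat"      # same sentence with one word removed
    # Bisection narrows to the first position where the sentences disagree:
    # "black" sits at index 1 of the reference.
    assert missing_word_index(sample, ref) == 1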
def letter_frequencies(istream, n=1):
    counts = defaultdict(int)
    nwords = 0
    for i, line in enumerate(istream):
        words = tokenize_words(line)
        nwords += len(words)
        for word in words:
            #for letter in set(window(word,n)):
            for letter in set(word):
                counts[letter] += 1
        if i % PROGRESS == 0:
            print >> sys.stderr, i
    # Normalize counts to total number of words
    for k in counts.keys():
        counts[k] /= float(nwords)
    return counts
def make_mishnaic_training_context():
    training = []
    mishnah_indexes = [library.get_index(ind) for ind in library.get_indexes_in_category("Mishnah")]
    mishnah_indexes += [library.get_index(ind) for ind in library.get_indexes_in_category("Torah")]
    for ind in mishnah_indexes:
        mishna_segs = ind.all_section_refs()
        for seg in mishna_segs:
            first_sec_str = hebrew.strip_cantillation(seg.text('he').as_string(), strip_vowels=True)
            training += [{'language': 'mishnaic', 'phrase': util.tokenize_words(p)} for p in first_sec_str.split(u'. ')]
    total_words = 0
    total_phrases = len(training)
    for p in training:
        total_words += len(p['phrase'])
    print 'NUM PHRASES: {} AVG WORDS PER PHRASE: {}'.format(total_phrases, total_words / total_phrases)
    return training
def find_missing_word(model, vocab, line, n):
    '''
    Return the location and word that maximizes the sentence probability
    if that word is inserted at that location
    '''
    words = tokenize_words(line)
    if len(words) <= 2:
        return max_prob_word_at(words, 1, vocab)
    # missing word cannot be the first or last
    top_n = TopK(n)
    for i in range(1, len(words) - 1):
        #print >>sys.stderr, "Considering words inserted at %d:" % i
        top_n_i = max_prob_word_at(words, i, vocab, n)
        #print_top_n(top_n_i)
        top_n.update(top_n_i)
        #print >>sys.stderr, "Current best:"
        #print_top_n(top_n)
    return top_n
def find_missing_word(model, vocab, line):
    '''
    Return the location and word that maximizes the sentence probability
    if that word is inserted at that location
    '''
    words = tokenize_words(line)
    if len(words) <= 2:
        best, _ = max_prob_word_at(words, 1, vocab)
        return 1, best
    # missing word cannot be the first or last
    max_p = -float('inf')
    best = None
    for i in xrange(1, len(words) - 1):
        i_best, i_max_p = max_prob_word_at(words, i, vocab)
        if i_max_p > max_p:
            max_p = i_max_p
            best = (i, i_best)
    return best
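# Hedged usage sketch, not part of the original source. `model` and `vocab`
# stand for whatever language model and candidate vocabulary
# max_prob_word_at expects; no concrete values are assumed here.
def _demo_find_missing_word(model, vocab):
    location, word = find_missing_word(model, vocab, 'the cat sat the mat')
    # `location` is an interior index of the tokenized sentence and `word` is
    # the candidate whose insertion there maximizes sentence probability.
    return location, word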
def find_missing_word(model, vocab, line, n):
    '''
    Return the location and word that maximizes the sentence probability
    if that word is inserted at that location
    '''
    words = tokenize_words(line)
    if len(words) <= 2:
        return max_prob_word_at(words, 1, vocab)
    # missing word cannot be the first or last
    top_n = TopK(n)
    for i in xrange(1, len(words) - 1):
        #print >>sys.stderr, "Considering words inserted at %d:" % i
        top_n_i = max_prob_word_at(words, i, vocab, n)
        #print_top_n(top_n_i)
        top_n.update(top_n_i)
        #print >>sys.stderr, "Current best:"
        #print_top_n(top_n)
    return top_n
def preprocess(self):
    text = open("../data/data.txt").read()
    self.processed_inputs = tokenize_words(text)
    self.chars = sorted(list(set(self.processed_inputs)))
    char_to_num = dict((c, i) for i, c in enumerate(self.chars))
    input_len = len(self.processed_inputs)
    self.vocab_len = len(self.chars)
    print("Total number of characters:", input_len)
    print("Total vocab:", self.vocab_len)
    for i in range(0, input_len - self.seq_length, 1):
        in_seq = self.processed_inputs[i:i + self.seq_length]
        out_seq = self.processed_inputs[i + self.seq_length]
        self.x_data.append([char_to_num[char] for char in in_seq])
        self.y_data.append(char_to_num[out_seq])
    n_patterns = len(self.x_data)
    self.X = np.reshape(self.x_data, (n_patterns, self.seq_length, 1))
    self.X = self.X / float(self.vocab_len)
    self.y = tf.keras.utils.to_categorical(self.y_data)
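# Hedged sketch, not part of the original source: one possible companion
# method whose input/output shapes match the X and y produced by preprocess()
# above (sequences of length seq_length with one feature, one-hot targets over
# vocab_len characters). The layer size and optimizer are illustrative
# assumptions, not the original project's choices.
def build_model(self):
    model = tf.keras.Sequential([
        tf.keras.layers.LSTM(256, input_shape=(self.seq_length, 1)),
        tf.keras.layers.Dense(self.vocab_len, activation='softmax'),
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model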
def remove_random_word(line):
    '''
    Remove a random word from line, attempting to remove contractions whole
    and not punctuation / numbers.
    '''
    words = tokenize_words(line)
    choices = removable_words(words)
    if len(choices) == 0:
        return line
    selected = random.choice(choices)
    if words[selected].startswith("'"):
        # second part of possessive/contraction
        words.pop(selected)
        words.pop(selected - 1)
    elif selected + 1 < len(words) and words[selected + 1].startswith("'"):
        # first part of possessive/contraction
        words.pop(selected + 1)
        words.pop(selected)
    else:
        # regular word
        words.pop(selected)
    return ' '.join(words)
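# Hedged usage sketch, not part of the original source. Assumes tokenize_words
# splits possessives into two tokens (e.g. "dog" + "'s"), that removable_words
# excludes punctuation and numbers as the docstring says, and that at least
# one token in the example line is removable.
def _demo_remove_random_word():
    line = "the dog 's bone is gone"
    shorter = remove_random_word(line)
    # At least one removable word is gone; if either half of "dog 's" was
    # chosen, both halves of the possessive are dropped together.
    assert len(shorter.split()) < len(line.split())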
def make_mishnaic_training():
    training = []
    num_mishnah_per_mesechta = 30000  # effectively all mishnah
    mishnah_indexes = [library.get_index(ind) for ind in library.get_indexes_in_category("Mishnah")]
    mishnah_indexes += [library.get_index(ind) for ind in library.get_indexes_in_category("Torah")]
    mish_set = set()
    num_removed = 0
    for ind in mishnah_indexes:
        mishna_segs = ind.all_section_refs()
        if len(mishna_segs) >= num_mishnah_per_mesechta:
            mishna_segs = mishna_segs[:num_mishnah_per_mesechta]
        for seg in mishna_segs:
            first_sec_str = hebrew.strip_cantillation(seg.text('he').as_string(), strip_vowels=True)
            word_list = util.tokenize_words(first_sec_str)
            for word in word_list:
                if random.random() > 0.45 and word in mish_set:
                    num_removed += 1
                    continue
                training.append({'word': word, 'tag': 'mishnaic'})
                mish_set.add(word)
    print "Num Mishna removed: {}".format(num_removed)
    return training
#!/usr/bin/env python
'''Replace words with their word2vec class'''
import sys, argparse
from util import tokenize_words, load_vocab, UNKNOWN


def opts():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('classes', type=argparse.FileType('r'),
                        help='File with word2vec classes')
    return parser


if __name__ == "__main__":
    args = opts().parse_args()
    print >>sys.stderr, "Loading word2vec classes"
    vocab = load_vocab(args.classes)
    for i, line in enumerate(sys.stdin):
        words = [vocab.get(w, UNKNOWN) for w in tokenize_words(line)]
        print ' '.join(map(str, words))
        if i % 100000 == 0:
            print >>sys.stderr, i
#!/usr/bin/env python
'''
Insert blanks like madlib in place of removed words
'''
import sys, argparse
from itertools import izip
from util import tokenize_words


def opts():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('sample', type=argparse.FileType('r'),
                        help='Sentences with one missing word')
    parser.add_argument('removed', type=argparse.FileType('r'),
                        help='File with predicted indices of missing words')
    return parser


if __name__ == "__main__":
    args = opts().parse_args()
    for sentence, i_missing in izip(args.sample, args.removed):
        words = tokenize_words(sentence)
        i_missing = int(i_missing)
        words.insert(i_missing, ' ')
        print ' '.join(words)
    parser.add_argument('classifier', type=argparse.FileType('r'),
                        help='Input pickle file with classifier to re-use')
    parser.add_argument('predictions', type=argparse.FileType('r'),
                        help='Input file with predicted words and locations')
    return parser


if __name__ == "__main__":
    args = opts().parse_args()
    print >>sys.stderr, "Loading test data"
    X = load(args.data)
    X = np.asarray(X, dtype=np.float32)
    X = np.nan_to_num(X)
    print >>sys.stderr, "Loading classifier"
    clf = load_classifier(args.classifier)
    print >>sys.stderr, "Predicting decisions"
    d = clf.predict(X)
    print >>sys.stderr, "Performing decisions on stdin"
    for di, line, pred in izip(d, sys.stdin, args.predictions):
        pred = Prediction.parse(pred)
        words = tokenize_words(line)
        if di == 0:
            # do nothing
            pass
        elif di == 1:
            # insert space
            words.insert(pred.location, ' ')
        else:
            # insert word
            words.insert(pred.location, pred.word)
        print ' '.join(words)
def num_tokens(line):
    words = tokenize_words(line)
    return len(words)
                        help='Gold-standard POS-tagged sentences')
    parser.add_argument('errors', type=argparse.FileType('w'),
                        help='Pickle file with errors broken down by POS tag')
    return parser


if __name__ == "__main__":
    args = opts().parse_args()
    counts = defaultdict(lambda: defaultdict(int))
    nerrors = 0
    nsentences = 0
    for sentence, ref_sentence, i_removed in izip(args.sample, args.gold, args.removed):
        try:
            i_removed = int(i_removed)
            words = tokenize_words(sentence)
            ref_words = tokenize_words(ref_sentence)
            assert len(words) == len(ref_words) - 1
            pos = map(pos_tag, words)
            ref_pos = map(pos_tag, ref_words)
            has_error = False
            for i in xrange(i_removed):
                counts[pos[i]][ref_pos[i]] += 1
                has_error |= (pos[i] != ref_pos[i])
            for i in xrange(i_removed, len(words)):
                counts[pos[i]][ref_pos[i + 1]] += 1
                has_error |= (pos[i] != ref_pos[i + 1])
            if has_error:
                nerrors += 1
                sys.stdout.write(ref_sentence)
def match_cal_segments(mesechta):
    def merge_cal_word_objs(s, e, word_obj_list):
        obj_list = word_obj_list[s:e]
        m_word = u" ".join([o["word"] for o in obj_list])
        m_head_word = u" ".join([o["head_word"] for o in obj_list])
        m_pos_list = [o["POS"] for o in obj_list]
        m_pos = max(set(m_pos_list), key=m_pos_list.count)
        new_obj = obj_list[0].copy()
        new_obj["word"] = m_word
        new_obj["head_word"] = m_head_word
        new_obj["POS"] = m_pos
        return [new_obj]  # returns a single element array which will replace a range s:e in the original array

    cal_lines = json.load(open("data/1_cal_input/cal_lines_{}.json".format(mesechta), "r"), encoding="utf8")
    #cal_pos_hashtable = json.load(open("cal_pos_hashtable.json","r"),encoding='utf8')
    dafs = cal_lines["dafs"]
    lines_by_daf = cal_lines["lines"]
    super_base_ref = Ref(mesechta)
    subrefs = super_base_ref.all_subrefs()
    ical = 0
    num_sef_words = 0
    num_cal_words = 0
    num_words_matched = 0
    for curr_sef_ref in subrefs:
        if curr_sef_ref.is_empty():
            continue
        if ical >= len(dafs):
            break
        daf = dafs[ical]
        print "-----{} DAF {} ({}/{})-----".format(mesechta, daf, ical, len(dafs))
        base_tc = TextChunk(curr_sef_ref, "he", "William Davidson Edition - Aramaic")
        bas_word_list = []  # re.split(r"\s+"," ".join(base_text.text))
        for segment in base_tc.text:
            bas_word_list += util.tokenize_words(segment)
        temp_out = [{"word": w, "class": "unknown"} for w in bas_word_list]
        lines = [[word_obj["word"] for word_obj in temp_line] for temp_line in lines_by_daf[ical]]
        word_obj_list = [word_obj for temp_line in lines_by_daf[ical] for word_obj in temp_line]
        lines_by_str = [u' '.join(line_array) for line_array in lines]
        curr_cal_ref = Ref("{} {}".format(mesechta, daf))
        out = []
        word_for_word_se = []
        cal_words = []
        missed_words = []
        global_offset = 0
        if curr_sef_ref == curr_cal_ref:
            matched = dibur_hamatchil_matcher.match_text(
                bas_word_list, lines_by_str, verbose=True, word_threshold=0.27,
                char_threshold=0.6, with_abbrev_matches=True, with_num_abbrevs=False)
            start_end_map = matched["matches"]
            abbrev_matches = matched["abbrevs"]
            abbrev_ranges = [[am.rashiRange for am in am_list] for am_list in abbrev_matches]
            print u' --- '.join([unicode(am) for am_list in abbrev_matches for am in am_list])
            abbrev_count = 0
            for ar in abbrev_ranges:
                abbrev_count += len(ar)
            #if abbrev_count > 0:
            #    print "GRESATLJL THNA DZEOR", abbrev_ranges
            for iline, se in enumerate(start_end_map):
                curr_cal_line = lines[iline]
                # if there is an expanded abbrev, concat those words into one element
                if len(abbrev_ranges[iline]) > 0:
                    offset = 0  # account for the fact that you're losing elements in the array as you merge them
                    abbrev_ranges[iline].sort(key=lambda x: x[0])
                    for ar in abbrev_ranges[iline]:
                        if ar[1] - ar[0] <= 0:
                            continue  # TODO there's an issue with the abbrev func, but i'm too lazy to fix now. sometimes they're zero length
                        # redefine ar by how many actual words are in the range, not just how many elements
                        start_ar = ar[0]
                        i_abbrev = start_ar
                        num_words = 0
                        while i_abbrev < len(curr_cal_line):
                            temp_w = curr_cal_line[i_abbrev]
                            num_words += len(re.split(ur'\s+', temp_w))
                            if num_words >= (ar[1] - ar[0] + 1):
                                break
                            i_abbrev += 1
                        end_ar = i_abbrev
                        ar = (start_ar, end_ar)
                        if len(curr_cal_line[ar[0] - offset:ar[1] + 1 - offset]) != len(
                                word_obj_list[ar[0] - offset + len(cal_words):ar[1] + 1 - offset + len(cal_words)]):
                            # something's wrong. not sure what, but best to ignore this
                            continue
                        print u"ABBREV RANGE {} --- OFFSET {}".format(ar, offset)
                        print u"CURR CAL LINE BEFORE {}".format(u','.join(curr_cal_line[ar[0] - offset:ar[1] + 1 - offset]))
                        curr_cal_line[ar[0] - offset:ar[1] + 1 - offset] = [
                            u' '.join(curr_cal_line[ar[0] - offset:ar[1] + 1 - offset])
                        ]
                        print u"CURR CAL LINE AFTER {}".format(curr_cal_line[ar[0] - offset])
                        print u"WORD OBJ LIST BEFORE {}".format(u','.join([
                            u'({})'.format(obj['word']) for obj in merge_cal_word_objs(
                                ar[0] - offset + len(cal_words), ar[1] + 1 - offset + len(cal_words), word_obj_list)
                        ]))
                        word_obj_list[ar[0] - offset + len(cal_words):ar[1] + 1 - offset + len(cal_words)] = \
                            merge_cal_word_objs(ar[0] - offset + len(cal_words),
                                                ar[1] + 1 - offset + len(cal_words), word_obj_list)
                        print u"WORD OBJ LIST AFTER {}".format(word_obj_list[ar[0] - offset + len(cal_words)]['word'])
                        offset += ar[1] - ar[0]
                    global_offset += offset
                cal_words += curr_cal_line
                if se[0] == -1:
                    word_for_word_se += [(-1, -1) for i in range(len(curr_cal_line))]
                    continue
                # matched_cal_objs_indexes = language_tools.match_segments_without_order(lines[iline],bas_word_list[se[0]:se[1]+1],2.0)
                curr_bas_line = bas_word_list[se[0]:se[1] + 1]
                #print u'base line', u' '.join(curr_bas_line)
                matched_obj_words_base = dibur_hamatchil_matcher.match_text(
                    curr_bas_line, curr_cal_line, char_threshold=0.35, verbose=False, with_num_abbrevs=False)
                matched_words_base = matched_obj_words_base["matches"]
                word_for_word_se += [(tse[0] + se[0], tse[1] + se[0]) if tse[0] != -1 else tse
                                     for tse in matched_words_base]
            matched_word_for_word_obj = dibur_hamatchil_matcher.match_text(
                bas_word_list, cal_words, char_threshold=0.35, prev_matched_results=word_for_word_se,
                boundaryFlexibility=2, with_num_abbrevs=False)
            matched_word_for_word = matched_word_for_word_obj["matches"]
            cal_len = len(matched_word_for_word)
            bad_word_offset = 0
            for ical_word, temp_se in enumerate(matched_word_for_word):
                if temp_se[0] == -1:
                    missed_words.append({"word": word_obj_list[ical_word]["word"], "index": ical_word})
                    continue
                # dictionary juggling...
                for i in xrange(temp_se[0], temp_se[1] + 1):
                    # in case a cal_words and word_obj_list aren't the same length bc a word got split up
                    """
                    if cal_words[ical_word] != word_obj_list[ical_word-bad_word_offset]["word"]:
                        if ical_word+1 < len(cal_words) and cal_words[ical_word+1] != word_obj_list[ical_word-bad_word_offset+1]["word"]:
                            bad_word_offset += 1
                        continue
                    """
                    cal_word_obj = word_obj_list[ical_word].copy()
                    cal_word_obj["cal_word"] = cal_word_obj["word"]
                    temp_sef_word = temp_out[i]["word"]
                    temp_out[i] = cal_word_obj
                    temp_out[i]["class"] = "talmud"
                    temp_out[i]["word"] = temp_sef_word
            print u"\n-----\nFOUND {}/{} ({}%)".format(
                cal_len - len(missed_words), cal_len,
                (1 - round(1.0 * len(missed_words) / cal_len, 4)) * 100)
            #print u"MISSED: {}".format(u" ,".join([u"{}:{}".format(wo["word"], wo["index"]) for wo in missed_words]))
            ical += 1
            num_cal_words += cal_len
            num_words_matched += (cal_len - len(missed_words))
        """
        # tag 1 pos words if still untagged
        for iwo, word_obj in enumerate(temp_out):
            word = word_obj["word"]
            if word in cal_pos_hashtable:
                if len(cal_pos_hashtable[word]) == 1:
                    temp_out[iwo] = {"word": word, "cal_word": word, "class": "talmud", "POS": cal_pos_hashtable[word][0]}
        """
        num_sef_words += len(temp_out)
        out += temp_out
        sef_daf = curr_sef_ref.__str__().replace("{} ".format(mesechta), "").encode('utf8')
        doc = {"words": out, "missed_words": missed_words}
        util.make_folder_if_need_be("data/2_matched_sefaria/json/{}".format(mesechta))
        fp = codecs.open("data/2_matched_sefaria/json/{}/{}.json".format(mesechta, sef_daf), "w", encoding='utf-8')
        json.dump(doc, fp, indent=4, encoding='utf-8', ensure_ascii=False)
        fp.close()
    return num_sef_words, num_cal_words, num_words_matched
def opts():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('sentences', type=argparse.FileType('r'),
                        help='File with sentences with <unk>')
    parser.add_argument('pos', type=argparse.FileType('r'),
                        help='File with POS tags')
    return parser


if __name__ == "__main__":
    args = opts().parse_args()
    for i, (sentence, pos_tags) in enumerate(izip(args.sentences, args.pos)):
        words = tokenize_words(sentence)
        pos = tokenize_words(pos_tags)
        if len(words) != len(pos):
            print >>sys.stderr, 'Sentence has %d words, but POS has %d' \
                % (len(words), len(pos))
            print >> sys.stderr, words
            print >> sys.stderr, pos
            print ' '.join(words)
            continue
        for j, word in enumerate(words):
            if word == '<unknown>':
                words[j] = pos[j]
        print ' '.join(words)
                        help='Pickle file with errors broken down by POS tag')
    return parser


if __name__ == "__main__":
    args = opts().parse_args()
    counts = defaultdict(lambda: defaultdict(int))
    nerrors = 0
    nsentences = 0
    for sentence, ref_sentence, i_removed in zip(args.sample, args.gold, args.removed):
        try:
            i_removed = int(i_removed)
            words = tokenize_words(sentence)
            ref_words = tokenize_words(ref_sentence)
            assert len(words) == len(ref_words) - 1
            pos = list(map(pos_tag, words))
            ref_pos = list(map(pos_tag, ref_words))
            has_error = False
            for i in range(i_removed):
                counts[pos[i]][ref_pos[i]] += 1
                has_error |= (pos[i] != ref_pos[i])
            for i in range(i_removed, len(words)):
                counts[pos[i]][ref_pos[i + 1]] += 1
                has_error |= (pos[i] != ref_pos[i + 1])
            if has_error:
                nerrors += 1
                sys.stdout.write(ref_sentence)
import sys, argparse
from itertools import izip
from util import tokenize_words, UNKNOWN


def load_mapping(istream):
    mapping = {}
    for line in istream:
        from_word, to_word = line.rstrip().split()
        mapping[from_word] = to_word
    return mapping


def opts():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('sentences', type=argparse.FileType('r'),
                        help='File with sentences')
    parser.add_argument('mapping', type=argparse.FileType('r'),
                        help='File with word map')
    return parser


if __name__ == "__main__":
    args = opts().parse_args()
    mapping = load_mapping(args.mapping)
    for i, sentence in enumerate(args.sentences):
        words = [mapping.get(w, UNKNOWN) for w in tokenize_words(sentence)]
        print ' '.join(words)
        if i % 500000 == 0:
            print >>sys.stderr, i
#!/usr/bin/env python
'''Replace words with their word2vec class'''
import sys, argparse
from util import tokenize_words, load_vocab, UNKNOWN


def opts():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('classes', type=argparse.FileType('r'),
                        help='File with word2vec classes')
    return parser


if __name__ == "__main__":
    args = opts().parse_args()
    print("Loading word2vec classes", file=sys.stderr)
    vocab = load_vocab(args.classes)
    for i, line in enumerate(sys.stdin):
        words = [vocab.get(w, UNKNOWN) for w in tokenize_words(line)]
        print(' '.join(map(str, words)))
        if i % 100000 == 0:
            print(i, file=sys.stderr)