def search_tokens():
    """Interactively search Wikipedia training sequences for a token.

    Reads the number of training sequences to scan from sys.argv[2], then
    reads queries from stdin; for each query, prints every sequence whose
    tokenization contains the matching token. A query with a leading space
    presumably marks a space-preceded token (the space is stripped and the
    Token flag set) — confirm against the Token class.
    """
    n = int(sys.argv[2])
    tokenizer = Tokenizer()
    for query in interactive_sequence_generator():
        space_preceded = query.startswith(' ')
        query_token = Token(query[1:] if space_preceded else query, space_preceded)
        for sequence in Wikipedia.training_sequences(n):
            if query_token in tokenizer.tokenize(sequence):
                print(sequence)
predicted = reinsert_punctuation(segmented, sequence) #print(predicted) predicted = self.postprocessor.correct(predicted) return predicted if __name__ == "__main__": if len(sys.argv) > 1: benchmark_name = sys.argv[1] subset = SUBSETS[sys.argv[2]] benchmark = Benchmark(benchmark_name, subset) sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT) writer = PredictionsFileWriter(benchmark.get_results_directory() + "wordsegment.txt") else: sequences = interactive_sequence_generator() writer = None segmenter = WordSegment() for s_i, sequence in enumerate(sequences): start_time = timestamp() try: predicted = segmenter.correct(sequence) except RecursionError: predicted = sequence runtime = time_diff(start_time) print(predicted) if writer is not None: writer.add(predicted, runtime)
"""Roachdale is a town in Franklin and Jackson townships, Putnam County, in the U.S. state of Indiana. The population was 926 at the 2010 census.""", """He works for Prof. Dr. Prename Lastname.""", """She was entitled Dr. Prename Lastname.""", """He is born in Washington D.C. in the U.S.A. and lived there.""", """I did three things, e.g. one thing.""", """I did more, e. g. another thing.""", """Read sentences, i.e. this sentence.""", """She met Mr. Lastname and Mrs. Lastname at their house.""", """The vote elected Mr. Lastname as president.""", """The vote elected Mrs. Lastname as president.""", """Prename Lastname (ca. 1950-2000) lived.""" ] if __name__ == "__main__": if "i" in sys.argv: paragraphs = interactive_sequence_generator() elif "t" in sys.argv: paragraphs = [] wiki_paragraphs = read_sequences(paths.WIKI_TRAINING_PARAGRAPHS) for _ in range(1000): paragraphs.append(next(wiki_paragraphs)) else: paragraphs = test_sequences if "spacy" in sys.argv: splitter = SpacySentenceSplitter() elif "wiki" in sys.argv: print("loading wiki punkt tokenizer...") splitter = WikiPunktTokenizer() else: splitter = NLTKSentenceSplitter()
def query_unigrams():
    """Read tokens from stdin and print each one's unigram lookup result."""
    unigrams = UnigramHolder()
    for token in interactive_sequence_generator():
        print(unigrams.get(token))
import project
from src.postprocessing.bigram import BigramPostprocessor
from src.interactive.sequence_generator import interactive_sequence_generator


if __name__ == "__main__":
    # Report the loaded model size, then correct sequences interactively.
    postprocessor = BigramPostprocessor()
    print("%i unigrams" % len(postprocessor.unigrams))
    print("%i bigrams" % len(postprocessor.bigrams))
    for sequence in interactive_sequence_generator():
        print(postprocessor.correct(sequence))
import sys

from project import src
from src.helper.pickle import load_object
from src.settings import paths
from src.helper.data_structures import sort_dict_by_value
from src.interactive.sequence_generator import interactive_sequence_generator


if __name__ == "__main__":
    # Print either the top-N most frequent characters (if an argv count is
    # given) or, interactively, the rank and frequency of queried characters.
    frequencies = load_object(paths.CHARACTER_FREQUENCY_DICT)
    # sort_dict_by_value yields (key, value) = (char, frequency) pairs; the
    # ranks lookup below depends on this ordering.
    sorted_frequencies = sort_dict_by_value(frequencies)
    ranks = {char: i for i, (char, frequency) in enumerate(sorted_frequencies)}
    if len(sys.argv) > 1:
        print_top_n = int(sys.argv[1])
        # BUG FIX: the original unpacked the pairs as (frequency, char),
        # inconsistent with the ranks comprehension above, so the printed
        # char/frequency columns were swapped.
        for i, (char, frequency) in enumerate(sorted_frequencies[:print_top_n]):
            print(i, char, frequency)
    else:
        # NOTE(review): unknown characters raise KeyError here — presumably
        # acceptable for an interactive debugging tool; confirm.
        for char in interactive_sequence_generator():
            freq = frequencies[char]
            rank = ranks[char]
            print("rank %i (frequency %i)" % (rank, freq))