def tag_files():
    """Tag every file in ``untagged/`` and write the tagged version out.

    For each file the pipeline tags, in order: time expressions, named
    entities, sentences, and paragraphs.

    Fix: the original loaded an nltk punkt tokenizer into a local that was
    never used — dead pickle load removed.
    """
    path = "untagged/"
    files = FileReadingFuncts.get_files(path)
    for file in files:
        data = FileReadingFuncts.read_file(path, file)

        # tags the times
        entities_to_tag = TimeFunct.get_end_time_examples(data)
        data = tag_entities(data, entities_to_tag)

        # tags the named entities
        data = tag_named_entities(data)

        # tags the sentences
        sentences = SentenceTaggingFuncts.get_sentences(data)
        entities_to_tag = get_in_dict(sentences, "sentence")
        data = tag_entities(data, entities_to_tag)

        # tags the paragraphs
        paragraphs = SentenceTaggingFuncts.get_paragraphs(data)
        entities_to_tag = get_in_dict(paragraphs, "paragraph")
        data = tag_entities(data, entities_to_tag)

        FileWritingFuncts.writeTaggedFile(data, file)
def createsFiles(tag_names):
    """Collect every tagged match for each tag name across the training
    files, then write the de-duplicated matches to one file per tag under
    ``tagFiles/``."""
    path = "training/"
    matches = getblank2d(len(tag_names))

    # accumulate matches for every tag across all training files
    for file in FileReadingFuncts.get_files(path):
        data = FileReadingFuncts.read_file(path, file)
        for idx, tag_name in enumerate(tag_names):
            found = TagExtractingFuncts.find_tag_matches(tag_name, data)
            matches[idx] = matches[idx] + found

    # write each tag's unique matches to its own output file
    for idx, tag_name in enumerate(tag_names):
        outputFile = "tagFiles/{}.txt".format(tag_name)
        writeFile(set(matches[idx]), outputFile)
def writeWikiFile(tag_name):
    """Build a wiki word file for *tag_name*.

    Reads the first 20 example entities from ``tagFiles/<tag_name>.txt``,
    fetches the Wikipedia words associated with each entity, and writes
    the combined word list to ``wiki/<tag_name>1.txt``.

    Fix: ``words = words + ...`` rebuilt the whole list on every
    iteration (accidental O(n^2)); ``extend`` appends in place.
    """
    # reads in all the example tags from the tag file (first 20 only)
    entities = FileReadingFuncts.read_all_lines(
        "tagFiles/{}.txt".format(tag_name))[:20]

    # gets the words related to these from wikipedia
    words = []
    for entity in entities:
        words.extend(WikipediaFuncts.get_words(entity))

    # writes these words to the output file
    output_file = "wiki/{}1.txt".format(tag_name)
    writeFile(words, output_file)
def get_all_tags(path, file, tag_name):
    """Return all matches for *tag_name* in *file*, with the surrounding
    tags stripped and any ``.`` characters removed."""
    # read the file and find every tagged match in it
    data = FileReadingFuncts.read_file(path, file)
    tagged = TagExtractingFuncts.find_tag_matches(tag_name, data)

    # strip the '.' characters, then strip the tags themselves
    return [
        TagExtractingFuncts.get_rid_of_tags(match.replace(".", ""))
        for match in tagged
    ]
def build_vocab(tag_name, tag_dict):
    """Load the wiki word list for *tag_name* and register its example
    entities in *tag_dict*.

    Each example string becomes a key mapping to *tag_name*; colliding
    keys are disambiguated by appending ``"z"`` characters.

    Returns:
        (words, tag_dict): the words read from ``wiki/<tag_name>.txt``
        and the (mutated) tag dictionary.

    Fixes: removed a leftover debug ``print("entered")``; the original
    appended only a single ``"z"`` on collision, so a second collision
    silently overwrote an existing entry — now loops until unique.
    """
    examples = get_examples(tag_name)
    word_file = "wiki/{}.txt".format(tag_name)
    words = FileReadingFuncts.read_wiki(word_file)
    for example in examples:
        key = example
        # keep appending "z" until the key no longer collides
        while key in tag_dict:
            key = key + "z"
        tag_dict[key] = tag_name
    return words, tag_dict
def read_in_tags():
    """Compare my tagger's output against the reference tagging.

    For every file and every tag name, reads the tags from the reference
    directory (``test_tagged/``) and from my output (``my_tagged/``) and
    feeds both into a per-tag score accumulator.

    Returns:
        dict mapping tag name -> score object with the tags added.
    """
    tag_names = [
        "sentence",
        "paragraph",
        "speaker",
        "location",
        "etime",
        "stime",
    ]
    # one score accumulator per tag name
    tag_scores = TagScoreCalculator.create_tag_dict(tag_names)

    # the file names are identical in both directories, so listing
    # one of them is enough
    for tagFile in FileReadingFuncts.get_files("test_tagged/"):
        for name in tag_names:
            reference_tags = get_all_tags("test_tagged/", tagFile, name)
            mine = get_all_tags("my_tagged/", tagFile, name)
            # fold both sides into the score for this tag
            tag_scores[name].add_tags(reference_tags, mine)

    return tag_scores
def get_examples(tag_name):
    """Return every example line stored in ``tagFiles/<tag_name>.txt``.

    NOTE(review): a second ``get_examples`` with identical behavior is
    defined later in this module and shadows this one at import time —
    consider deleting one of the two.
    """
    inputFile = "tagFiles/" + tag_name + ".txt"
    examples = FileReadingFuncts.read_all_lines(inputFile)
    return examples
def get_examples(tag_name):
    """Read and return all example lines from the tag file for *tag_name*."""
    return FileReadingFuncts.read_all_lines("tagFiles/{}.txt".format(tag_name))