Пример #1
0
def tag_files():
    """Tag every file found in ``untagged/`` and write out the result.

    Each file's text goes through four tagging passes, in this order:
    end-time examples, named entities, sentences and paragraphs.  Every
    pass operates on the output of the previous one, so later passes see
    the tags inserted earlier.  The fully tagged text is then handed to
    ``FileWritingFuncts.writeTaggedFile`` together with the file name.

    Returns:
        None.
    """
    path = "untagged/"
    for file_name in FileReadingFuncts.get_files(path):
        data = FileReadingFuncts.read_file(path, file_name)

        # tags the times
        entities_to_tag = TimeFunct.get_end_time_examples(data)
        data = tag_entities(data, entities_to_tag)

        # tags the named entities
        data = tag_named_entities(data)

        # tags the sentences (extracted from the already time/entity-tagged text)
        sentences = SentenceTaggingFuncts.get_sentences(data)
        data = tag_entities(data, get_in_dict(sentences, "sentence"))

        # tags the paragraphs (extracted after the sentence tags were added)
        paragraphs = SentenceTaggingFuncts.get_paragraphs(data)
        data = tag_entities(data, get_in_dict(paragraphs, "paragraph"))

        FileWritingFuncts.writeTaggedFile(data, file_name)
Пример #2
0
def createsFiles(tag_names):
    """Collect the matches for each tag across ``training/`` and save them.

    For every file in ``training/`` and every name in ``tag_names`` the
    matches reported by ``TagExtractingFuncts.find_tag_matches`` are
    accumulated per tag.  Afterwards, the de-duplicated matches of each
    tag are written to ``tagFiles/<tag>.txt`` through ``writeFile``.

    Args:
        tag_names: sequence of tag names to look for.
    """
    source_dir = "training/"
    training_files = FileReadingFuncts.get_files(source_dir)

    # one accumulator per tag, at the same position as the tag in tag_names
    collected = getblank2d(len(tag_names))

    # gather the matches file by file
    for training_file in training_files:
        contents = FileReadingFuncts.read_file(source_dir, training_file)
        for position, tag in enumerate(tag_names):
            found = TagExtractingFuncts.find_tag_matches(tag, contents)
            # rebinding (not extending in place) keeps each slot independent
            collected[position] = collected[position] + found

    # write one output file per tag; set() drops duplicate matches
    for position, tag in enumerate(tag_names):
        destination = "tagFiles/{}.txt".format(tag)
        writeFile(set(collected[position]), destination)
Пример #3
0
def writeWikiFile(tag_name):
    """Write the Wikipedia words for a tag's first 20 examples to a file.

    The first 20 lines of ``tagFiles/<tag_name>.txt`` are used as
    entities.  For each one, ``WikipediaFuncts.get_words`` is called and
    the results are concatenated in order.  The combined list is written
    to ``wiki/<tag_name>1.txt`` through ``writeFile``.

    Args:
        tag_name: name of the tag whose example file is read.
    """
    tag_file = "tagFiles/{}.txt".format(tag_name)

    # only the first 20 example entities are looked up
    sample = FileReadingFuncts.read_all_lines(tag_file)[:20]

    related_words = []
    for item in sample:
        looked_up = WikipediaFuncts.get_words(item)
        related_words = related_words + looked_up

    writeFile(related_words, "wiki/{}1.txt".format(tag_name))
Пример #4
0
def get_all_tags(path, file, tag_name):
    """Return the matches for ``tag_name`` in one file, with tags stripped.

    The file is read with ``FileReadingFuncts.read_file`` and searched
    with ``TagExtractingFuncts.find_tag_matches``.  Every match has all
    "." characters removed and is then passed through
    ``TagExtractingFuncts.get_rid_of_tags``.

    Args:
        path: directory passed to the file reader.
        file: file name passed to the file reader.
        tag_name: tag whose matches are extracted.

    Returns:
        A list with one cleaned entry per match, in match order.
    """
    contents = FileReadingFuncts.read_file(path, file)
    found = TagExtractingFuncts.find_tag_matches(tag_name, contents)

    # drop the full stops first, then strip the tag markup
    return [
        TagExtractingFuncts.get_rid_of_tags(hit.replace(".", ""))
        for hit in found
    ]
Пример #5
0
def build_vocab(tag_name, tag_dict):
    """Load a tag's stored wiki words and register its examples.

    The examples come from ``get_examples(tag_name)`` and the words from
    ``FileReadingFuncts.read_wiki("wiki/<tag_name>.txt")``.  Each example
    is stored in ``tag_dict`` (modified in place) with ``tag_name`` as
    its value.

    Args:
        tag_name: tag whose examples and word file are loaded.
        tag_dict: mapping from example text to tag name; updated in place.

    Returns:
        A ``(words, tag_dict)`` tuple.
    """
    print("entered")

    known_examples = get_examples(tag_name)
    vocabulary = FileReadingFuncts.read_wiki("wiki/{}.txt".format(tag_name))

    for entry in known_examples:
        # An example that is already a key is stored under its text plus "z".
        # NOTE(review): only one "z" is ever added, so if that suffixed key
        # exists too, its value is overwritten -- confirm this is intended.
        slot = entry + "z" if entry in tag_dict else entry
        tag_dict[slot] = tag_name

    return vocabulary, tag_dict
Пример #6
0
def read_in_tags():
    """Feed the tags from ``test_tagged/`` and ``my_tagged/`` into the scores.

    A score dictionary is created with
    ``TagScoreCalculator.create_tag_dict``.  For every file listed in
    ``test_tagged/`` and every tag name, the tags extracted from that
    file in both directories are passed to the matching entry's
    ``add_tags`` method.

    Returns:
        The tag-score dictionary keyed by tag name.
    """
    tag_names = [
        "sentence", "paragraph", "speaker", "location", "etime", "stime"
    ]
    tag_scores = TagScoreCalculator.create_tag_dict(tag_names)

    reference_dir = "test_tagged/"
    candidate_dir = "my_tagged/"

    # Only the reference directory is listed; the same file names are
    # looked up in the candidate directory.
    for tagged_file in FileReadingFuncts.get_files(reference_dir):
        for name in tag_names:
            expected = get_all_tags(reference_dir, tagged_file, name)
            produced = get_all_tags(candidate_dir, tagged_file, name)
            tag_scores[name].add_tags(expected, produced)

    return tag_scores
Пример #7
0
def get_examples(tag_name):
    """Return all lines of ``tagFiles/<tag_name>.txt``.

    Args:
        tag_name: tag whose example file is read; must be a string.

    Returns:
        Whatever ``FileReadingFuncts.read_all_lines`` returns for that file.
    """
    source = "".join(("tagFiles/", tag_name, ".txt"))
    return FileReadingFuncts.read_all_lines(source)
Пример #8
0
def get_examples(tag_name):
    """Return all lines of ``tagFiles/<tag_name>.txt``.

    Args:
        tag_name: tag whose example file is read.

    Returns:
        Whatever ``FileReadingFuncts.read_all_lines`` returns for that file.
    """
    return FileReadingFuncts.read_all_lines(
        "tagFiles/{}.txt".format(tag_name)
    )