def read_files_count(filepath):
    '''Moves through a .tess file and calls the 'next' and 'countgram' functions as needed.
    Parameters
    ----------
    filepath: a file in .tess format
    '''
    tessobj = TessFile(filepath)
    tokengenerator = iter(tessobj.read_tokens())
    stop = 0
    while stop != 1:
        try:
            rawtoken = next(tokengenerator)
            cleantoken_list = token_cleanup(rawtoken) 
            token = cleantoken_list[0]
            countgram(token)
        except StopIteration:
            stop = 1
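
The while loop with an explicit StopIteration catch can also be written as a plain for loop over the token generator. A minimal equivalent sketch, assuming the same TessFile, token_cleanup, and countgram names used above:

def read_files_count_forloop(filepath):
    '''Equivalent sketch of read_files_count, iterating the generator directly.'''
    tessobj = TessFile(filepath)
    for rawtoken in tessobj.read_tokens():
        cleantoken_list = token_cleanup(rawtoken)
        countgram(cleantoken_list[0])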
Example #2
def read_files(filepath):
    '''Moves through a .tess file and calls the 'next' and 'count_lemma' functions as needed.
    Updates the COUNT_LIBRARY global object.
    Parameters
    ----------
    filepath: a file in .tess format
    '''
    tessobj = TessFile(filepath)
    tokengenerator = iter(tessobj.read_tokens())
    stop = 0
    while stop != 1:
        try: 
            rawtoken = next(tokengenerator)
            cleantoken_list = token_cleanup(rawtoken)
            count_lemma(cleantoken_list[0])
        except StopIteration:
            stop = 1
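
A hedged usage sketch: walking a directory of .tess files and feeding each one to read_files. The directory path and glob pattern are placeholders, not taken from the original script:

import glob
import os

tess_dir = os.path.expanduser('~/cltk_data/texts')  # placeholder location
for path in sorted(glob.glob(os.path.join(tess_dir, '*.tess'))):
    read_files(path)  # updates the global count structure one file at a time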
Example #3
    def test_read_tokens(self, tessfile_list):
        for f in tessfile_list:
            lines = []
            with open(f, 'r') as tess:
                for line in tess.readlines():
                    lines.append(line)

            t_b = TessFile(f)
            t_r = TessFile(f, buffer=False)

            # Ensure that tokens omit the tag when requested
            # Grab all tokens from the text
            tokens = []
            for line in lines:
                start = line.find('>')
                if start >= 0:
                    tokens.extend(line[start + 1:].strip(
                        string.whitespace).split())

            # Test with buffer
            for i, token in enumerate(t_b.read_tokens()):
                assert token == tokens[i]

            # Ensure that the iterator resets
            reset = False
            for i, token in enumerate(t_b.read_tokens()):
                assert token == tokens[i]
                reset = True
            assert reset

            # Test with initial read
            for i, token in enumerate(t_r.read_tokens()):
                assert token == tokens[i]

            # Ensure that the iterator resets
            reset = False
            for i, token in enumerate(t_r.read_tokens()):
                assert token == tokens[i]
                reset = True
            assert reset

            # Ensure that tokens include the tag when requested
            # Lines now start before the tag
            tokens = []
            for line in lines:
                tokens.extend(line.strip().split())

            # Test with buffer
            for i, token in enumerate(t_b.read_tokens(include_tag=True)):
                assert token == tokens[i]

            # Ensure that the iterator resets
            reset = False
            for i, token in enumerate(t_b.read_tokens(include_tag=True)):
                assert token == tokens[i]
                reset = True
            assert reset

            # Test with initial read
            for i, token in enumerate(t_r.read_tokens(include_tag=True)):
                assert token == tokens[i]

            # Ensure that the iterator resets
            reset = False
            for i, token in enumerate(t_r.read_tokens(include_tag=True)):
                assert token == tokens[i]
                reset = True
            assert reset
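
For context, test_read_tokens expects a tessfile_list fixture that supplies paths to sample .tess files. A minimal pytest sketch of such a fixture; the data directory and glob pattern are assumptions, not taken from the actual test suite:

import glob
import os

import pytest


@pytest.fixture
def tessfile_list():
    # Hypothetical location of the sample .tess files used by the tests.
    data_dir = os.path.join(os.path.dirname(__file__), 'data')
    return sorted(glob.glob(os.path.join(data_dir, '*.tess')))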
Example #4
        all_lemmas_total = sum(COUNT_LIBRARY[l] for l in lemmas)
        try:
            lemmalist = [(l, (COUNT_LIBRARY[l] / all_lemmas_total))
                         for l in lemmas]
        except ZeroDivisionError:
            # None of these lemmas has been counted yet; report them and fall back
            # to an empty list so the return below doesn't raise a NameError.
            print([(COUNT_LIBRARY[l], l) for l in lemmas])
            lemmalist = []
        return lemmalist
    else:
        lemmalist = []
        lemmaobj = (lemmas[0], 1)
        lemmalist.append(lemmaobj)
        return lemmalist
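
To make the frequency branch above concrete, here is a toy run of the same computation; the lemma keys and counts are made up for illustration and stand in for COUNT_LIBRARY:

from collections import Counter

toy_counts = Counter({'sum1': 30, 'edo1': 10})   # hypothetical counts
lemmas = ['sum1', 'edo1']
all_lemmas_total = sum(toy_counts[l] for l in lemmas)              # 40
lemmalist = [(l, toy_counts[l] / all_lemmas_total) for l in lemmas]
# [('sum1', 0.75), ('edo1', 0.25)]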


# Manual driver: onlyfiles is assumed (defined earlier, not shown here) to be a list
# of .tess file paths; new_file and compare_context are helpers defined elsewhere.
tessobj = TessFile(onlyfiles[258])
tokengenerator = iter(tessobj.read_tokens())
tokens = new_file(tokengenerator, 2)
target = tokens.pop(0)
compare_context(target, tokens)

rel_path = os.path.join(
    '~/cltk_data/latin/model/latin_models_cltk/lemmata/backoff')
path = os.path.expanduser(rel_path)
file = 'latin_pos_lemmatized_sents.pickle'
latin_pos_lemmatized_sents_path = os.path.join(path, file)
if os.path.isfile(latin_pos_lemmatized_sents_path):
    latin_pos_lemmatized_sents = open_pickle(latin_pos_lemmatized_sents_path)
else:
    print('The file %s is not available in cltk_data' % file)

first1000 = latin_pos_lemmatized_sents[0:1000]
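
latin_pos_lemmatized_sents is expected to be a list of sentences, each (presumably) a list of (token, lemma) pairs, and first1000 keeps the first thousand sentences as a training slice. A quick sanity-check sketch; the expected shapes are assumptions about the pickle's contents, not output from this script:

print(len(first1000))     # 1000 training sentences
print(first1000[0][:3])   # first few (token, lemma) pairs of the first sentence
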
def read_files_skipgram(filepath, context_window):
    '''Moves through a .tess file and calls the 'next' and 'skipgram' functions as needed.
    Updates the SKIP_LIBRARY global object.
    Parameters
    ----------
    filepath: a file in .tess format
    context_window: how many words on either side of the target to look at.
    '''
    tessobj = TessFile(filepath)
    tokengenerator = iter(tessobj.read_tokens())
    tokens = new_file(tokengenerator, context_window)
    stop = 0
    clearflag = 0
    target_position = context_window
    while stop != 1:
        # The target sits context_window tokens from the end of the list, until EOF.
        # Don't pop the target token off; we want to keep it for the next round.
        targettoken = tokens[target_position]
        # Grab all the other tokens but the target.
        contexttokens = [x for i, x in enumerate(tokens) if i != target_position]
        # Add this context to the skipgram map.
        skipgram(targettoken, contexttokens)
        # Prep the next token in the file.
        try:
            rawtoken = next(tokengenerator)
            cleantoken_list = token_cleanup(rawtoken)
            if len(cleantoken_list) > 1 and cleantoken_list[-1] in punctuation_list:
                # This should indicate a sentence has ended. When it happens, the
                # token list must be cleared *after* this iteration.
                clearflag = 1
            tokens.append(cleantoken_list[0])
            # if we've seen end-of-sentence punctuation, we need to start counting down.
            if clearflag == 1:
                # when this begins, the token list just received the final word.
                tokens.pop(0)
                while len(tokens) > context_window:
                    # perform the usual dictionary operation, but don't add a new token.
                    targettoken = tokens[target_position]
                    contexttokens = [x for i, x in enumerate(tokens) if i != target_position]
                    skipgram(targettoken, contexttokens)
                    tokens.pop(0)
                # Initialize the next sentence.
                tokens = []
                tokens = new_file(tokengenerator, context_window)
                clearflag = 0
            else:
                tokens.pop(0)
        except StopIteration:
            # We have reached EOF. Loop through until the last token is done, then quit.
            # At this point (with a context_window of 5) the token list should have 11
            # entries and the 'target_position' index will be the sixth (i.e. tokens[5]).
            # Pop the first entry off, leaving 10 entries and making the sixth entry
            # (previously the seventh) the new target.
            # This entire loop is obsolete now that punctuation is accounted for.
            try:
                tokens.pop(0)
            except IndexError:
                pass
            while len(tokens) > context_window:
                # This loop slides the target toward the end of the file: as long as
                # more than context_window tokens remain, the token at index
                # context_window is treated as the target.
                targettoken = tokens[target_position]
                # Grab all the other tokens but the target.
                contexttokens = [x for i, x in enumerate(tokens) if i != target_position]
                # Add this context to the skipgram map.
                skipgram(targettoken, contexttokens)
                tokens.pop(0)
            stop = 1
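
read_files_skipgram depends on a skipgram(targettoken, contexttokens) helper and a SKIP_LIBRARY global that are not shown in this snippet. A minimal sketch of what such a helper could look like, using a defaultdict of Counters; this is an assumption about the missing code, not the original implementation:

from collections import Counter, defaultdict

SKIP_LIBRARY = defaultdict(Counter)


def skipgram(targettoken, contexttokens):
    # Count how often each context token appears around this target token.
    SKIP_LIBRARY[targettoken].update(contexttokens)


# Example call with a placeholder path and a window of 2 tokens on each side:
# read_files_skipgram('some_text.tess', 2)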