Example #1
def worker_process(i, jobs_queue, output_queue, args):
    if not args.disable_lm_filter:
        lm_filter = load_lm_filter(args.source_lang, args.target_lang,
                                   args.metadata_yaml,
                                   args.source_tokenizer_command,
                                   args.target_tokenizer_command)
    else:
        lm_filter = None

    if not args.disable_porn_removal:
        porn_removal = args.porn_removal
        if args.metadata_yaml['porn_removal_side'] == 'tl':
            porn_tokenizer = Tokenizer(args.target_tokenizer_command,
                                       args.target_lang)
        else:
            porn_tokenizer = Tokenizer(args.source_tokenizer_command,
                                       args.source_lang)
    else:
        porn_removal = None
        porn_tokenizer = None

    while True:
        job = jobs_queue.get()
        if job:
            logging.debug("Job {0}".format(job.__repr__()))
            nblock, filein_name = job
            ojob = None
            with open(filein_name, 'r') as filein, NamedTemporaryFile(
                    mode="w", delete=False, dir=args.tmp_dir) as fileout:
                logging.debug(
                    "Classification: creating temporary filename {0}".format(
                        fileout.name))

                for i in filein:
                    parts = i.strip().split("\t")
                    left = ""
                    right = ""

                    if len(parts) >= args.scol and len(parts) >= args.tcol:
                        left = parts[args.scol - 1]
                        right = parts[args.tcol - 1]
                    else:
                        logging.error(
                            "WARNING: scol ({}) or tcol ({}) indexes above column number ({})"
                            .format(args.scol, args.tcol, len(parts)))
                        continue
                    wrong_tu_results = wrong_tu(left, right, args, lm_filter,
                                                porn_removal, porn_tokenizer)
                    if wrong_tu_results is not False:
                        fileout.write("\t".join(parts) + "\t0")
                        if args.annotated_output:
                            fileout.write("\t{}\n".format(wrong_tu_results))
                        else:
                            fileout.write("\n")
                    else:
                        fileout.write("\t".join(parts) + "\t1")
                        if args.annotated_output:
                            fileout.write("\tkeep\n")
                        else:
                            fileout.write("\n")

                ojob = (nblock, fileout.name)
                filein.close()
                fileout.close()

            if ojob:
                output_queue.put(ojob)

            os.unlink(filein_name)
        else:
            logging.debug("Exiting worker")
            break
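The worker above pulls (nblock, filename) jobs from a queue and exits when it receives a falsy sentinel. A minimal, self-contained sketch of that producer/worker/sentinel pattern (generic names such as process_line and NUM_WORKERS are illustrative placeholders, not taken from the project) could look like this:

# Minimal sketch of the queue/sentinel worker pattern used above.
# process_line and NUM_WORKERS are placeholders, not project code.
import multiprocessing as mp

NUM_WORKERS = 2

def process_line(line):
    return line.upper()  # stand-in for the real per-line classification

def worker(jobs, results):
    while True:
        job = jobs.get()
        if job is None:  # sentinel: no more work
            break
        nblock, line = job
        results.put((nblock, process_line(line)))

if __name__ == "__main__":
    jobs, results = mp.Queue(), mp.Queue()
    workers = [mp.Process(target=worker, args=(jobs, results))
               for _ in range(NUM_WORKERS)]
    for w in workers:
        w.start()
    lines = ["hello\tworld", "foo\tbar", "spam\teggs"]
    for i, line in enumerate(lines):
        jobs.put((i, line))
    for _ in workers:
        jobs.put(None)  # one sentinel per worker
    for _ in lines:
        print(results.get())  # drain results before joining
    for w in workers:
        w.join()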
Example #2
import json
from tokenizer import Tokenizer
import time

if __name__ == "__main__":
    tokens = Tokenizer()
    tokens.read_data('./WEBPAGES_RAW/bookkeeping.json')
    start = time.time()
    tokens.find_files()
    tokens.find_single_file("39/373", "mondego.ics.uci.edu/datasets/maven-contents.txt")
    tokens.compute_tf_idf_and_insert_db()
    end = time.time()
    elapsed = end - start  # avoid shadowing the time module imported above
    full_time = "Hours: " + str(elapsed / 60 / 60) + ", Minutes: " + str(elapsed / 60)
    print("TOTAL TOKENS: ", tokens.database.total_documents())
    print(full_time)
Example #3
def collate(data: List[str], tokenizer: Tokenizer, block_size: int) -> Batch:
    ids = tokenizer.encode(data, block_size)
    mask = tokenizer.mask(ids)
    return Batch(ids=ids, attention_mask=mask)


def build_data_iterator(tokenizer,
                        dataset,
                        batch_size,
                        block_size,
                        random_sampler=False) -> DataLoader:
    sampler = RandomSampler(dataset) if random_sampler else SequentialSampler(
        dataset)
    iterator = DataLoader(
        dataset,
        sampler=sampler,
        batch_size=batch_size,
        collate_fn=lambda data: collate(data, tokenizer, block_size),
    )
    return iterator


if __name__ == "__main__":
    tokenizer = Tokenizer("tokenizer.model")
    with open("corpus.txt", encoding="utf-8") as f:
        dataset = f.readlines()
    iterator = build_data_iterator(tokenizer, dataset, 8, 128)
    batch = next(iter(iterator))
    print(tokenizer.decode(batch[0]))
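The helpers above rely on a project-specific Tokenizer and Batch that the snippet does not define, plus DataLoader/RandomSampler/SequentialSampler, which appear to come from torch.utils.data. A rough, runnable stand-in (a whitespace tokenizer and a NamedTuple Batch, both assumptions rather than the project's classes) shows how the pieces fit together:

# Hypothetical stand-ins so collate()/build_data_iterator() can be exercised;
# the project's real Tokenizer and Batch will differ.
from typing import List, NamedTuple

import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler


class Batch(NamedTuple):
    ids: torch.Tensor
    attention_mask: torch.Tensor


class WhitespaceTokenizer:
    def encode(self, data: List[str], block_size: int) -> torch.Tensor:
        # Map whitespace tokens to dummy ids and pad/truncate to block_size.
        rows = []
        for text in data:
            ids = [hash(tok) % 30000 + 1 for tok in text.split()][:block_size]
            rows.append(ids + [0] * (block_size - len(ids)))
        return torch.tensor(rows, dtype=torch.long)

    def mask(self, ids: torch.Tensor) -> torch.Tensor:
        return (ids != 0).long()  # 1 for real tokens, 0 for padding


# Example usage with the functions defined above:
# iterator = build_data_iterator(WhitespaceTokenizer(), ["a b c", "d e"], 2, 4)
# print(next(iter(iterator)))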
Example #4
def prefix_parser(str_statement):
    tokenizer = Tokenizer(str_statement)
    return prefix_parser_recursive(tokenizer)
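prefix_parser_recursive is not shown in this excerpt. As a purely hypothetical sketch (not the project's implementation), a recursive prefix parser over a token stream typically reads one token and either returns it as an operand or recurses twice for an operator's arguments:

# Hypothetical sketch of recursive prefix parsing; the project's Tokenizer
# and prefix_parser_recursive will differ.
OPERATORS = {"+", "-", "*", "/"}

def parse_prefix(tokens):
    """Consume tokens from an iterator and build a nested (op, left, right) tuple."""
    token = next(tokens)
    if token in OPERATORS:
        return (token, parse_prefix(tokens), parse_prefix(tokens))
    return float(token)

print(parse_prefix(iter("+ 1 * 2 3".split())))  # ('+', 1.0, ('*', 2.0, 3.0))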
Example #5
    print('joke:\n', joke)
    print()

    print('generated explanation:\n', output)
    print()

    if true_output_text is not None:
        print('true explanation:\n', true_output_text)
        print()


if __name__ == '__main__':
    # load dataset
    print('loading dataset')
    tokenizer = Tokenizer(dataset_path)
    vocab_size = tokenizer.vocab_size

    model_vars = MyModel(vocab_size=vocab_size,
                         embedding_dim=embedding_dim,
                         rnn_units=rnn_units)
    model = tf.keras.Model(inputs=[model_vars.encoder_input, model_vars.decoder_input],
                           outputs=model_vars.decoder_output)

    optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate)
    model.compile(optimizer=optimizer,
                  loss=model_vars.sparse_cross_entropy,
                  target_tensors=[model_vars.decoder_target])

    checkpoint_dir = os.path.dirname(checkpoint_path)
    checkpoint = tf.keras.callbacks.ModelCheckpoint(checkpoint_path,
Example #6
        ]

        UseDetails = "-d" in params

        print "STARTED:", str(datetime.now())
        start = time.time()

        morph = get_morph(
            os.path.join(
                os.path.dirname(sys.argv[0]),
                "pydicts").decode("UTF8"))  # Load the Russian dictionary
        morph_simple = get_morph(
            os.path.join(os.path.dirname(sys.argv[0]),
                         "pydicts").decode("UTF8"),
            check_prefixes=False)  # Load the Russian dictionary
        tok = Tokenizer()  # Load the tokenizer
        tagger = Tagger(morph, morph_simple)  # Load the tagger
        syner = Synonymizer(morph_simple, params)  # Load the synonymizer

        print "Synonymizer statistics loaded! It took", time.time() - start

        # Read the file (2 attempts: cp1251 and utf-8)
        try:
            text = read_file(filename)
        except Exception as e:
            error(
                "Encoding detection failed! Windows-1251 or UTF-8 without BOM expected.",
                syner.UseDetails, str(e))
            sys.exit()

        tokens = tok.tokenize(text)
Example #7
import pickle
import sys

from neural_network import create_nn
from tokenizer import Tokenizer

data = pickle.load(open('model.bin', 'rb'))
layout = data["layout"]
weights = data["weights"]

tokenizer = Tokenizer(data["dictionary"])

sentence = ' '.join(sys.argv[1:])

call_nn = create_nn(layout, weights)


def wrap_nn(call_nn, tokenizer: Tokenizer):
    def call(sentence: str):
        input_layer = tokenizer.input_layer_of_sentence(sentence)
        output_layer = call_nn(input_layer)
        return output_layer[0]

    return call


sentence_nn = wrap_nn(call_nn, tokenizer)

output = sentence_nn(sentence)

bzh = output > 0
Example #8
def tokenize_raw_text(raw_text: str) -> list:
    tokenizer = Tokenizer(clean_empty_lines(clean_html(raw_text)))
    return tokenizer.tokenize()
Example #9
 def test_get_parsed_data(self):
     token = Tokenizer('../data/input.txt')
     lines = token.get_parsed_data()
     self.assertEqual(len(lines), 7)
Example #10
 def __init__(self):
     self._tokenizer = Tokenizer()
Example #11

def addx():
    x = get_value("_1") + get_value("_2")
    VARIABLES["_r"] = x
    return x


VARIABLES["add"] = addx


def ifx():
    if get_value("_1"):
        exe_func(get_func("_2"))


VARIABLES["if"] = ifx


def loopx():
    func = get_func("_1")
    continu = True
    while continu:
        exe_func(func)
        continu = get_value("_r")


VARIABLES["loop"] = loopx

exe_func(ParseMethod(Tokenizer(), False).stmts)
Example #12
def convertToLaTeX(string):
    tokenizer = Tokenizer(scanner=Scanner(string))
    parser = Parser(tokenizer=tokenizer)
    return str(parser.parseCode())
Example #13
from tokenizer import Tokenizer

tk = Tokenizer()
print(tk.getToken())
tk.changeId()
print(tk.getToken())
Example #14
File: cc0.py Project: yiranyyu/C0-Compiler
    while i < len(args):
        arg = args[i]
        if arg.startswith('-'):
            if arg == '-o':
                i += 1
            i += 1
            continue
        try:
            in_file = open(arg)
            break
        except IOError:
            print_error_msg_and_exit(f'Cannot open input file {arg}')
    if in_file is sys.stdin:
        print_error_msg_and_exit('No input file')

    tokenizer = Tokenizer(in_file.read())
    try:
        tokens = tokenizer.all_tokens()
        analyser = Analyser(tokens)
        # analyser.c0_ast.draw()
        elf = analyser.generate()
        if '-s' in args:
            out_file.write(elf.generate_s0())
        elif '-c' in args:
            out_file.write(elf.generate_o0())

        if '-A' in args:
            analyser.c0_ast.draw(draw_full_ast=True)
        elif '-a' in args:
            analyser.c0_ast.draw(draw_full_ast=False)
    except (TokenizerException, ParserException, AnalyserException) as e:
Example #15
    def read_text(self, filename):
        tokenizer = Tokenizer()
        words = []
        sentences = []
        with open(filename, 'r', encoding='utf-8') as file:
            lines = file.read().lower()
            for line in lines.split('\n'):
                sentences += tokenizer.split_into_sentences(line)
            for line in sentences:
                candidate_words = tokenizer.split_into_words(line)
                words += self.clean_words(candidate_words)

        counts = Counter(words)
        for key in counts:
            if counts[key] < self.minimal_frequency:
                self.low_frequency_words.append(key)

        for key in self.low_frequency_words:
            del counts[key]
        print(counts)

        word_pair_frequencies = {}
        for sent in sentences:
            words = tokenizer.split_into_words(sent)
            words = self.clean_words(words)
            for word_position in range(len(words)):
                word = words[word_position]

                if word not in word_pair_frequencies:
                    word_pair_frequencies[word] = []

                start_pos = max(0, word_position - self.window_size)
                end_pos = min(len(words) - 1, word_position + self.window_size)
                for second_word_position in range(start_pos, end_pos + 1):
                    second_word = words[second_word_position]
                    distance = abs(second_word_position - word_position)
                    if distance == 0:
                        continue
                    inverse_distance = self.window_size - distance + 1
                    if second_word != word:
                        word_pair_frequencies[word].append(
                            (second_word, inverse_distance))

        for key in word_pair_frequencies.keys():
            words = []
            word_distances = word_pair_frequencies[key]
            word_distances_sums = {}
            distances_sum = 0
            for word_distance in word_distances:
                word = word_distance[0]
                distance = word_distance[1]
                if word in word_distances_sums:
                    word_distances_sums[word] += distance
                else:
                    word_distances_sums[word] = distance
                words.append(word)
            counter = Counter(words)
            for word in counter:
                counter[word] *= math.sqrt(word_distances_sums[word])
            word_pair_frequencies[key] = counter

        self.vocab_size = len(counts.keys())

        number = 0
        for key in counts.keys():
            self.vocabulary_encoded[key] = number
            self.vocabulary.append(key)
            number += 1

        self.target_probabilities = []
        for i in range(self.vocab_size):
            word = self.vocabulary[i]
            frequencies = word_pair_frequencies[word]
            probabilities = []
            for j in range(self.vocab_size):
                target_word = self.vocabulary[j]
                frequency = 0
                if target_word in frequencies:
                    frequency = frequencies[target_word]
                probabilities.append(frequency)

            self.target_probabilities.append(self.softmax(probabilities))
Example #16
__author__ = 'Levon'
from tree import tree, parseTree,ExpressionError
from tokenizer import token, Tokenizer

pT = parseTree()
tok = Tokenizer()
assert(pT.buildParseTree(tok.tokenize("1+2")) == tree('+','1','2'))
assert(pT.buildParseTree(tok.tokenize("(x+(y*z+2))-3*((5+x)/2-4)")) == tree('-',tree('+','x',tree('+',tree('*','y','z'),'2')),tree('*','3',tree('-',tree('/',tree('+','5','x'),'2'),'4'))))
assert (pT.buildParseTree(tok.tokenize("sin(x)+ln(y)*3")) == tree('+',tree('sin','x'),tree('*',tree('ln','y'),'3')))
assert (pT.buildParseTree(tok.tokenize('x^y*2-3')) == tree('-',tree('*',tree('^','x','y'),'2'),'3'))
assert (pT.buildParseTree(tok.tokenize('x=y=5*3-20*sin(x+y)')) == tree('=','x',tree('=','y',tree('-',tree('*','5','3'),tree('*','20',tree('sin',tree('+','x','y')))))))
try:            # error tests
    Tree = pT.buildParseTree(tok.tokenize('x***y'))
    assert(False)
except ExpressionError:
    assert(True)
try:
    Tree = pT.buildParseTree(tok.tokenize('x===y'))
    assert(False)
except ExpressionError:
    assert(True)
try:
    Tree = pT.buildParseTree(tok.tokenize('x+++y'))
    assert(False)
except ExpressionError:
    assert(True)
try:
    Tree = pT.buildParseTree(tok.tokenize('+x*3'))
    assert(False)
except ExpressionError:
    assert(True)
Example #17
    def tokenizer(self):
        if not self.expression:
            raise ValueError("Empty expression! Cannot process!")

        expressionLen = len(self.expression)
        i = 0
        number = ''
        current = ''

        while (i < expressionLen):

            current = self.expression[i]

            if (current == ' ' or current == '\n' or current == '\t'):
                i = i + 1
                continue

            elif current.isnumeric():

                while (current.isnumeric()):
                    number = number + current
                    i = i + 1
                    if (i >= expressionLen):
                        break
                    current = self.expression[i]

                if (current == '.'):

                    number = number + current
                    i = i + 1
                    current = self.expression[i]

                    if current.isnumeric():
                        while (current.isnumeric()):
                            number = number + current
                            i = i + 1
                            if (i >= expressionLen):
                                break
                            current = self.expression[i]
                    else:
                        raise ValueError("FLOAT USAGE: [0-9].[0-9][0-9]* ")

                token = Tokenizer('Number', float(number))

                i = i - 1
                number = ''

            elif current == '+':
                token = Tokenizer('+')

            elif current == '-':
                token = Tokenizer('-')

            elif current == '*':
                token = Tokenizer('*')

            elif current == '/':
                token = Tokenizer('/')

            elif current == '^':
                token = Tokenizer('^')

            elif current == '(':
                token = Tokenizer('(')

            elif current == ')':
                token = Tokenizer(')')

            elif current == '@':

                variable = ''
                i = i + 1
                current = self.expression[i]

                while (current.isalpha()):
                    variable = variable + current
                    i = i + 1
                    if (i >= expressionLen):
                        break
                    current = self.expression[i]

                # if :
                var = Var.getInstance()
                val = var.getVar(variable)
                if val is None:
                    raise ValueError('Variable => ' + variable +
                                     ' not declared')

                token = Tokenizer('Number', val[1])
                i = i - 1

            elif current.isalpha():
                variable = ''
                while (current.isalpha()):
                    variable = variable + current
                    i = i + 1
                    current = self.expression[i]

                token = Tokenizer('Variable', variable)
                i = i - 1

            elif current == '=':
                token = Tokenizer('=')

            else:
                raise ValueError("INVALID TOKEN => %s" % current)

            i = i + 1
            if token:
                self.tokens.append(token)
Example #18
 def test_parser(self):
     for src_filename in os.listdir(TEST_CASES_DIR):
         logger.debug("tokenizing {}".format(src_filename))
         lst_lines = Tokenizer(os.path.join(TEST_CASES_DIR, src_filename)).tokenize()
         Parser().parse_tokens(lst_lines)
Example #19
 def classify(self, msg): # read
     my_freq = Tokenizer(msg, self.tagger, self.common).get_freq()
     cls = self.__cos_sim(my_freq)
     return cls
Example #20
 def test_init(self):
     for src_filename in os.listdir(TEST_CASES_DIR):
         Tokenizer(os.path.join(TEST_CASES_DIR, src_filename))
Example #21
from density_calculator import DensityCalculator
from tokenizer import Tokenizer
from filters import StopwordsFilter

calculator = DensityCalculator(Tokenizer(), StopwordsFilter('en'))

densities = calculator(
    ''' To follow along with future lessons it is important that you have the right files and programs in your programming-historian directory. At the end of each lesson in this series you can download the programming-historian zip file to make sure you have the correct code.'''
)

print(densities)
Example #22
 def test_tokenize(self):
     for src_filename in os.listdir(TEST_CASES_DIR):
         logger.debug("tokenizing {}".format(src_filename))
         Tokenizer(os.path.join(TEST_CASES_DIR, src_filename)).tokenize()
Example #23
 def parse(self, source):
     t = Tokenizer()
     return self._parse_statements(peekable(t.tokenize(source)))
Example #24
 def findDeviceDeclarations(self):
     t = Tokenizer(self.deprocessedFile)
     for token in t.tokenize():
         self.allTokens.append(token)
     m = Match()
     self.deviceDclLines = m.match_device_function(self.allTokens)
Example #25
    # Preprocess the text; not handled here for now
    return content

def load_sentences(filepath, shuffle=True):
    readList = []
    with open(filepath, 'rb') as f:
        readList=pickle.load(f)
    samples = []
    for item in readList:
        samples.append("".join(item[0]))
    if shuffle:
        random.shuffle(samples)
    return samples

file = "./static_model/vocabs.json"
tokenizer = Tokenizer(mintf, processes)
if os.path.exists(file):
    # X = load_sentences('./train_all_1209.pkl')
    tokenizer.load_vocab_from_file(file)
    # tokenizer.load(file, X)
# else:
    # X = load_sentences('./train_all_1209.pkl')
    # print("tokenize...")
    # tokenizer.fit_in_parallel(X)
    # tokenizer.save(file)

words = tokenizer.words
word2id = tokenizer.word2id
id2word = {j:i for i,j in word2id.items()}
vocab_size = len(word2id)
Example #26
File: index.py Project: lucasns/projeto-ri
 def __init__(self, use_stemming=True, remove_stopwords=True):
     self._tokenizer = Tokenizer(use_stemming, remove_stopwords)
Example #27
import sys
from tokenizer import Tokenizer
from syntax_parser import Syntax
from grammar_parser import Grammar

with open("grammar.txt", "r") as grammar_file:
    grammar_str = grammar_file.read()
with open(sys.argv[1]) as code_file:
    code_str = code_file.read()

grammar = Grammar(grammar_str)
tokenizer = Tokenizer(grammar, code_str)
syntax = Syntax(grammar, tokenizer)
print("Compiled")
Example #28
File: index.py Project: lucasns/projeto-ri
 def __init__(self,
              use_compression=True,
              use_stemming=True,
              remove_stopwords=True):
     self.use_compression = use_compression
     self._tokenizer = Tokenizer(use_stemming, remove_stopwords)
Example #29
    def reverse(self, class_id):
        return

    def __bool__(self):
        return True


# test
from tokenizer import Tokenizer

raw_docs = [
    " Abbott of Farnham E D Abbott Limited was a British coachbuilding business based in Farnham Surrey trading under that name from 1929. A major part of their output was under sub-contract to motor vehicle manufacturers. Their business closed in 1972.",
    " Schwan-STABILO is a German maker of pens for writing colouring and cosmetics as well as markers and highlighters for office use. It is the world's largest manufacturer of highlighter pens Stabilo Boss.",
    " Q-workshop is a Polish company located in Poznań that specializes in designand production of polyhedral dice and dice accessories for use in various games (role-playing gamesboard games and tabletop wargames). They also run an online retail store and maintainan active forum community.Q-workshop was established in 2001 by Patryk Strzelewicz – a student from Poznań. Initiallythe company sold its products via online auction services but in 2005 a website and online store wereestablished."
]

if __name__ == '__main__':
    tokenizer = Tokenizer()

    emb_vocab = EmbVocabulary(50, tokenizer.tokenizer0)
    #fit
    emb_vocab.fit()
    # get ids
    for vec in emb_vocab.get_vec(raw_docs):
        print(vec)
        print(vec.shape)

    vec_list = list(emb_vocab.get_vec(raw_docs))
    for v in vec_list:
        print(v)
Example #30
    return vocabulary


sentences = np.genfromtxt('./tickets_QIT.txt', delimiter='\n', dtype=str)
language = 'italian'
max_words = None
max_length = 30

# Text preprocessor with no functionalities whatsoever
prep = TextPreprocessor(sentences)

# Add decorator to clean email bodies
prep = QITEmailBodyCleaner(prep)

# Add tokenizer decorator
prep = Tokenizer(prep, language)

# Get intermediate results
tokens = prep.preprocess()

# Build vocabulary
vocabulary = build_vocabulary(tokens, max_words=max_words)

# Add integer encoding decorator
unknown_token_id = max(vocabulary.values()) + 1
prep = IntegerEncoder(prep, vocabulary, unknown_token_id)

# Add padding decorator
padding_token_id = max(vocabulary.values()) + 2
prep = Padder(prep, padding_token_id, max_length)
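The pipeline above builds a preprocessor by wrapping decorators around a base object, each adding one step on top of the wrapped preprocess() result. A stripped-down sketch of that decorator pattern (class names here are illustrative stand-ins, not the project's TextPreprocessor/Tokenizer/Padder):

# Simplified sketch of the decorator chain used above; not the project's classes.
from typing import List


class BasePreprocessor:
    def __init__(self, sentences: List[str]):
        self._sentences = sentences

    def preprocess(self) -> List[str]:
        return self._sentences  # no-op base, like TextPreprocessor above


class LowercaseDecorator:
    """Wraps another preprocessor and post-processes its output."""

    def __init__(self, wrapped):
        self._wrapped = wrapped

    def preprocess(self) -> List[str]:
        return [s.lower() for s in self._wrapped.preprocess()]


prep = LowercaseDecorator(BasePreprocessor(["Hello World", "Second SENTENCE"]))
print(prep.preprocess())  # ['hello world', 'second sentence']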