Example #1
class PhraseTokenizer(PhraseSplitter):
    def __init__(self):
        self.tokenizer = Tokenizer()
        self.tokenizer.load()

    def tokenize(self, phrase):
        return self.tokenizer.tokenize(phrase)
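
A minimal usage sketch for the splitter above, assuming the Tokenizer shown in the example; the sample phrase is made up:

splitter = PhraseTokenizer()
words = splitter.tokenize(u'кошка ловит мышку')
# expected: a plain list of word tokens for the phrase
print(words)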
Example #2
def test_with_invalid_token(self):
    tokenizer = Tokenizer()
    try:
        tokenizer.get_data_by_token('invalid_token')
        raise Exception("get_data_by_token must fail for an invalid token")
    except InvalidTokenError:
        pass
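
The same check reads more idiomatically with unittest's assertRaises; this is a sketch assuming the test class extends unittest.TestCase and uses the same Tokenizer / InvalidTokenError names:

def test_with_invalid_token(self):
    tokenizer = Tokenizer()
    with self.assertRaises(InvalidTokenError):
        tokenizer.get_data_by_token('invalid_token')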
Example #3
class PhraseCleaner:
    def __init__(self):
        self.tokenizer = Tokenizer()
        self.tokenizer.load()

    def process(self, phrase):
        return u' '.join(self.tokenizer.tokenize(phrase))
Example #4
    def create_indexer(self):
        # get the list of URLs to be removed
        with open('database/removed_urls.pkl', 'rb') as f:
            removed_url = pickle.load(f)

        #hash_doc file stores mapping of doc_id and url
        hash_doc = open(self.doc_file, "w+")
        for dir in self.path_to_db.iterdir():
            if dir.is_dir():
                for file in dir.iterdir():
                    if not file.is_file():
                        continue
                    with open(file, 'r', encoding="ascii",
                              errors="ignore") as file:
                        parsed_json = json.load(file)
                        url = parsed_json['url']
                        if url in removed_url:
                            continue
                        url = self.removeFragment(url)
                        content = parsed_json['content']

                    tokenizer = Tokenizer(content, self.ngram)
                    token_tf = tokenizer.extract_texts()
                    hash_doc.write("%d, %s, %d\n" %
                                   (self.count_files, url, tokenizer.length))
                    self.add_tokens_to_dictionary(token_tf, self.count_files)
                    self.count_files += 1

        hash_doc.close()
        self.save_to_file()
        self.recalculate_tf_idf()
Example #5
    def __init__(self, parent=None):
        super().__init__(parent=parent)
        # self.setFont(QFont("default", 9))
        self.setWindowOpacity(1)
        self.setWindowIcon(QIcon("assets/calculator.png"))
        self.setWindowTitle("Calculator")

        sci = CalculatorView(ctype=GeneralCalcView._scientific)
        vec1 = CalculatorView(ctype=GeneralCalcView._vector1d)
        vec2 = CalculatorView(ctype=GeneralCalcView._vector2d)

        sci_controller = CalculatorController(
            sci, InputController(GeneralCalcView._scientific),
            Tokenizer("real"))
        vec1_controller = CalculatorController(
            vec1, InputController(GeneralCalcView._vector1d),
            Tokenizer("vec1"))
        vec2_controller = CalculatorController(
            vec2, InputController(GeneralCalcView._vector2d),
            Tokenizer("vec2"))

        self.addTab(sci, QIcon("assets/calculator.png"),
                    GeneralCalcView._scientific)
        self.addTab(vec1, QIcon("assets/vector1d.png"),
                    GeneralCalcView._vector1d)
        self.addTab(vec2, QIcon("assets/vector2d.png"),
                    GeneralCalcView._vector2d)
Example #6
    def __init__(self,
                 img_width,
                 img_height,
                 n_chars=7,
                 chars=None,
                 labels_path='/path/to/the/annotated/file',
                 root_img_dir='/path/to/img/dir'):
        self.n_chars = n_chars

        if chars is None:
            self.chars = list(
                '1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
            )
        else:
            self.chars = list(chars)

        self.tokenizer = Tokenizer(self.chars)

        df = pd.read_csv(labels_path, dtype={'img_id': str})
        self.annotaded_data = df.loc[df['text'] != 'no_one']
        self.root_img_dir = root_img_dir

        self.img_trans = transforms.Compose([
            transforms.Resize((img_height, img_width)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ])
Example #7
def tokenize_raw_text(save_dir):
    text_save_dir = os.path.join(save_dir, 'text_files')
    numpy_vectors_save_dir = os.path.join(save_dir, 'numpy_vectors')
    remove_folder(numpy_vectors_save_dir)
    make_folder(numpy_vectors_save_dir)
    hadms = []
    for filename in os.listdir(text_save_dir):
        if ".txt" in filename:
            hadm = filename.replace(".txt", "")
            hadms.append(hadm)
    log(f"Total number of text files in set: {len(hadms)}")

    log(f'Loading vocab dict from {VOCAB_DICT_PATH}')
    with open(VOCAB_DICT_PATH, 'rb') as f:
        vocab = pickle.load(f)
    tokenizer = Tokenizer(vocab)

    for hadm in tqdm.tqdm(hadms, desc='Tokenizing raw patient notes'):
        text = open(os.path.join(text_save_dir,
                                 str(hadm) + ".txt"), "r").read()
        words = tokenizer.process(text)
        vector = []
        for word in words:
            if word in vocab:
                vector.append(vocab[word])
            elif tokenizer.only_numerals(word) and (
                    len(vector) == 0 or vector[-1] != vocab["<NUM>"]):
                vector.append(vocab["<NUM>"])

        mat = np.array(vector)
        # saving word indices to file
        write_file = os.path.join(numpy_vectors_save_dir, f"{hadm}.npy")
        np.save(write_file, mat)
Example #8
def generateYelpSentenceExample(filename):
  tok = Tokenizer(preserve_case=False)
  # extracting tokens
  for line in data.generateLine(filename):
    review = json.loads(line)
    tokens = tok.sentence_tokenize(review['text'])
    stars = int(review['stars'])
    yield tokens, stars
Example #9
def __init__(self):
    self.tokenizer = Tokenizer()
    self.tokenizer.load()
    self.lemmatizer = Mystem()
    self.lexicon = Word2Lemmas()
    self.language_resources = LanguageResources()
    self.postagger = rupostagger.RuPosTagger()
    self.gg_dictionaries = GenerativeGrammarDictionaries()
    self.known_words = set()
Example #10
    def __init__(self, reduce_mode="gmean", device="cuda"):
        if device == "cpu":
            logger.warning("Running LMScorer on CPU. Scoring may be slow.")

        self.model = LMScorer.from_pretrained("gpt2",
                                              device=device,
                                              batch_size=1)
        self.reduce_mode = reduce_mode
        self.tokenizer = Tokenizer()
Example #11
def test_tokenizer(self):
    tokenizer = Tokenizer()
    origin_data = {
        'some_key': 'some_value',
        'additional_key': 'additional_value'
    }
    token = tokenizer.get_token_by_data(origin_data)
    self.assertIsNotNone(token)
    data_from_token = tokenizer.get_data_by_token(token)
    self.assertEqual(origin_data, data_from_token)
Example #12
class PhraseLemmatizer(PhraseSplitter):
    def __init__(self):
        self.tokenizer = Tokenizer()
        self.tokenizer.load()
        self.lemmatizer = Mystem()

    def tokenize(self, phrase):
        words = self.tokenizer.tokenize(phrase)
        wx = u' '.join(words)
        return [l for l in self.lemmatizer.lemmatize(wx) if len(l.strip()) > 0]
Example #13
def test_expired_token(self):
    tokenizer = Tokenizer()
    origin_data = {
        'some_key': 'some_value',
        'additional_key': 'additional_value'
    }
    token = tokenizer.get_token_by_data(origin_data, datetime(2018, 1, 1))
    try:
        tokenizer.get_data_by_token(token, datetime(2018, 1, 8))
        raise Exception('token must be expired')
    except TokenExpiredError:
        pass
Example #14
class PhraseStemmer(PhraseSplitter):
    def __init__(self):
        self.tokenizer = Tokenizer()
        self.stemmer = RussianStemmer()

    def tokenize(self, phrase):
        words = self.tokenizer.tokenize(phrase)
        return [self.stemmer.stem(w) for w in words if len(w.strip()) > 0]
Example #15
def set_custom_word(self, path):
    self.check_detector_initialized()
    word_freqs = self.load_word_freq_dict(path)
    # merge the dictionaries
    self.custom_word_freq.update(word_freqs)
    # merge the word-segmentation dictionary with the custom dictionary
    self.word_freq.update(self.custom_word_freq)
    self.tokenizer = Tokenizer(dict_path=self.word_freq_path,
                               custom_word_freq_dict=self.custom_word_freq,
                               custom_confusion_dict=self.custom_confusion)
    for k, v in word_freqs.items():
        self.set_word_frequency(k, v)
    logger.debug('Loaded custom word path: %s, size: %d' %
                 (path, len(word_freqs)))
Example #16
    def __init__(self, img_width, img_height, ds_size, n_chars=4, chars=None):
        self.gen = ImageCaptcha(img_width, img_height)
        self.size = ds_size

        self.n_chars = n_chars

        if chars is None:
            self.chars = list('1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')
        else:
            self.chars = list(chars)

        self.tokenizer = Tokenizer(self.chars)

        self.first_run = True
Example #17
def setUp(self):
    self.tokenizer = Tokenizer({
        'instruction_names': ['MOV', 'JMP'],
        'macro_names': ['DAT'],
        'word_registers': WORD_REGISTERS,
        'byte_registers': BYTE_REGISTERS,
    })
Example #18
class SentenceScorer:
    def __init__(self, reduce_mode="gmean", device="cuda"):
        if device == "cpu":
            logger.warning("Running LMScorer on CPU. Scoring may be slow.")

        self.model = LMScorer.from_pretrained("gpt2",
                                              device=device,
                                              batch_size=1)
        self.reduce_mode = reduce_mode
        self.tokenizer = Tokenizer()

    def score(self, sentence):
        sentence = self.tokenizer.detokenize(sentence)

        return self.model.sentence_score(sentence,
                                         reduce=self.reduce_mode,
                                         log=True)

    def select_best(self, sentences):
        scores = []

        for sent in sentences:
            sent_score = self.score(sent)
            scores.append((sent, sent_score))

        scores.sort(key=lambda x: x[1], reverse=True)
        # pp(scores)

        return scores[0][0]
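
A brief usage sketch for select_best, assuming the SentenceScorer above; the candidate token lists are made up, and device="cpu" simply triggers the slow-scoring warning:

scorer = SentenceScorer(reduce_mode="gmean", device="cpu")
candidates = [
    ["the", "cat", "sat", "on", "the", "mat"],
    ["the", "cat", "sat", "of", "the", "mat"],
]
best = scorer.select_best(candidates)  # candidate with the highest GPT-2 log-score
print(best)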
Example #19
def predict(sentences,
            config_file,
            ner_model_list,
            pretrain_model_file,
            dense_layer_model_file,
            re_model_file,
            vocab_file=None,
            device='cpu'):
    cfg = Config()
    cfg.load_config(config_file)
    vocab_file = cfg.config['vocab'] if vocab_file is None else vocab_file
    vocab = load_vocab(vocab_file)

    tokenizer = Tokenizer(vocab)
    pretrian_checkpoint = torch.load(pretrain_model_file, map_location=device)
    ner_label2id = {'B': 0, 'I': 1, 'O': 2, 'X': 3, '[start]': 4, '[end]': 5}
    re_label2id = {
        "NA": 0,
        "gene_associated_with_disease": 1,
        "disease_associated_with_tissue": 2,
        "disease_associated_with_disease": 3,
        "tissue_associated_with_tissue": 4
    }
    res = ner_predict(sentences, ner_label2id, cfg, ner_model_list,
                      pretrian_checkpoint, vocab, tokenizer, device)
    entitys = [item['entity'] for item in res]
    for idx, sent in enumerate(sentences):
        relations = re_predict(sent, [entitys[idx]], re_label2id, cfg,
                               pretrian_checkpoint, dense_layer_model_file,
                               re_model_file, vocab, tokenizer, device)
        res[idx]['relation'] = relations
    return res
Example #20
    def initialize_detector_dict(self):
        t1 = time.time()
        self.confusions = dict()
        self.spec_nouns = self.load_dict(self.spec_nouns_path)
        self.gangtai = self.load_dict(self.gangtai_path)
        self.common_confusion = self.load_dict(self.common_confusion_path)

        self.confusions.update(self.spec_nouns)
        self.confusions.update(self.gangtai)
        self.confusions.update(self.common_confusion)
        self.confusions_words = list(self.confusions.keys())
        confusions_values = list(self.confusions.values())
        self.confusions_words.extend(confusions_values)

        # word-frequency dict
        self.word_freq = self.load_word_freq_dict(self.word_freq_path)
        # custom word-segmentation dictionary
        self.custom_word_freq = self.load_word_freq_dict(
            self.custom_word_freq_path)
        self.person_names = self.load_word_freq_dict(self.person_name_path)
        self.place_names = self.load_word_freq_dict(self.place_name_path)
        self.stopwords = self.load_word_freq_dict(self.stopwords_path)
        # merge the segmentation dictionary with the custom dictionaries
        self.custom_word_freq.update(self.person_names)
        self.custom_word_freq.update(self.place_names)
        self.custom_word_freq.update(self.stopwords)
        self.word_freq.update(self.custom_word_freq)
        self.tokenizer = Tokenizer(dict_path=self.word_freq_path,
                                   custom_confusion_dict=self.confusions,
                                   custom_word_freq_dict=self.custom_word_freq)
        logger.debug('Loaded file: %s, size: %d, spend: %s s' %
                     (self.spec_nouns_path, len(
                         self.confusions), str(time.time() - t1)))
        self.initialized_detector_dict = True
Example #21
    def _initialize_detector(self):
        t1 = time.time()
        try:
            import kenlm
        except ImportError:
            raise ImportError(
                'pycorrector dependencies are not fully installed; '
                'kenlm is required for the statistical language model. '
                'Please use "pip install kenlm" to install it. '
                'On Windows, please install kenlm under Cygwin.')
        if not os.path.exists(self.language_model_path):
            filename = self.pre_trained_language_models.get(
                self.language_model_path, 'zh_giga.no_cna_cmn.prune01244.klm')
            url = self.pre_trained_language_models.get(filename)
            get_file(filename,
                     url,
                     extract=True,
                     cache_dir=config.USER_DIR,
                     cache_subdir=config.USER_DATA_DIR,
                     verbose=1)
        self.lm = kenlm.Model(self.language_model_path)
        t2 = time.time()
        logger.debug('Loaded language model: %s, spend: %.3f s.' %
                     (self.language_model_path, t2 - t1))

        # word-frequency dict
        self.word_freq = self.load_word_freq_dict(self.word_freq_path)
        # custom confusion set
        self.custom_confusion = self._get_custom_confusion_dict(
            self.custom_confusion_path)
        # custom word-segmentation dictionary
        self.custom_word_freq = self.load_word_freq_dict(
            self.custom_word_freq_path)
        self.person_names = self.load_word_freq_dict(self.person_name_path)
        self.place_names = self.load_word_freq_dict(self.place_name_path)
        self.stopwords = self.load_word_freq_dict(self.stopwords_path)
        # merge the segmentation dictionary with the custom dictionaries
        self.custom_word_freq.update(self.person_names)
        self.custom_word_freq.update(self.place_names)
        self.custom_word_freq.update(self.stopwords)
        self.word_freq.update(self.custom_word_freq)
        self.tokenizer = Tokenizer(dict_path=self.word_freq_path,
                                   custom_word_freq_dict=self.custom_word_freq,
                                   custom_confusion_dict=self.custom_confusion)
        t3 = time.time()
        logger.debug('Loaded dict file, spend: %.3f s.' % (t3 - t2))
        self.initialized_detector = True
Example #22
    def __init__(self, data_root, split='train', vocab_json=None):

        assert split in ['train', 'test', 'valid'], 'Invalid split'

        self.data_root = data_root
        self.df = pd.read_csv(os.path.join(self.data_root, 'data/', '{}_data.csv'.format(split)))
        self.lang = Tokenizer()
        if vocab_json is None:
            self.lang.add_words(self.df['action'])
            self.lang.add_words(self.df['object'])
            self.lang.add_words(self.df['location'])
            self.lang.make_dicts()
            self.vocab_json = 'word2idx.json'
            self.lang.export_json(self.vocab_json)
        else:
            self.vocab_json = vocab_json
            self.lang.import_json(vocab_json)
Example #23
class TextUtils(object):
    def __init__(self):
        self.tokenizer = Tokenizer()
        self.lemmatizer = Mystem()
        self.lexicon = Word2Lemmas()
        self.language_resources = LanguageResources()

    def load_dictionaries(self, data_folder):
        word2lemmas_path = os.path.join(data_folder, 'ru_word2lemma.tsv.gz')
        self.lexicon.load(word2lemmas_path)

    def canonize_text(self, s):
        # Collapse two or more consecutive spaces into one.
        s = re.sub("(\\s{2,})", ' ', s.strip())
        return s

    def ngrams(self, s, n):
        return [
            u''.join(z) for z in itertools.izip(*[s[i:] for i in range(n)])
        ]

    def words2str(self, words):
        return u' '.join(
            itertools.chain([BEG_WORD], filter(lambda z: len(z) > 0, words),
                            [END_WORD]))

    def tokenize(self, s):
        return self.tokenizer.tokenize(s)

    def lemmatize(self, s):
        words = self.tokenizer.tokenize(s)
        wx = u' '.join(words)
        return [l for l in self.lemmatizer.lemmatize(wx) if len(l.strip()) > 0]

    # Pad the word sequence with empty words on the left
    def lpad_wordseq(self, words, n):
        return list(
            itertools.chain(itertools.repeat(PAD_WORD, n - len(words)), words))

    # Pad the word sequence with empty words on the right
    def rpad_wordseq(self, words, n):
        return list(
            itertools.chain(words, itertools.repeat(PAD_WORD, n - len(words))))

    def get_lexicon(self):
        return self.lexicon
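
A short sketch of how the TextUtils helpers combine, assuming the class above (note the code targets Python 2, since it relies on itertools.izip); the sample string is made up:

utils = TextUtils()
s = utils.canonize_text(u'кошка  ловит   мышку')
words = utils.tokenize(s)
padded = utils.rpad_wordseq(words, 8)  # pad on the right with PAD_WORD up to 8 items
print(utils.words2str(words))          # tokens wrapped with BEG_WORD / END_WORD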
Example #24
class SSIGALPRDataset(Dataset):
    def __init__(self,
                 img_width,
                 img_height,
                 n_chars=7,
                 chars=None,
                 labels_path='/path/to/the/annotated/file',
                 root_img_dir='/path/to/img/dir'):
        self.n_chars = n_chars

        if chars is None:
            self.chars = list(
                '1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
            )
        else:
            self.chars = list(chars)

        self.tokenizer = Tokenizer(self.chars)

        df = pd.read_csv(labels_path, dtype={'img_id': str})
        self.annotaded_data = df.loc[df['text'] != 'no_one']
        self.root_img_dir = root_img_dir

        self.img_trans = transforms.Compose([
            transforms.Resize((img_height, img_width)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ])

    def __len__(self):
        return self.annotaded_data.shape[0]

    def __getitem__(self, item):
        annotaded_item = self.annotaded_data.iloc[item]

        img_id = annotaded_item[0]
        img_path = self.root_img_dir + '/' + img_id + '.png'
        img = Image.open(img_path)

        width, height = img.size
        x0 = annotaded_item[1] * width
        y0 = annotaded_item[2] * height
        x1 = annotaded_item[3] * width
        y1 = annotaded_item[4] * height

        roi = img.crop((x0, y0, x1, y1))

        groundtruth = annotaded_item[5]
        groundtruth_label = torch.full((self.n_chars + 2, ),
                                       self.tokenizer.EOS_token,
                                       dtype=torch.long)
        ts = self.tokenizer.tokenize(groundtruth)
        groundtruth_label[:ts.shape[0]] = torch.tensor(ts)

        return self.img_trans(roi), groundtruth_label
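
A hedged sketch of feeding the dataset to a PyTorch DataLoader; the sizes and paths below are placeholders:

from torch.utils.data import DataLoader

dataset = SSIGALPRDataset(img_width=160,
                          img_height=64,
                          labels_path='/path/to/the/annotated/file',
                          root_img_dir='/path/to/img/dir')
loader = DataLoader(dataset, batch_size=32, shuffle=True)
for images, labels in loader:
    # images: float tensors of shape (32, 3, 64, 160)
    # labels: token-id tensors of shape (32, n_chars + 2)
    break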
Example #25
def parse_document(document, type='single'):
    document = document.replace('\r', '').replace('\n', '')
    doc_num = str(re.search('<DOCNO>(.*?)</DOCNO>',
                            document).groups(1)).replace('(\'', '').replace(
                                '\',)', '').strip()
    doc_body = str(re.search('<TEXT>(.*?)</TEXT>', document).groups(1))
    processed_doc = pre_process(doc_body)
    tokens, positions = Tokenizer(processed_doc, type).tokenize_text()
    doc_len = len(tokens)
    return doc_num, tokens, positions, doc_len
Example #26
def load_data(seq_length, label):
    print("Start to load data.")
    start_time = time.time()
    # emb: collections.OrderedDict (an ordered dict)
    ## key: word (str), value: word vector (list of float)
    # dict_length: vocabulary size
    # emb_size: dimensionality of the word vectors
    emb, dict_length, emb_size = get_emb()
    # instantiate a tokenizer with all the words (str)
    tokenizer = Tokenizer(emb.keys())
    # emb_matrix: matrix mapping IDs to word vectors
    ## ID: each character is assigned an ID, e.g. "的" is 1, "是" is 2, and so on
    ## the first dimension of the matrix is the ID; the row at that ID is the corresponding word vector
    emb_matrix = get_emb_matrix(emb, tokenizer, dict_length, emb_size)

    # create an instance of the ChnSentiCorp_Clf class
    ## the class constructor already splits the data into training and test sets
    data_loader = Tnews_ChnCorp_Clf(label)
    # get the training data
    ## a list whose elements are instances of the data_example class
    ## data_example has two attributes: text (str) and label (str)
    train_examples = data_loader.get_train_examples()
    # get the validation data
    ## same structure as train_examples
    dev_examples = data_loader.get_dev_examples()

    def generate_dataloader(examples, tokenizer, seq_length):
        """
        生成数据加载器
        :param examples: list类型,以data_example类的实例为元素。
                        data_example类包含2个属性:text,str类型;label_id,int类型
        :param tokenizer:
        :param seq_length: 一个样本/序列长度
        :return: dataloader,迭代器类型;
        """
        features = multi_convert_example_to_feature(examples, tokenizer, seq_length)
        # ids: tensor (converted from a list)
        # each element is the ID sequence (a list) for one sample's text
        # one ID per character
        ids = torch.tensor([f.ids for f in features], dtype=torch.long)
        # label_ids: tensor (converted from a list)
        # each element is the label ID of one sample
        label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)

        dataset = TensorDataset(ids, label_ids)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
        return dataloader
           
    train_dataloader = generate_dataloader(train_examples, tokenizer, seq_length)
    dev_dataloader = generate_dataloader(dev_examples, tokenizer, seq_length)

    end_time = time.time()
    print("Data loading finishes. Time span: {:.2f}s".format(end_time - start_time))

    return emb_matrix, train_dataloader, dev_dataloader, tokenizer
Example #27
def train(config, device, RS='Supervised'):
    # Init tokenizer.
    tokenizer = Tokenizer(config.temp_dir, config.jieba_dict_file,
                          config.remove_stopwords, config.stopwords_file,
                          config.ivr)
    # Init feature index.
    feature_index = FeatureIndex(config, tokenizer=tokenizer)
    file_list = [config.labeled_file]
    if config.extra_train_file is not None:
        file_list.append(config.extra_train_file)
    if config.valid_file is not None:
        file_list.append(config.valid_file)
    feature_index.build_index(file_list)
    # Preprocess data.
    pre_process = PreProcess(config)
    train_data_dir, valid_data_dir, final_train_file, final_valid_file = pre_process.train_preprocess(
    )
    # Get PyTorch dataset.
    train_dataset = MixnetDataset(config, train_data_dir, feature_index,
                                  tokenizer)
    valid_dataset = MixnetDataset(config, valid_data_dir, feature_index,
                                  tokenizer, True)
    # Get NER model if necessary and compatible.
    need_ner = False
    for (feature, feature_config) in config.feature_config_dict.items():
        need_ner = need_ner or ("text" in feature_config.get(
            "type", "") and feature_config.get("seg_type", "word") == "char"
                                and feature_config.get("ner", False))
    if need_ner:
        logger.info("Enable NER, loading NER model...")
        # Use predict mode since we cannot train it without tag information.
        ner_model = NERModel(device, "predict")
    else:
        logger.info("Disable NER.")
        ner_model = None
    # Get PyTorch data loader.
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=1,
                                   shuffle=False,
                                   num_workers=config.read_workers)
    valid_data_loader = DataLoader(valid_dataset,
                                   batch_size=1,
                                   shuffle=False,
                                   num_workers=config.read_workers)
    # Init model.
    model = MixNet(config.model_config_dict,
                   config.output_config_dict,
                   feature_index.feature_info_dict,
                   feature_index.label_info_dict,
                   ner_model=ner_model)
    # Train model.
    solver = Solver(config, train_data_loader, valid_data_loader,
                    feature_index, model, device, RS)
    solver.build()
    solver.train()
Example #28
    def __init__(self):

        self.tokenizer = Tokenizer()

        self.data_path = global_config.project_path + '/data/transcriptions'

        self.vocabularies = {"voc_l_1": [], "voc_l_2": []}

        self.correct_tokenizing = {"pairs_incorrect": 0, "pairs_correct": 0}

        data1, data2 = itertools.tee(self.data_loading(), 2)

        self.create_voc(data1)
        self.find_s_freq_th(data2)

        for el in self.correct_tokenizing:
            print(el, self.correct_tokenizing[el])

        for v in self.vocabularies:
            self.find_w_freq_th(self.vocabularies[v], v)
Example #29
def chunk_text(text, tokenize=False):
    splitter = SentenceSplitter()
    chunker = Chunker()
    if tokenize:
        # NOT YET FINISHED
        tokenized_text = Tokenizer(text).tokenize_text()
    else:
        text = text.lower()
        sentences = splitter.split(text)
        chunker.chunk(sentences, len(text))
        #chunker.pp_chunks()
        return chunker.get_chunks()
Example #30
    def read_queries(self, static):
        queries = []
        global number, title
        avg_query = 0
        with open(self.opt['query_dir'], 'r') as f:
            for line in f:
                if 'num' in line:
                    number = line.split("Number:", 1)[1].strip()
                elif 'title' in line and not self.opt['threshold']:
                    title = line.split("Topic:", 1)[1].strip()
                    if static:
                        qterms = Tokenizer(
                            title.lower(),
                            self.opt['index_type']).tokenize_text()
                    else:
                        qterms = Tokenizer(title.lower(),
                                           'single').tokenize_text()
                    q = Query(title.lower(), number,
                              [q.lower() for q in qterms[0]])
                    if '/' in title:
                        q.split_slash()
                    queries.append(q)

                elif 'narr' in line and self.opt['threshold']:
                    line = f.readline()
                    narrative = ''
                    while '</top>' not in line:
                        narrative += line.replace('\n', '')
                        line = f.readline()
                    qterms = Tokenizer(narrative.lower().strip(),
                                       'single').tokenize_text()
                    avg_query += len(qterms[0])
                    q = Query(narrative.lower(), number,
                              [q.lower() for q in qterms[0]])
                    queries.append(q)
        if (self.opt['threshold']):
            print('Average Query Length before reduction: {0:.2f}'.format(
                avg_query / float(len(queries))))
        return queries
Example #31
def load_dataset(params):
    tokenizer = Tokenizer()
    tokenizer.load()

    # The dataset must be prepared in advance by the script ./preparation/prepare_req_interpretation_classif.py
    df = pd.read_csv(os.path.join(data_folder,
                                  'req_interpretation_dataset.csv'),
                     sep='\t',
                     encoding='utf-8')
    samples = [
        Sample(row['text'], int(row['label'])) for i, row in df.iterrows()
    ]

    # Tokenize the samples
    for sample in samples:
        sample.words = tokenizer.tokenize(sample.phrase)

    nb_0 = sum(sample.y == 0 for sample in samples)
    nb_1 = sum(sample.y == 1 for sample in samples)
    logging.info('nb_0={} nb_1={}'.format(nb_0, nb_1))

    max_wordseq_len = max(len(sample.words) for sample in samples)
    logging.info('max_wordseq_len={}'.format(max_wordseq_len))

    if params['padding'] == 'left':
        for sample in samples:
            sample.words = lpad_wordseq(sample.words, max_wordseq_len)
    else:
        for sample in samples:
            sample.words = rpad_wordseq(sample.words, max_wordseq_len)

    computed_params = {
        'max_wordseq_len': max_wordseq_len,
        'nb_0': nb_0,
        'nb_1': nb_1
    }

    return samples, computed_params
Example #32
def __init__(self,
             config,
             restart,
             frontier_factory=Frontier,
             worker_factory=Worker,
             subdomain_printer_factory=SubDomainPrinter,
             tokenizer_factory=Tokenizer):
    self.config = config
    self.logger = get_logger("CRAWLER")
    self.frontier = frontier_factory(config, restart)
    self.workers = list()
    self.worker_factory = worker_factory
    # use the injected factories so custom printers/tokenizers are honoured
    self.subdomain_printer = subdomain_printer_factory(config, restart)
    self.tokenizer = tokenizer_factory(config, restart)
Example #33
def __init__(self, instruction_set, registers):
    self.instruction_names = [
        inst.__name__
        for opcode, inst in instruction_set
    ]
    self.instruction_mapping = {
        inst.__name__: (opcode, inst)
        for opcode, inst in instruction_set
    }
    self.macro_names = ['DAT', 'DATN']
    self.word_registers = registers['word']
    self.byte_registers = registers['byte']
    self.keywords = set(self.instruction_names + self.macro_names + self.word_registers + self.byte_registers)
    self.tokenizer = Tokenizer({
        'instruction_names': self.instruction_names,
        'macro_names': self.macro_names,
        'word_registers': self.word_registers,
        'byte_registers': self.byte_registers,
    })
    self._reset_state()
Example #34
class TestTokenizer(unittest.TestCase):

    def setUp(self):
        self.tokenizer = Tokenizer({
            'instruction_names': ['MOV', 'JMP'],
            'macro_names': ['DAT'],
            'word_registers': WORD_REGISTERS,
            'byte_registers': BYTE_REGISTERS,
        })

    def test_label_alone(self):
        tokens = self.tokenizer.tokenize('labelname:')
        self.assertListEqual(tokens, [
            Token(TokenType.LABEL, 'labelname', 0),
        ])

    def test_code_alone(self):
        tokens = self.tokenizer.tokenize('mov ax 0x0100')
        self.assertListEqual(tokens, [
            Token(TokenType.INSTRUCTION, 'MOV', 0),
            Token(TokenType.WORD_REGISTER, 'AX', 4),
            Token(TokenType.WORD_LITERAL, 256, 7),
        ])

    def test_label_code_comment(self):
        tokens = self.tokenizer.tokenize('labelname: mov ax 0x0100  # comment text')
        self.assertListEqual(tokens, [
            Token(TokenType.LABEL, 'labelname', 0),
            Token(TokenType.INSTRUCTION, 'MOV', 11),
            Token(TokenType.WORD_REGISTER, 'AX', 15),
            Token(TokenType.WORD_LITERAL, 256, 18),
            Token(TokenType.COMMENT, ' comment text', 26),
        ])

    def test_random_case(self):
        tokens = self.tokenizer.tokenize('LabeL: mOv aX 0x00fF')
        self.assertListEqual(tokens, [
            Token(TokenType.LABEL, 'LabeL', 0),
            Token(TokenType.INSTRUCTION, 'MOV', 7),
            Token(TokenType.WORD_REGISTER, 'AX', 11),
            Token(TokenType.WORD_LITERAL, 255, 14),
        ])

    def test_random_whitespace(self):
        tokens = self.tokenizer.tokenize('			mov  	  ax      	   	    0x0100  			      ')
        self.assertListEqual(tokens, [
            Token(TokenType.INSTRUCTION, 'MOV', 3),
            Token(TokenType.WORD_REGISTER, 'AX', 11),
            Token(TokenType.WORD_LITERAL, 256, 28),
        ])

    def test_almost_keyword_identifiers(self):
        tokens = self.tokenizer.tokenize('MOVE AXE ALL BEEF FF')
        self.assertListEqual(tokens, [
            Token(TokenType.IDENTIFIER, 'MOVE', 0),
            Token(TokenType.IDENTIFIER, 'AXE', 5),
            Token(TokenType.IDENTIFIER, 'ALL', 9),
            Token(TokenType.IDENTIFIER, 'BEEF', 13),
            Token(TokenType.IDENTIFIER, 'FF', 18),
        ])

    def test_every_token_type(self):
        tokens = self.tokenizer.tokenize('label: other_label: MOV AX AL 0x1234 0x12 ^0x1234 ^label [AX] [AX+0x12]B [AX-0x12]B [0x1234]B [label]B [0x1234+0x56] [label+0x56] [0x1234+AX] [label+AX] JMP third_label .DAT "hello world with kinda # comment"  # actual comment')
        self.assertListEqual(tokens, [
            Token(TokenType.LABEL, 'label', 0),
            Token(TokenType.LABEL, 'other_label', 7),
            Token(TokenType.INSTRUCTION, 'MOV', 20),
            Token(TokenType.WORD_REGISTER, 'AX', 24),
            Token(TokenType.BYTE_REGISTER, 'AL', 27),
            Token(TokenType.WORD_LITERAL, 4660, 30),
            Token(TokenType.BYTE_LITERAL, 18, 37),
            Token(TokenType.ADDRESS_WORD_LITERAL, 4660, 42),
            Token(TokenType.ADDRESS_LABEL, 'label', 50),
            Token(TokenType.ABS_REF_REG, Reference('AX', 0, 'W'), 57),
            Token(TokenType.ABS_REF_REG, Reference('AX', 18, 'B'), 62),
            Token(TokenType.ABS_REF_REG, Reference('AX', -18, 'B'), 73),
            Token(TokenType.REL_REF_WORD, Reference(4660, None, 'B'), 84),
            Token(TokenType.REL_REF_LABEL, Reference('label', None, 'B'), 94),
            Token(TokenType.REL_REF_WORD_BYTE, Reference(4660, 86, 'W'), 103),
            Token(TokenType.REL_REF_LABEL_BYTE, Reference('label', 86, 'W'), 117),
            Token(TokenType.REL_REF_WORD_REG, Reference(4660, 'AX', 'W'), 130),
            Token(TokenType.REL_REF_LABEL_REG, Reference('label', 'AX', 'W'), 142),
            Token(TokenType.INSTRUCTION, 'JMP', 153),
            Token(TokenType.IDENTIFIER, 'third_label', 157),
            Token(TokenType.MACRO, 'DAT', 169),
            Token(TokenType.STRING_LITERAL, 'hello world with kinda # comment', 174),
            Token(TokenType.COMMENT, ' actual comment', 210)
        ])

    def test_error_unexpected_char(self):
        with self.assertRaises(UnexpectedCharacterError):
            self.tokenizer.tokenize('label: mov ?')

    def test_error_invalid_string_literal(self):
        with self.assertRaises(InvalidStringLiteralError):
            self.tokenizer.tokenize('label: mov \'single quote \\\' between single quotes\'')

    def test_error_unknown_macro(self):
        with self.assertRaises(UnknownMacroError):
            self.tokenizer.tokenize('label: .mac x')
Example #35
    # these are needed so that sentences containing
    # distorted vocabulary and the like are skipped
    rx1 = re.compile(u'[абвгдеёжзийклмнопрстуфхцчшщъыьэюя]+')
    dict_words = set()
    with zipfile.ZipFile(os.path.join(data_folder, 'ruwords.txt.zip')) as z:
        with z.open('ruwords.txt') as rdr:
            for line in rdr:
                word = line.decode('utf-8').strip()
                if rx1.match(word) is not None:
                    dict_words.add(word)

    uniq_phrases = set()
    phrases = []
    all_words = set()

    tokenizer = Tokenizer()

    for corpus_filepath in glob.glob(os.path.join(data_folder, r'e:\MVoice\lem\dictionary.src\corpus\syntax-ru.*.xml')):
        print(u'Parsing {}'.format(corpus_filepath))
        with codecs.open(corpus_filepath, 'r', 'utf-8') as rdr:
            for line in rdr:
                if line.startswith(u'<text>'):
                    line = line.replace(u'<text>', u'').replace(u'</text>', u'').strip()
                    if line not in uniq_phrases:
                        uniq_phrases.add(line)

                        words = tokenizer.tokenize(line)
                        if len(words) <= MAX_SENT_LEN:
                            all_words_known = True
                            for word in words:
                                if word not in dict_words:
Example #36
class Assembler:
    '''
    Assembler
    '''

    def __init__(self, instruction_set, registers):
        self.instruction_names = [
            inst.__name__
            for opcode, inst in instruction_set
        ]
        self.instruction_mapping = {
            inst.__name__: (opcode, inst)
            for opcode, inst in instruction_set
        }
        self.macro_names = ['DAT', 'DATN']
        self.word_registers = registers['word']
        self.byte_registers = registers['byte']
        self.keywords = set(self.instruction_names + self.macro_names + self.word_registers + self.byte_registers)
        self.tokenizer = Tokenizer({
            'instruction_names': self.instruction_names,
            'macro_names': self.macro_names,
            'word_registers': self.word_registers,
            'byte_registers': self.byte_registers,
        })
        self._reset_state()

    def assemble_file(self, filename):
        '''
        Assemble source code file and write to executable file
        '''
        logger.info('Assembling %s...', filename)
        source_code = ''
        with open(filename, 'rt') as input_file:
            source_code = input_file.read()
        opcode = self.assemble_code(source_code)
        binary_filename = os.path.splitext(filename)[0]
        exe = Executable(1, opcode)
        exe.save_to_file(binary_filename)
        logger.info('Assembled %s (%d bytes).', binary_filename, exe.length)

    def assemble_code(self, source_code):
        '''
        Assemble source code and return opcode
        '''
        self._reset_state()
        self.source_code = source_code
        logger.debug('Tokenizing...')
        tokenized_code = self._tokenize()
        logger.debug('Tokenized.')
        logger.debug('Collecting labels...')
        self._collect_labels(tokenized_code)
        logger.debug('Collected.')
        logger.debug('Generating opcode...')
        self._generate_opcode(tokenized_code)  # generate opcode first time: label addresses not yet good
        self._generate_opcode(tokenized_code)  # generate opcode second time: label addresses good
        logger.debug('Generated.')
        self._log_code()
        return self.opcode

    def _log_code(self):
        logger.debug('===CODE===')
        max_line_opcode_length = max(
            len(line_opcode)
            for line_number, opcode_pos, line_opcode, source_line, tokens in self.augmented_opcode
        )
        if max_line_opcode_length > MAX_LINE_OPCODE_LENGTH:
            max_line_opcode_length = MAX_LINE_OPCODE_LENGTH
        for line_number, opcode_pos, line_opcode, source_line, tokens in self.augmented_opcode:
            line_label_names = [token.value for token in tokens if token.type == TokenType.LABEL]
            if not line_label_names and not line_opcode:
                continue
            logger.debug(
                ' '.join([
                    '{:4}'.format(line_number),
                    utils.word_to_str(opcode_pos),
                    ' '.join([utils.byte_to_str(op) for op in line_opcode]),
                    ' ' if line_opcode else '',
                    '   ' * (max_line_opcode_length - len(line_opcode)),
                    source_line,
                ])
            )

    def _reset_state(self):
        self.source_code = ''
        self.labels = {}
        self.opcode = []
        self.augmented_opcode = []

    def _tokenize(self):
        tokenized_code = []
        for idx, source_line in enumerate(self.source_code.split('\n')):
            line_number = idx + 1
            try:
                meaningful_tokens = [
                    token
                    for token in self.tokenizer.tokenize(source_line)
                    if token.type != TokenType.COMMENT
                ]
            except AldebaranError as ex:
                msg, pos = ex.args
                _raise_error(source_line, line_number, pos, str(msg), ex.__class__)
            tokenized_code.append((
                line_number,
                source_line,
                meaningful_tokens,
            ))
        return tokenized_code

    def _collect_labels(self, tokenized_code):
        for line_number, source_line, tokens in tokenized_code:
            for token in tokens:
                if token.type == TokenType.LABEL:
                    label_name = token.value
                    if label_name in self.labels:
                        _raise_error(source_line, line_number, token.pos, 'Label already defined', LabelError)
                    if label_name in self.keywords:
                        _raise_error(source_line, line_number, token.pos, 'Label name cannot be keyword', LabelError)
                    self.labels[label_name] = 0

    def _generate_opcode(self, tokenized_code):
        self.opcode = []
        self.augmented_opcode = []
        opcode_pos = 0
        for line_number, source_line, tokens in tokenized_code:
            line_opcode = self._parse_line(line_number, source_line, tokens, opcode_pos)
            self.opcode += line_opcode
            self.augmented_opcode.append((
                line_number,
                opcode_pos,
                line_opcode,
                source_line,
                tokens,
            ))
            opcode_pos += len(line_opcode)

    def _parse_line(self, line_number, source_line, tokens, opcode_pos):
        state = ParserState.LABEL
        inst_name = None
        macro_name = None
        args = []
        for token in tokens:
            if state == ParserState.LABEL:
                if token.type == TokenType.LABEL:
                    self.labels[token.value] = opcode_pos
                elif token.type == TokenType.INSTRUCTION:
                    state = ParserState.INSTRUCTION
                    inst_name = token.value
                elif token.type == TokenType.MACRO:
                    state = ParserState.MACRO
                    macro_name = token.value
                else:
                    _raise_error(source_line, line_number, token.pos, 'Unexpected token: {}'.format(token.value), ParserError)
            elif state == ParserState.INSTRUCTION:
                if token.type in ARGUMENT_TYPES:
                    args.append(token)
                else:
                    _raise_error(source_line, line_number, token.pos, 'Unexpected token: {}'.format(token.value), ParserError)
            elif state == ParserState.MACRO:
                if token.type in ARGUMENT_TYPES:
                    args.append(token)
                else:
                    _raise_error(source_line, line_number, token.pos, 'Unexpected token: {}'.format(token.value), ParserError)
            elif state == ParserState.ARGUMENTS:
                if token.type in ARGUMENT_TYPES:
                    args.append(token)
                else:
                    _raise_error(source_line, line_number, token.pos, 'Unexpected token: {}'.format(token.value), ParserError)
            else:
                _raise_error(source_line, line_number, token.pos, 'Unknown parser state: {}'.format(state), ParserError)
        if inst_name is not None:
            line_opcode = self._parse_instruction(inst_name, args, source_line, line_number, opcode_pos)
        elif macro_name is not None:
            line_opcode = self._parse_macro(macro_name, args, source_line, line_number, opcode_pos)
        else:
            line_opcode = []
        return line_opcode

    def _parse_instruction(self, inst_name, args, source_line, line_number, opcode_pos):
        inst_opcode, inst = self.instruction_mapping[inst_name]
        operands = self._parse_operands(args, source_line, line_number, opcode_pos)
        if len(operands) < inst.operand_count:
            _raise_error(source_line, line_number, None, 'Not enough operands: {} instead of {}'.format(len(operands), inst.operand_count), OperandError)
        if len(operands) > inst.operand_count:
            _raise_error(source_line, line_number, None, 'Too many operands: {} instead of {}'.format(len(operands), inst.operand_count), OperandError)
        # TODO: check inst.oplens
        # if None: no check
        # otherwise list of strings of B|W|*
        opcode = [inst_opcode]
        for operand_opcode in operands:
            opcode += operand_opcode
        return opcode

    def _parse_macro(self, macro_name, args, source_line, line_number, opcode_pos):
        if macro_name == 'DAT':
            opcode = []
            for arg in args:
                if arg.type == TokenType.STRING_LITERAL:
                    opcode += list(arg.value.encode('utf-8'))
                elif arg.type == TokenType.BYTE_LITERAL:
                    opcode.append(arg.value)
                elif arg.type == TokenType.WORD_LITERAL:
                    opcode += utils.word_to_binary(arg.value)
                else:
                    _raise_error(source_line, line_number, arg.pos, 'Parameter of macro DAT must be a byte, word or string literal, not {}'.format(arg.type), MacroError)
            return opcode
        if macro_name == 'DATN':
            if len(args) != 2:
                _raise_error(source_line, line_number, None, 'Macro DATN requires exactly 2 parameters, not {}'.format(len(args)), MacroError)
            repeat_arg, value_arg = args
            if repeat_arg.type not in {TokenType.BYTE_LITERAL, TokenType.WORD_LITERAL}:
                _raise_error(source_line, line_number, repeat_arg.pos, 'The first parameter of macro DATN must be a byte or word literal, not {}'.format(repeat_arg.type), MacroError)
            repeat_number = repeat_arg.value
            if value_arg.type not in {TokenType.BYTE_LITERAL, TokenType.WORD_LITERAL, TokenType.STRING_LITERAL}:
                _raise_error(source_line, line_number, value_arg.pos, 'The second parameter of macro DATN must be a byte, word or string literal, not {}'.format(value_arg.type), MacroError)
            opcode = []
            for _ in range(repeat_number):
                if value_arg.type == TokenType.STRING_LITERAL:
                    opcode += list(value_arg.value.encode('utf-8'))
                elif value_arg.type == TokenType.BYTE_LITERAL:
                    opcode.append(value_arg.value)
                else:
                    opcode += utils.word_to_binary(value_arg.value)
            return opcode
        # TODO: add more macros
        _raise_error(source_line, line_number, None, 'Unknown macro: {}'.format(macro_name), MacroError)

    def _parse_operands(self, args, source_line, line_number, opcode_pos):
        operands = []
        for arg in args:
            if arg.type == TokenType.STRING_LITERAL:
                _raise_error(source_line, line_number, arg.pos, 'String literal cannot be instruction operand: {}'.format(arg.value), OperandError)
            if arg.type in LABEL_REFERENCE_TYPES:
                arg = self._substitute_label(arg, source_line, line_number, opcode_pos)
            try:
                operands.append(get_operand_opcode(arg))
            except AldebaranError as ex:
                orig_msg = '{}({})'.format(
                    ex.__class__.__name__,
                    str(ex),
                )
                arg_name = '{}({})'.format(
                    arg.type.name,
                    arg.value,
                )
                _raise_error(
                    source_line,
                    line_number,
                    arg.pos,
                    'Could not parse operand {} due to {}'.format(arg_name, orig_msg),
                    OperandError,
                )
        return operands

    def _substitute_label(self, arg, source_line, line_number, opcode_pos):
        assert arg.type in LABEL_REFERENCE_TYPES
        if arg.type == TokenType.ADDRESS_LABEL or arg.type == TokenType.IDENTIFIER:
            label_name = arg.value
        else:
            label_name = arg.value.base
        try:
            label_address = self.labels[label_name]
        except KeyError:
            _raise_error(source_line, line_number, arg.pos, 'Unknown label reference: {}'.format(arg.value), LabelError)
        relative_address = label_address - opcode_pos
        new_type = {
            TokenType.ADDRESS_LABEL: TokenType.ADDRESS_WORD_LITERAL,
            TokenType.IDENTIFIER: TokenType.ADDRESS_WORD_LITERAL,
            TokenType.REL_REF_LABEL_REG: TokenType.REL_REF_WORD_REG,
            TokenType.REL_REF_LABEL_BYTE: TokenType.REL_REF_WORD_BYTE,
            TokenType.REL_REF_LABEL: TokenType.REL_REF_WORD,
        }[arg.type]
        if arg.type == TokenType.ADDRESS_LABEL or arg.type == TokenType.IDENTIFIER:
            new_value = relative_address
        else:
            new_value = Reference(relative_address, arg.value.offset, arg.value.length)
        return Token(
            new_type,
            new_value,
            arg.pos,
        )