Example #1
    def setUp(self):
        self.inst = raw_txt_to_inst("balhd       dfds dkkdf ldf\n" +
                                    "experiment  this is      a\n" +
                                    "this        is    a      test")

        self.gloss_tokens = tokenize_string("experiment  this is      a")
        self.trans_tokens = tokenize_string("this        is    a      test")

        self.a = Alignment([(2,3),(1,2),(3,4)])
        self.a2 = Alignment([(2,3),(1,2),(3,4),(4,1)])
Example #2
File: parsing.py  Project: rgeorgi/intent
def create_words_tier_from_string(string):
    # Tokenize the string on whitespace and build a words tier with one Item per token.
    tokens = tokenize_string(string, tokenizer=whitespace_tokenizer)
    wt = Tier(type=WORDS_TYPE)
    for token in tokens:
        i = Item(id=ask_item_id(wt), text=token.value())
        wt.append(i)
    return wt
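
A minimal usage sketch for the tier builder above. The call and the item access are assumptions based on the snippet (a Tier is assumed to be iterable over its Items, and each Item to keep the text it was created with):

    # Hypothetical input; tier/item behavior is assumed, not confirmed by the project.
    wt = create_words_tier_from_string("el perro corre")
    print([item.text for item in wt])   # expected: ['el', 'perro', 'corre']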
Example #3
    def add_word_tag(self, word, tag, prev_word=None, next_word=None, count=1):
        for subword in tokenize_string(word, tokenizer=morpheme_tokenizer):
            subword = subword.seq
            if tag not in self.sw_dict[subword].keys():
                self.sw_dict[subword][tag] = {'contexts': [(prev_word, next_word)], 'count': count}
            else:
                self.sw_dict[subword][tag]['contexts'].append((prev_word, next_word))
                self.sw_dict[subword][tag]['count'] += count
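
The unconditional self.sw_dict[subword][tag] lookup above only works if an unseen subword maps to an empty dict automatically, which suggests sw_dict is a defaultdict. A minimal sketch of the assumed initialization; the surrounding class is hypothetical and only the attribute name comes from the snippet:

    from collections import defaultdict

    class SubwordTagModel:                 # hypothetical host class for add_word_tag
        def __init__(self):
            # subword -> tag -> {'contexts': [(prev, next), ...], 'count': int}
            self.sw_dict = defaultdict(dict)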
Example #4
def remove_tags(source_path, target_path):
    # Read the tagged source file line by line, tokenize with the tag tokenizer,
    # and write the resulting text to the target file.
    source_f = open(source_path, 'r', encoding='utf-8')
    target_f = open(target_path, 'w', encoding='utf-8')

    for line in source_f:
        tokens = tokenize_string(line, tokenizer=tag_tokenizer)
        target_f.write(tokens.text()+'\n')

    source_f.close()
    target_f.close()
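
A hedged usage sketch for remove_tags, assuming tag_tokenizer handles slash-tagged input of the form word/TAG and that .text() drops the tags when rejoining; the paths and the tag format are illustrative only:

    remove_tags('corpus.tagged.txt', 'corpus.plain.txt')
    # e.g. a line 'The/DT dog/NN runs/VBZ' would come out as 'The dog runs'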
Example #5
    def clean_new_trans_test(self):
        orig = '"I don\'t understand any of it; I don\'t understand it at all"'
        expected = " I don\'t understand any of it; I don\'t understand it at all "
        result = clean_trans_string(orig)

        self.assertEqual(result, expected)

        tokenized          = tokenize_string(result, tokenizer=sentence_tokenizer).text()
        tokenized_expected = "I don't understand any of it ; I don't understand it at all"
        self.assertEqual(tokenized, tokenized_expected)
Example #6
    def from_giza_lines(cls, tgt, aln):
        """
        Return the target-to-source alignment from the target and aln lines
        of giza.
        """
        # Start by getting the target tokens from the provided target line
        tgt_tokens = tokenize_string(tgt, whitespace_tokenizer)

        # next, read the alignments from the aln line.
        a = Alignment.from_giza(aln)

        # Finally, the source tokens are also on the aln line.
        alignments = re.findall(r'(\S+) \(\{(.*?)\}\)', aln)

        # Get the src tokens, skipping the initial NULL entry...
        src_tokens = [pair[0] for pair in alignments[1:]]

        # And create the aln sent.
        aln_sent = cls(src_tokens, tgt_tokens, a)
        return aln_sent
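
A hedged usage sketch for from_giza_lines, using the standard GIZA++ A3-style target/alignment line pair that the regex above expects; the class name AlignedSent is assumed from Example #8:

    tgt = "the dog runs"
    aln = "NULL ({ }) le ({ 1 }) chien ({ 2 }) court ({ 3 })"
    aln_sent = AlignedSent.from_giza_lines(tgt, aln)   # target-to-source alignment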
Example #7
def stanford_stdout_handler(output, queue):
    # Handle a line of Stanford tagger stdout: tokenize the tagged output and buffer it on the queue.
    queue.append(tokenize_string(output, tokenizer=tag_tokenizer))
Example #8
    def runTest(self):
        s1 = tokenize_string('This is a test sentence')
        s2 = tokenize_string('test sentence this is')
        a = Alignment([(1,3),(2,4),(4,1),(5,2)])

        a_sent = AlignedSent(s1, s2, a)