Example #1
def ext_json():
    rdfUrl = ''
    tok = Tokenizer()
    if request.method == 'POST':
        rdf = request.form['data']
        status_test = "0"#request.form['status']
        filters = ""#request.form['exculdeurls']
        #rdf = "http://jpp.no-ip.org/MAD_J.rdf"
        try:
            #r = requests.get(rdf)
            gg = Graph()
            #g.load(rdfUrl)
            rdf_content = StringIO.StringIO(rdf.encode('utf-8'))
            #print rdf_content.readline()
            gg.parse(rdf_content,  format="xml")
            ext = Extractor(gg)
            uris = ext.getUris()
            mapping = MapFactory()
            for uri in uris:
                term = tok.tokenized_url(uri)
                uri_status = ""
                if status_test == "1":
                    uri_status = ext.testUri(uri)
                else:
                    uri_status = "N/A"  
                uri_lookup = str(uri)+"\"" 
                lnum = ext.get_lines(rdf_content, uri_lookup)          
                ent = MapEntry(uri, term, "", lnum, uri_status)
                mapping.add(ent)
            jsonized_result = json.dumps(mapping.get())              
            return Response(jsonized_result, mimetype='application/json')
        except requests.exceptions.ConnectionError:
            X2Rwarning = 'X2R Warning: The requested URL raises ConnectionError~!!!'
            return X2Rwarning
Example #2
def main():
    ## args
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', '--reviews', required=True, help='Review data file')
    parser.add_argument('-o', '--out', required=True, help='Inverted index output file')
    parser.add_argument('-s', '--stop', required=True, help='Stopword list')
    opts = parser.parse_args()

    ## Output file
    csv_writer = csv.writer(open(opts.out, 'w'), delimiter="\t")
    csv_writer.writerow(['token', 'business_id', 'review_id', 'position', '...'])

    ## Tokenizer
    tk = Tokenizer(opts.stop)
    token_map = defaultdict(list)

    ## Tokenize review texts
    # for each word in the vocabulary (in this case all words found in all reviews):
    # business id, review id, and position of each term occurrence
    # instead of using the review id, uses the line on which the review occurs as a unique identifier
    reviews = open(opts.reviews)
    for review_num, line in enumerate(reviews):
        review = json.loads(line)
        business_id = review['business_id'].encode('utf-8')
        tokens = tk.tokenize(review['text'])
        for position, word in enumerate(tokens):
            token_map[word].append((business_id, review_num, position))

    ## Print sorted inverted index
    for token in sorted(token_map):
        row = [token]
        row.extend(token_map[token])
        csv_writer.writerow(row)
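Each row written above is the token followed by the repr() of one (business_id, review_line, position) tuple per occurrence. A minimal read-back sketch, assuming a hypothetical output path and using ast.literal_eval to recover the tuples:

import ast
import csv

with open('inverted_index.tsv') as f:                # hypothetical path passed as --out
    reader = csv.reader(f, delimiter='\t')
    next(reader)                                     # skip the ['token', 'business_id', ...] header
    for row in reader:
        token = row[0]
        postings = [ast.literal_eval(cell) for cell in row[1:]]   # (business_id, review_line, position)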
Example #3
    def _tokenize_tweet(self, tweet):
        """
        Input: tweet (String)
        Output: List of tokens
        """
        tok = Tokenizer(preserve_case=False)
        return tok.tokenize(tweet)
Example #4
File: cli.py Project: jwilk/jtc
def main(args):
    try:
        (opts, args) = getopt(args, "o:TPX")
    except GetoptError:
        usage()
    if len(args) != 1:
        usage()

    from tokenizer import Tokenizer
    from parser import Parser
    from error import JtError
    import context
    from os.path import abspath

    filename = abspath(args[0])
    stdin = file(filename, "r")
    target = "P"
    stdout = sys.stdout
    for (ok, ov) in opts:
        if ok in ("-T", "-P", "-X"):
            target = ok[1]
        elif ok == "-o":
            stdout = file(ov, "w")
    contents = stdin.read()
    tokenizer = Tokenizer()
    tokenizer.build()
    tokenizer.input(contents)
    parser = Parser(tokenizer)
    result_tree = None
    try:
        result_tree = parser.parse()
    except JtError, error:
        failure(error)
Example #5
    def execute(self):
        if len(self.proj_paths) > 0:
            logging.info('Starting tokenizer. Producibles (logs, output, etc) can be found under the name '+self.target_folders)
            tokenizer = Tokenizer(self.proj_paths, self.DB_user, self.DB_pass, self.DB_name, logging, self.logs_folder, self.output_folder, self.N_PROCESSES, self.BATCH_SIZE, self.PROJECTS_CONFIGURATION)
            tokenizer.execute()
        else:
            logging.warning('The list of new projects is empty (or these are already on the DB).')
Example #6
    def tokenize(self, **kwargs):
        """
        Returns the tokenized string using a parser.
        """

        string_tokenizer = Tokenizer()

        return string_tokenizer.tokenize(kwargs.get("text"), kwargs.get("parser"))
Example #7
def main():
    tok = Tokenizer()
    mapping = MapFactory()
    uris = ["http://abc.ee.ntu/alf_123", "http://sc.e.ncli.ABCdefGU"]
    for uri in uris:
        term = tok.tokenized_url(uri)
        ent = MapEntry(uri, term, "", "", "")
        mapping.add(ent)
    jsonized_result = json.dumps(mapping.get())   
    print jsonized_result   
Example #8
    def interpret_line(self, line):
        tokenizer = Tokenizer()
        tokenizer.parse(line)

        first_token = tokenizer.getNextToken()
        if first_token.type == Token.NUMBER:
            self.lines[int(first_token.value)] = tokenizer.prog[tokenizer.pos:]
            self.sort_lines()
        else:
            self.run_line(line)
Example #9
    def testExecutionTreeWithItemAssignment(self):

        c = ExpressionCompiler()
        tokenizer = Tokenizer()

        tokenizer.tokenize("A[B]= 1 + R")

        tokenizer.next()

        expr = c.compile(tokenizer)

        exec_tree = expr.get_execution_tree()

        print "Expression Tree %s\n" % (exec_tree)

        self.assertEqual(
            "( = ( item_assign ( literal A ) ( index ( literal B ) ) ) ( + ( literal 1.0 ) ( literal R ) ) )", exec_tree
        )

        # a little bit more complex
        tokenizer.tokenize("A[B+(C*3)+1]= 1 + R")

        tokenizer.next()

        expr = c.compile(tokenizer)

        exec_tree = expr.get_execution_tree()

        print "Expression Tree %s\n" % (exec_tree)

        self.assertEqual(
            "( = ( item_assign ( literal A ) ( index ( + ( + ( literal B ) ( * ( literal C ) ( literal 3.0 ) ) ) ( literal 1.0 ) ) ) ) ( + ( literal 1.0 ) ( literal R ) ) )",
            exec_tree,
        )
Example #10
    def testEvaluateFactors(self):

        c = ExpressionCompiler()

        tokenizer = Tokenizer()
        tokenizer.tokenize("7*7")
        tokenizer.next()

        expr = c.compile(tokenizer)

        result = expr.evaluate()

        print "result = %s\n" % (result)

        self.assertEqual(49.0, result)

        tokenizer.tokenize("7*7/7")
        tokenizer.next()

        expr = c.compile(tokenizer)

        result = expr.evaluate()

        print "result = %s\n" % (result)

        self.assertEqual(7.0, result)
Example #11
def main():
	# first read in the inverted index file
	parser = argparse.ArgumentParser()
	parser.add_argument('-index', required=True, help='Path to inverted index file')
	parser.add_argument('-business', required=False, help='Path to yelp business data json file', default="/course/cs1951a/pub/final/data/extracted/yelp_academic_dataset_business.json")
	opts = parser.parse_args()

	# Pre-processing
	f_index = open(opts.index,'r')
	print "loading index file..."
	wordsmap = {}
	# count = 0
	# for line in f_index:
	# 	count += 1
	# 	j_obj = json.load(line)
	# 	for k, v in j_obj.items():
	# 		wordsmap[k] = v
	# 	j_obj = None
	# 	if count % 100 == 0:
	# 		print count
	wordsmap = json.load(f_index)
	print "done"
	f_index.close()
	b_map = {}
	print "loading business file..."
	f_b = open(opts.business, 'r')
	line_num = 0
	for line in f_b:
		b_json = json.loads(line)
		b_map[str(line_num)]={"business_id":b_json['business_id'],"review_count":int(b_json['review_count']), "stars":float(b_json['stars'])}
		line_num += 1
	print "done"


	tokenizer = Tokenizer()
	# TODO: need to check error input  
	# Bug: c-d exit situation
	
	for line in sys.stdin:
		result = []
		line = line.strip('\n')
		if len(line)==0:
			continue
		elif line[0]=='"':
			line = line.strip('"')
			words = tokenizer.process_review(line)
			result = phrase_query(words, wordsmap)
		elif len(line.split())==1:
			words = tokenizer.process_review(line)
			result = one_word_query(words[0], wordsmap)
		else:
			words = tokenizer.process_review(line)
			result = free_text_query(words, wordsmap)
		rank_res = rank(words,result,b_map,wordsmap)
		print rank_res
Example #12
    def __init__( self, string_to_tokenize = '', prefix_chars = '-=<>!+*&|/%^', suffix_chars = '=<>&|' ):
        Tokenizer.__init__( self, string_to_tokenize )
        self.prefix     =   prefix_chars
        self.suffix     =   suffix_chars
    ### Setup JavaScriptTokenizer-specific regexen
        self.PREFIX             =   re.compile( "[%s]" % self.prefix )
        self.SUFFIX             =   re.compile( "[%s]" % self.suffix )
        self.BEGIN_IDENTIFIER   =   self.CHARACTER
        self.MULTILINE_COMMENT  =   re.compile("[\*]")
        self.END_COMMENT        =   re.compile("[/]")
        self.ESCAPE             =   re.compile("[\\\\]")
Example #13
def correct_macro_syntax_test():
    macro_string = """
!config {
output: pdf, html
table_of_contents: true
}"""
    tokenizer = Tokenizer(macro_string)
    for token in tokenizer:
        if token[0] == "!":
            open_brackets = tokenizer.next()
            if open_brackets != "{":
                raise DMLSyntaxError(open_brackets, "{")
Example #14
    def test_ast_opts(self):
        a = AST()
        t = Tokenizer()
        opts = {}
        opts['get-me'] = 'I am superman'

        a.parse(t.parse('{{ opts.get("get-me") }}'))
        c = a.traverse(opts=opts)
        self.assertEqual(c.buffer, 'I am superman')

        a.parse(t.parse('{@ if opts.get("get-me"): @}I am superman{@ end @}'))
        c = a.traverse(opts=opts)
        self.assertEqual(c.buffer, 'I am superman')
Example #15
	def __init__(self, _what, _who, _when, _where, _why, _how, _text):
		self.what = Tokenizer.removeNonAscii(_what).replace(".\"",". \"")
		self.who = Tokenizer.removeNonAscii(_who).replace(".\"",". \"")
		self.when = Tokenizer.removeNonAscii(_when).replace(".\"",". \"")
		self.where = Tokenizer.removeNonAscii(_where).replace(".\"",". \"")
		self.why = Tokenizer.removeNonAscii(_why).replace(".\"",". \"")
		self.how = Tokenizer.removeNonAscii(_how).replace(".\"",". \"")
		self.text = Tokenizer.removeNonAscii(_text).replace(".\"",". \"")
		self.sentences = Tokenizer.getSentences(self.text)
		self.tokenized_sentences = [Tokenizer.getTokens(sentence) for sentence in self.sentences]
Example #16
def analyze(string):
    scanner = Tokenizer()
    list_of_tokens= scanner.tokenize(string)
    print "-------------"
    print "TOKEN LIST:"
    print list_of_tokens
    parser = QueryParser()
    print "----------------"
    print "PARSING RESULT"
    print "----------------"
    print parser.parse(list_of_tokens)

    semparser = QuerySemanticParser(parser.parse(list_of_tokens))
    semparser.parse()
Example #17
    def __init__( self, string_to_tokenize = '' ):
        Tokenizer.__init__( self, string_to_tokenize )

    ### Setup CSSTokenizer-specific regexen
### Throwing everything away after reading through the CSS spec.
### I ought be using the specified tokens, so I will.
# IDENT {ident}
# ATKEYWORD @{ident}
# STRING    {string}
# INVALID   {invalid}
# HASH  #{name}
# NUMBER    {num}
# PERCENTAGE    {num}%
# DIMENSION {num}{ident}
# URI   url\({w}{string}{w}\)
# |url\({w}([!#$%&*-~]|{nonascii}|{escape})*{w}\)
# UNICODE-RANGE U\+[0-9a-f?]{1,6}(-[0-9a-f]{1,6})?
# CDO   <!--
# CDC   -->
# ; ;
# { \{
# } \}
# ( \(
# ) \)
# [ \[
# ] \]
# S [ \t\r\n\f]+
# COMMENT   \/\*[^*]*\*+([^/*][^*]*\*+)*\/
# FUNCTION  {ident}\(
# INCLUDES  ~=
# DASHMATCH |=
# DELIM any other character not matched by the above rules, and neither a single nor a double quote
# 
# 
# ident [-]?{nmstart}{nmchar}*
# name  {nmchar}+
# nmstart   [_a-z]|{nonascii}|{escape}
# nonascii  [^\0-\177]
# unicode   \\[0-9a-f]{1,6}(\r\n|[ \n\r\t\f])?
# escape    {unicode}|\\[^\n\r\f0-9a-f]
# nmchar    [_a-z0-9-]|{nonascii}|{escape}
# num   [0-9]+|[0-9]*\.[0-9]+
# string    {string1}|{string2}
# string1   \"([^\n\r\f\\"]|\\{nl}|{escape})*\"
# string2   \'([^\n\r\f\\']|\\{nl}|{escape})*\'
# invalid   {invalid1}|{invalid2}
# invalid1  \"([^\n\r\f\\"]|\\{nl}|{escape})*
# invalid2  \'([^\n\r\f\\']|\\{nl}|{escape})*
# nl    \n|\r\n|\r|\f
# w [ \t\r\n\f]*
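For reference, the grammar macros listed above can be expanded textually into Python regexes; a minimal sketch, not part of CSSTokenizer itself, covering only the ident and num productions:

import re

NONASCII = r'[^\0-\177]'
UNICODE  = r'\\[0-9a-f]{1,6}(\r\n|[ \n\r\t\f])?'
ESCAPE   = r'(?:' + UNICODE + r'|\\[^\n\r\f0-9a-f])'
NMSTART  = r'(?:[_a-z]|' + NONASCII + r'|' + ESCAPE + r')'
NMCHAR   = r'(?:[_a-z0-9-]|' + NONASCII + r'|' + ESCAPE + r')'

IDENT = re.compile(r'-?' + NMSTART + NMCHAR + r'*', re.IGNORECASE)   # ident: [-]?{nmstart}{nmchar}*
NUM   = re.compile(r'[0-9]+|[0-9]*\.[0-9]+')                         # num:   [0-9]+|[0-9]*\.[0-9]+

assert IDENT.match('-moz-border-radius')
assert NUM.match('42')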
Example #18
    def testEvaluateNegation(self):

        c = ExpressionCompiler()
        tokenizer = Tokenizer()

        tokenizer.tokenize("not 0")
        tokenizer.next()

        expr = c.compile(tokenizer)

        result = expr.evaluate()

        print "result = %s\n" % (result)

        self.assertEqual(1, result)
Example #19
    def interpretStatement(self):
        tokens = Tokenizer(self.IR)
        instr = tokens.next().lower()
        stmt = ""
        while tokens.peek() is not None:
            stmt += tokens.next()
        if instr[0] == 's':
            self.interpretSet(stmt)
        elif instr[0] == 'j':
            if len(instr) == 5:
                self.interpretJumpt(stmt)
            elif len(instr) == 4:
                self.interpretJump(stmt)
        elif instr[0] == 'h':
            self.halt(tokens)
Example #20
class Parser(object):
    def __init__(self, stmt):
        # We always wrap with ()'s
        self.tnz = Tokenizer('(' + stmt + ')')

    def pop(self):
        return self.tnz.pop()

    def peek(self):
        return self.tnz.peek()

    def top(self):
        return self.tnz.top()

    def parse(self, indent=0):
        indent = deepcopy(indent)
        indent += 1
        if istype(self.top(), 'Lparen'):
            self.pop()  # Open paren
            n = self.parse(indent)
            cp = self.pop()  # Close paren
            if istype(self.top(), 'Bop'):
                bopr = Node(self.pop(), indent)
                bopr.l_child = n
                bopr.r_child = self.parse(indent)
                return bopr
            else:
                return n
        if istype(self.top(), 'Term'):
            if istype(self.peek(), 'Bop'):
                t1 = Node(self.pop(), indent)
                bopr = Node(self.pop(), indent)
                bopr.l_child = t1
                if istype(self.top(), 'Term'):
                    bopr.r_child = self.parse(indent)
                elif istype(self.top(), 'Lparen'):
                    bopr.r_child = self.parse(indent)
                else:
                    raise SyntaxError("Expected Term or (")
                return bopr
            elif istype(self.peek(), 'Rparen'):
                t1 = Node(self.pop(), indent)
                return t1
            elif istype(self.peek(), 'Term'):
                t1 = Node(self.pop(), indent)
                return t1
            else:
                raise SyntaxError("Expecting term or (")
Example #21
    def _classify(self, tokens, languages):
        """
        Internal: Guess language of data

        data      - Array of tokens or String data to analyze.
        languages - Array of language name Strings to restrict to.

        Returns sorted Array of result pairs. Each pair contains the
        String language name and a Float score.
        """
        if tokens is None:
            return []

        if isinstance(tokens, basestring):
            tokens = Tokenizer.tokenize(tokens)

        scores = {}
        if self.verbosity >= 2:
            self.dump_all_tokens(tokens, languages)
        for language in languages:
            scores[language] = self.tokens_probability(tokens, language) + self.language_probability(language)
            if self.verbosity >= 1:
                print '%10s = %10.3f + %7.3f = %10.3f\n' % (language,
                                                            self.tokens_probability(tokens, language),
                                                            self.language_probability(language),
                                                            scores[language])
        return sorted(scores.iteritems(), key=lambda t: t[1], reverse=True)
Example #22
    def train(cls, db, language, data):
        """
        Set LINGUIST_DEBUG=1 or =2 to see probabilities per-token,
        per-language.  See also dump_all_tokens, below.

        Public: Train classifier that data is a certain language.

          db       - Hash classifier database object
          language - String language of data
          data     - String contents of file

          Examples

            Classifier.train(db, 'Ruby', "def hello; end")

          Returns nothing.
        """
        tokens = Tokenizer.tokenize(data)
        db['tokens_total'] = db.get('tokens_total', 0)
        db['languages_total'] = db.get('languages_total', 0)
        db['tokens'] = db.get('tokens', {})
        db['language_tokens'] = db.get('language_tokens', {})
        db['languages'] = db.get('languages', {})

        for token in tokens:
            db['tokens'][language] = db['tokens'].get(language, {})
            db['tokens'][language][token] = db['tokens'][language].get(token, 0)
            db['tokens'][language][token] += 1
            db['language_tokens'][language] = db['language_tokens'].get(language, 0)
            db['language_tokens'][language] += 1
            db['tokens_total'] += 1

        db['languages'][language] = db['languages'].get(language, 0)
        db['languages'][language] += 1
        db['languages_total'] += 1
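A usage sketch of the docstring's own example, assuming a plain dict for db; it only illustrates how the counters above accumulate:

db = {}
Classifier.train(db, 'Ruby', "def hello; end")
Classifier.train(db, 'Python', "def hello(): pass")

assert db['languages_total'] == 2                     # one increment per training call
assert db['languages'] == {'Ruby': 1, 'Python': 1}
assert db['tokens_total'] == sum(db['language_tokens'].values())   # one increment per token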
Example #23
	def getOtherTaggedText(info):
		taggedtext = TextMarker.getTaggedText(info)
		# print taggedtext
		# print ''
		btags2 = ['B_WHAT', 'B_WHO', 'B_WHEN', 'B_WHERE', 'B_WHY', 'B_HOW']
		etags2 = ['E_WHAT', 'E_WHO', 'E_WHEN', 'E_WHERE', 'E_WHY', 'E_HOW']

		for i, tag in enumerate(btags2):
			taggedtext = taggedtext.replace(TextMarker.btags[i], tag)
		for i, tag in enumerate(etags2):
			taggedtext = taggedtext.replace(TextMarker.etags[i], tag)	

		text = ""
		state = 0
		for token in Tokenizer.getTokens(taggedtext):
			if (reduce( (lambda x, y: x or y), list(map((lambda x: x in token), btags2)) )):
				state += len([item for item in list(map((lambda x: x in token), btags2)) if item])
			if (state==0):
				# print "%s\t%s" % (state, TextMarker.othertags[0] + token + TextMarker.othertags[1])
				text += TextMarker.othertags[0] + token + TextMarker.othertags[1]
			else:
				# print "%s\t%s" % (state, token)
				text += token + " "
			if (reduce( (lambda x, y: x or y), list(map((lambda x: x in token), etags2)) )):
				state -= len([item for item in list(map((lambda x: x in token), etags2)) if item])

		for i, tag in enumerate(TextMarker.btags):
			text = text.replace(btags2[i], tag)
		for i, tag in enumerate(TextMarker.etags):
			text = text.replace(etags2[i], tag)	

		return text
Example #24
class Preprocesser:

    def __init__(self, lower=True, punctuation=True, digits=True, stop=True, min_length=3,
                 pos_tag=False, lemmatization=True):

        self.lemma = lemmatization
        self.pos_tag = pos_tag

        self.tokenizer = Tokenizer(lower, punctuation, digits)
        self.token_filter = TokenFilter(stop, min_length)
        if pos_tag or lemmatization:
            self.postagger = Postagger()
            print dir(self.postagger)
        if lemmatization:
            self.Lemmatizer = Lemmatizer()

    def process(self, text):
        words = self.tokenizer.tokenize(text)
        words = self.token_filter.filter(words)
        if self.lemma:
            tags = self.postagger.tags2lemmatags(self.postagger.tags(words))
            result = self.Lemmatizer.lemma(words, tags)
        if self.pos_tag:
            tags = self.postagger.tags(words)
            result = tags
        return result
Example #25
    def train(cls, db, language, data):
        """
        Public: Train classifier that data is a certain language.

          db       - Hash classifier database object
          language - String language of data
          data     - String contents of file

          Examples

            Classifier.train(db, 'Ruby', "def hello; end")

          Returns nothing.
        """
        tokens = Tokenizer.tokenize(data)
        db['tokens_total'] = db.get('tokens_total', 0)
        db['languages_total'] = db.get('languages_total', 0)
        db['tokens'] = db.get('tokens', {})
        db['language_tokens'] = db.get('language_tokens', {})
        db['languages'] = db.get('languages', {})

        for token in tokens:
            db['tokens'][language] = db['tokens'].get(language, {})
            db['tokens'][language][token] = db['tokens'][language].get(token, 0)
            db['tokens'][language][token] += 1
            db['language_tokens'][language] = db['language_tokens'].get(language, 0)
            db['language_tokens'][language] += 1
            db['tokens_total'] += 1

        db['languages'][language] = db['languages'].get(language, 0)
        db['languages'][language] += 1
        db['languages_total'] += 1
Example #26
    def __init__(self, expression):
        """Initialize the parser by generating the token sequence"""
        self.sc = Scanner(expression)
        self.tok = Tokenizer(self.sc)
        self.tokens = None
        self.tokens = self.get_token_sequence()
        self.root = None
Example #27
def simple_english_tokenizer(tokenizer=None):
    if not tokenizer:
        tokenizer   = Tokenizer()
    
    word        = tokenizer.type['Word']        = RegexTokenType(r'(\w+\'\w+|\w+)', priority=0)
    punctuation = tokenizer.type['Punctuation'] = RegexTokenType(r'([^\w\s%s]+)', priority=1)

    tokenizer.joins = {
        (punctuation,word,'\'')   : '',
        (punctuation,word)        : ' ',
        (punctuation,punctuation) : '',
        (word,word)               : ' ',
        None                      : ''
    }
    
    return tokenizer
Example #28
def main():
	parser = argparse.ArgumentParser()
	parser.add_argument('-review_file', required=True, help='Path to review data')
	parser.add_argument('-business_file', required=True, help='Path to business data')
	parser.add_argument('-output', required=True, help='Path to output index file')
	opts = parser.parse_args()
	f_reviews = open(opts.review_file,'r')
	f_business = open(opts.business_file,'r')

	line_num = 0
	b_map = {}
	for line in f_business:
		b_obj = json.loads(line)
		b_map[b_obj['business_id']] = line_num
		line_num += 1

	tokenizer = Tokenizer()
	wordsmap = {}
	line_num = 0
	for line in f_reviews:
		r = json.loads(line)
		words = tokenizer.process_review(r['text']);
		w_idx = 0
		for w in words:
			if w=="":
				continue
			b_id = b_map[r['business_id']]
			if w in wordsmap:
				if b_id in wordsmap[w]:
					postings = wordsmap[w][b_id]
					if line_num in postings:
						postings[line_num].append(w_idx)
					else:
						postings[line_num] = [w_idx]
				else:
					wordsmap[w][b_id] = {line_num:[w_idx]}
			else:
				wordsmap[w] = {b_id:{line_num:[w_idx]}}
			w_idx += 1
		line_num += 1
		if line_num % 1000==0:
			print line_num 
		# if line_num == 1000:
		# 	break
		
	with open(opts.output, 'w') as f_out:
		json.dump(wordsmap, f_out)
Example #29
class InvertedIndex():

    def __init__(self):
        self.invertedindex = {}
        self.lexicon = Lexicon()
        self.tokenizer = Tokenizer()
        self.doc_reader = DocReader()
        self.build_index()

    def build_index(self):
        #comments?
        cache = self.doc_reader.get_cache()
        docs = self.doc_reader.read_docs(cache)
        print "\nINVERTEDINDEX : Indexing %d documents..\n" % len(docs)
        for d in range(len(docs)):
            print "Indexing document '%s'" % (settings.PATH_DOCS + str(d))
            self.add_document(docs[d], d)

        print "Indexed total %d unique terms" % self.lexicon.size()

    def get_postinglist(self, lex_id):
        return self.invertedindex[lex_id]
            
    def add_document(self, doc, document_id):
        """FIXME: 
        -Needs doc 
        -Too slow?
        -Remove stop words
        -Reduce number of tokens
        """
        tokens = self.tokenizer.tokenize(doc)
        
        for t in tokens:
            lex_id = self.lexicon.lookup(t.get_value())

            if(lex_id == settings.INVALID):
                lex_id = self.lexicon.add_value(t.get_value())
                pl = PostingList()
                pl.append_posting(Posting(document_id, t.get_position()))
                self.invertedindex[lex_id] = pl
            else:
                pl = self.get_postinglist(lex_id)
    
            if pl.get_last_posting().get_document_id() != document_id:
                pl.append_posting(Posting(document_id, t.get_position()))
            else:
                p = pl.get_last_posting()
                p.append_position(t.get_position())
           
    def size(self):
        return len(self.invertedindex)

    def debugprint(self):
        voc = self.lexicon.get_vocabulary()
        for v in voc:
            lid = self.lexicon.lookup(v)
            pl = self.get_postinglist(lid)
            print "[%s]" % v
            pl.info()
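A usage sketch for querying the index once it is built; the query term is hypothetical, and settings.INVALID marks an unknown term as in add_document above:

index = InvertedIndex()                      # reads DocReader's cache and indexes every document
lex_id = index.lexicon.lookup("tokenizer")   # hypothetical query term
if lex_id != settings.INVALID:
    index.get_postinglist(lex_id).info()     # print the postings for that term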
Example #30
    def process(a, s):
        infilename = a['src_filename']
        outfilename = Tokenizer.batch_tokenise(
            config['src_lang'],
            config['moses_installation_dir'],
            infilename,
            config['src_tokenisation_dir'])
        return {'tokenised_src_filename':outfilename}
Example #31
def rsd2ltf(rsd_str,
            doc_id,
            seg_option='linebreak',
            tok_option='unitok',
            re_segment=False):
    tokenizer = Tokenizer(seg_option, tok_option)

    if re_segment:
        # running segmentation and tokenization, then re-segment the tokenized
        # sentences (use space to concatenate tokens. this solves segmentation
        # problem, e.g. How are you?I'm fine.).
        # print('=> running segmentation...')
        sents = tokenizer.run_segmenter(rsd_str)
        # print('=> running tokenization...')
        raw_tokens = tokenizer.run_tokenizer(sents)

        # re-segment tokenized sentence
        num_sent_reseg = 0
        tokens = []
        for i, t in enumerate(raw_tokens):
            reseg = [
                item.split() for item in tokenizer.run_segmenter(' '.join(t))
            ]
            if len(reseg) > 1:
                num_sent_reseg += 1

            tokens += reseg

        # compute offset for each token
        indexer = 0
        token_offset = []
        for i, t in enumerate(itertools.chain(*tokens)):
            while not rsd_str[indexer:].startswith(t) and \
                            indexer < len(rsd_str):
                indexer += 1
            if indexer < len(rsd_str):
                t_start = indexer
                t_end = t_start + len(t) - 1
                assert rsd_str[t_start:t_end + 1] == t, \
                    "re_segment token offset not match %s-%d" % (doc_id, i)
                token_offset.append((t_start, t_end))
                indexer = t_end + 1

        assert len(token_offset) == len(list(itertools.chain(*tokens))), \
            "re_segment tokenization offset error in: %s" % doc_id

        # recover sent using tokens
        sents = []
        prev_token_end = token_offset[0][0] - 1
        token_index = 0
        for i, t in enumerate(tokens):
            sent = ''
            for j, item in enumerate(t):
                if j == 0:
                    prev_token_end = token_offset[token_index][0] - 1

                sent += ' ' * (token_offset[token_index][0] - prev_token_end -
                               1) + item

                prev_token_end = token_offset[token_index][1]

                token_index += 1

            assert sent in rsd_str, \
                're_segment sentence offset error.'

            sents.append(sent)

    else:
        # running segmentation and tokenization
        # print('=> running segmentation...')
        sents = tokenizer.run_segmenter(rsd_str)
        # print('=> running tokenization...')
        tokens = tokenizer.run_tokenizer(sents)

    # generate offset for sentences and tokens
    # print('=> generating offset...')
    indexer = 0
    sent_offset = []
    for i, s in enumerate(sents):
        while not rsd_str[indexer:].startswith(s) and indexer < len(rsd_str):
            indexer += 1
        if indexer < len(rsd_str):
            sent_start = indexer
            sent_end = sent_start + len(s) - 1
            assert rsd_str[sent_start:sent_end+1] == s, \
                "sentence offset not match %s-%d" % (doc_id, i)
            sent_offset.append((sent_start, sent_end))
            indexer = sent_end + 1

    assert len(sent_offset) == len(sents), \
        "sentence segmentation offset error in: %s" % doc_id

    token_offsets = []
    for i, tok in enumerate(tokens):
        sent_text = sents[i]
        indexer = 0
        t_offset = []
        for j, t in enumerate(tok):
            while not sent_text[indexer:].startswith(t) and \
                            indexer < len(sent_text):
                indexer += 1
            if indexer < len(sent_text):
                t_start = indexer
                t_end = t_start + len(t) - 1
                assert sent_text[t_start:t_end+1] == t, \
                    "token offset not match %s-%d-%d" % (doc_id, i, j)
                t_offset.append((t_start, t_end))
                indexer = t_end + 1
        token_offsets.append(t_offset)

        assert len(t_offset) == len(tok), \
            "tokenization offset error in: %s-%d" % (doc_id, i)

    # convert seg/tok result to ltf
    root = ET.Element('LCTL_TEXT')
    doc_element = ET.Element('DOC', {'id': doc_id})
    text_element = ET.Element('TEXT')
    root.append(doc_element)
    doc_element.append(text_element)

    for i in range(len(sents)):
        seg_text = sents[i]
        seg_start_char = sent_offset[i][0]
        seg_end_char = sent_offset[i][1]

        seg_id = '%s-%s' % (doc_id, str(i))

        seg_element = ET.Element(
            'SEG', {
                'id': seg_id,
                'start_char': str(seg_start_char),
                'end_char': str(seg_end_char)
            })
        original_text_element = ET.Element('ORIGINAL_TEXT')
        original_text_element.text = seg_text
        seg_element.append(original_text_element)

        for j in range(len(tokens[i])):
            token_id = 'token-%d-%d' % (i, j)
            tok_text = tokens[i][j]
            if not tok_text:
                continue
            tok_start_char = int(token_offsets[i][j][0]) + seg_start_char
            tok_end_char = int(token_offsets[i][j][1]) + seg_start_char

            assert rsd_str[tok_start_char:tok_end_char + 1] == tok_text

            token_element = ET.Element(
                'TOKEN', {
                    'id': token_id,
                    'start_char': str(tok_start_char),
                    'end_char': str(tok_end_char)
                })
            token_element.text = tok_text
            seg_element.append(token_element)

        text_element.append(seg_element)

    return root
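A minimal usage sketch, assuming ET is xml.etree.ElementTree (as the Element calls above require) and a hypothetical input path, that serializes the returned tree to an LTF file:

import xml.etree.ElementTree as ET

rsd_str = open('example.rsd.txt').read()     # hypothetical raw source document
root = rsd2ltf(rsd_str, 'example_doc',
               seg_option='linebreak', tok_option='unitok', re_segment=True)
ET.ElementTree(root).write('example_doc.ltf.xml',
                           encoding='utf-8', xml_declaration=True)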
Example #32
from collections import Counter
import numpy as np

sentences = np.genfromtxt('../upsampled/x_QIT.txt', delimiter='\n', dtype=str)
language = 'italian'
max_words = None
max_length = 25

# Text preprocessor with no functionalities whatsoever
prep = TextPreprocessor(sentences)

# Add decorator to clean email bodies
prep = QITEmailBodyCleaner(prep)

# Add tokenizer decorator
prep = Tokenizer(prep, language)

# Load vocabulary
with open('vocabulary_wikipedia', 'r') as vocabulary_file:
    vocabulary = eval(vocabulary_file.read())

# Add integer encoding decorator
unknown_token_id = max(vocabulary.values()) + 1
prep = IntegerEncoder(prep, vocabulary, unknown_token_id)

# Add padding decorator
padding_token_id = max(vocabulary.values()) + 2
prep = Padder(prep, padding_token_id, max_length)

# Get final tokens
final_tokens = prep.preprocess()
Example #33
class TokenizerTest(unittest.TestCase):
  """Unit test case suite for our tokenizers in our Tokenizer class."""

  def setUp(self):
    """General setup for configuration files."""
    # configuration for human readable Tokenizer
    human_readable_config = config_pb2.Config()
    human_readable_config.clusterer.tokenizer.token_min_length = 2
    human_readable_config.clusterer.tokenizer.mode = config_pb2.Tokenizer.TokenizerMode.HUMAN_READABLE
    human_readable_config.clusterer.tokenizer.split_on.extend(['='])
    human_readable_config.clusterer.tokenizer.punctuation.extend(
        [':', '/', '\n', '\t'])
    self.human_readable_tokenizer = Tokenizer(human_readable_config)

    # configuration for stack trace Tokenizer
    stack_trace_config = config_pb2.Config()
    stack_trace_config.clusterer.tokenizer.token_min_length = 0
    stack_trace_config.clusterer.tokenizer.mode = config_pb2.Tokenizer.TokenizerMode.STACK_TRACE_LINES
    self.stack_trace_tokenizer = Tokenizer(stack_trace_config)

    ignore_test_config = config_pb2.Config()
    ignore_test_config.clusterer.tokenizer.token_min_length = 2
    ignore_test_config.clusterer.tokenizer.mode = config_pb2.Tokenizer.TokenizerMode.HUMAN_READABLE
    ignore_test_config.clusterer.tokenizer.ignore_token_matcher.extend(
        ['uselessInfo'])
    self.ignore_test_config = Tokenizer(ignore_test_config)
    super(TokenizerTest, self).setUp()

  def test_human_readable_tokenizer(self):
    """Test suite for human_readable_tokenizer."""
    # our tokenizer gets rid of sequences of numbers and keeps 'words'
    simple_string = 'subscription id 11444512 failed because it was cancelled'
    simple_tokens = [
        'subscription', 'id', 'failed', 'because', 'it', 'was', 'cancelled'
    ]
    self.assertEqual(
        self.human_readable_tokenizer.human_readable_tokenizer(simple_string),
        simple_tokens)

    # our configured tokenizer also splits on '=',
    # extracting subscription=1114125 to subscription,
    # 1114125 the later of which is removed
    extra_split_test = 'subscription=1114125 failed because of id=1124125 from client=STADIA'
    split_tokens = [
        'subscription', 'failed', 'because', 'of', 'id', 'from', 'client',
        'stadia'
    ]
    self.assertEqual(
        self.human_readable_tokenizer.human_readable_tokenizer(
            extra_split_test), split_tokens)

    # Extracting useful text & removing stack lines test
    stack_trace = open('testdata/tokenizer/human_readable_trace.txt').read()
    stack_trace_tokens = [
        'some', 'hopefully', 'useful', 'english', 'text', 'here'
    ]
    self.assertEqual(
        self.human_readable_tokenizer.human_readable_tokenizer(stack_trace),
        stack_trace_tokens)

  def test_stack_trace_line_tokenizer(self):
    """Test suite for stack_trace_line_tokenizer."""
    # example stack trace we would want to extract from
    sample_stack_trace = open(
        'testdata/tokenizer/sample_stack_trace.txt').read()
    sample_extracted_lines = [
        'com.google.moneta.purchaseorder.monetizer.PurchaseOrderUsageTransaction.lambda$getMovementCode$0',
        'java.util.Optional.orElseThrow',
        'com.google.moneta.purchaseorder.monetizer.PurchaseOrderUsageTransaction.getMovementCode',
        'com.google.moneta.purchaseorder.monetizer.PurchaseOrderUsageTransaction.createRevenueMovement',
        'com.google.moneta.purchaseorder.monetizer.PurchaseOrderUsageTransaction.addLineItem',
        'com.google.moneta.purchaseorder.monetizer.PurchaseOrderUsageTransaction.addAllLineItems',
        'com.google.moneta.purchaseorder.service.purchaseorder.purchaseorderinternal.ChargeAction.charge'
    ]
    self.assertEqual(
        self.stack_trace_tokenizer.stack_trace_line_tokenizer(
            sample_stack_trace), sample_extracted_lines)

  def test_token_ignore(self):
    """Test suite to test functionality of ignoring specific tokens."""
    sample_string = 'this is useful info, but this is uselessInfo'
    sample_tokens = ['this', 'is', 'useful', 'info', 'but', 'this', 'is']
    self.assertEqual(
        self.ignore_test_config.human_readable_tokenizer(sample_string),
        sample_tokens)
Example #34
def execute_system(p_datafilesdir,
                   p_tokenizefiles=False,
                   p_createindex=True,
                   p_createsnippets=False):

    print "Executing Task 3A..."
    startTime = time.time()

    ldatafilesdir = CACM_DATA
    if is_string_valid(p_datafilesdir):
        ldatafilesdir = p_datafilesdir

    # Variable for no of documents
    NoOfDocuments = get_no_of_files_in_dir(ldatafilesdir)

    # Create output directory
    create_directory(DIR_FOR_OUTPUT_FILES)
    ldirpathfortask = DIR_FOR_OUTPUT_FILES + "/" + TASK3A_CONST
    create_directory(ldirpathfortask)

    ltokenizedfilesdir = DIR_FOR_OUTPUT_FILES + "/" + DIR_FOR_TOKENIZED_FILES
    if p_tokenizefiles:
        t1 = time.time()
        # create tokenizer and generate token documents
        ltokenizer = Tokenizer()
        ltokenizer.setTokenizedFilesOutputDir(ltokenizedfilesdir)
        ltokenizer.tokenizedir(ldatafilesdir)
        t2 = time.time()
        # print "Time for Tokenizer Module: " + str(t2-t1)

    if p_createindex:
        t1 = time.time()
        # create instance of Indexer class and create indexes
        lindexer = Indexer()
        lindexer.set_tokenized_files_dir(ltokenizedfilesdir)
        lindexer.setOutputDirectory(TASK3A_CONST)
        lindexer.startIndexing(True, False)
        lindexer.printAll()
        t2 = time.time()
        # print "Time for Indexer Module: " + str(t2-t1)

    # Convert query list to query dict
    lquerydict = get_given_queries_in_dict(CACM_QUERY_FILE + FILE_EXT)
    lquerydict = get_sorted_dict(lquerydict)

    t1 = time.time()
    lrm = RMBase()
    # Set the no. of documents with the retrieval module
    lindexfilename = ldirpathfortask + "/" + FILE_FOR_STOPPED_INDEX + FILE_EXT
    ldocfreqtableforunigramfilename = ldirpathfortask + "/" + FILE_FOR_DOC_FREQ_TABLE + CONSTS_FOR_UNIGRAM + FILE_EXT
    lwordcountsbyfilefilename = ldirpathfortask + "/" + FILE_FOR_WORD_COUNTS_BY_FILE_FOR + FILE_FOR_STOPPED_INDEX + FILE_EXT

    lrm.setNoOfDocuments(NoOfDocuments)
    lrm.setOutputDirectory(TASK3A_CONST)
    lrm.setCanUseRelevanceInfo(True)
    lrm.setIndexFileName(lindexfilename)
    lrm.setDocFreqDictFileName(ldocfreqtableforunigramfilename)
    lrm.setWordCountsByFileDictFileName(lwordcountsbyfilefilename)

    lrm.initializeRM()
    # Process all the queries for the retrieval module
    lrm.processQueriesFromFile(lquerydict, True)
    t2 = time.time()
    # print "Time for Retrieval Module: " + str(t2-t1)

    endTime = time.time()
    print "Task 3A execution completed in " + str(endTime - startTime)

    if p_createsnippets:
        # Generate snippets for BM25 output
        generate_snippet(CACM_QUERY_FILE + FILE_EXT,
                         ldirpathfortask + "/" + DIR_FOR_BM25_OUTPUT,
                         ldirpathfortask + "/" + SNIPPET_GEN_RESULTS_FOLDER + CONST_FOR_BM25)

        # Generate snippets for TF-IDF output
        generate_snippet(CACM_QUERY_FILE + FILE_EXT,
                         ldirpathfortask + "/" + DIR_FOR_TFIDF_OUTPUT,
                         ldirpathfortask + "/" + SNIPPET_GEN_RESULTS_FOLDER + CONST_FOR_TFIDF)
Example #35
        acc = average_precision_score(y_test, y_)
        rec = recall_score(y_test, y_)
        f1 = f1_score(y_test, y_)
        print('----evaluation done----')
        return acc, rec, f1


if __name__ == '__main__':
    # Positive and negative reviews (5000 each), already segmented into words with stopwords removed
    with open('words_pos') as words_pos, open('words_neg') as words_neg:
        pos, neg = words_pos.readlines(), words_neg.readlines()

        stopwords = [
            word.strip() for word in open('chinese_stopwords.txt').readlines()
        ]
        tokenizer = Tokenizer(stopwords)
        # Wrapped naive Bayes classifier; its argument is the corpus used to build the vocabulary.
        # Training and test data are combined here to build the vocabulary, so the test set contains no out-of-vocabulary words.
        naive_bayes = NaiveBayes(tokenizer, pos + neg)

        # The test set keeps 500 positive and 500 negative reviews
        x_test = pos[-500:] + neg[-500:]
        y_test = [1] * 500 + [0] * 500

        # Everything else is used as the training set
        x_train = pos[:4500] + neg[:4500]
        y_train = [1] * 4500 + [0] * 4500
        naive_bayes.fit(x_train, y_train)

        # 3-fold cross-validation was tried, but with only 4500 training examples per class each fold leaves about 3000 for training, which hurt performance on the test set, so it was dropped in the end
        # P, R, F1 = [], [], []
Example #36
    def tokenize(self):
        tokenizer = Tokenizer()
        self.tokens = tokenizer.tokenize(self.source)
Example #37
    def fit(self, X_train, y_train, X_val=None, y_val=None):
        if X_val is None or y_val is None:
            pass
        self._tokenizer = Tokenizer(mindf=self.mindf,
                                    lan=self.lan,
                                    stopwordsSet=self.stopwords,
                                    model='sample',
                                    k=self.k,
                                    verbose=self._verbose)
        self._tokenizer.fit(X_train, y_train)

        self.maxF = int(round(np.log2(self._tokenizer.maxF + 1)))

        self._model = AttentionTFIDF(vocab_size=self._tokenizer.vocab_size,
                                     hiddens=self.hiddens,
                                     nclass=self._tokenizer.n_class,
                                     maxF=self.maxF,
                                     drop=self.max_drop).to(self._device)

        optimizer = optim.AdamW(self._model.parameters(),
                                lr=self.lr,
                                weight_decay=self.weight_decay)
        loss_func_cel = nn.CrossEntropyLoss().to(self._device)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='min',
            factor=self.factor,
            patience=3,
            verbose=self._verbose)

        best = 99999.
        best_acc = 0.
        counter = 1
        dl_val = DataLoader(list(zip(X_val, y_val)),
                            batch_size=self.batch_size,
                            shuffle=False,
                            collate_fn=self.collate_train,
                            num_workers=self.n_jobs)

        for e in tqdm(range(self.nepochs),
                      total=self.nepochs,
                      disable=not self._verbose):
            dl_train = DataLoader(list(zip(X_train, y_train)),
                                  batch_size=self.batch_size,
                                  shuffle=True,
                                  collate_fn=self.collate_train,
                                  num_workers=self.n_jobs)
            loss_train = 0.
            with tqdm(total=len(y_train) + len(y_val),
                      smoothing=0.,
                      desc=f"ACC_val: {best_acc:.2} Epoch {e+1}",
                      disable=not self._verbose) as pbar:
                total = 0
                correct = 0
                self._model.train()
                self._tokenizer.model = 'sample'
                for i, (doc_tids, TFs, DFs, y) in enumerate(dl_train):

                    doc_tids = doc_tids.to(self._device)
                    TFs = TFs.to(self._device)
                    DFs = DFs.to(self._device)
                    y = y.to(self._device)

                    pred_docs, _, _ = self._model(doc_tids, TFs, DFs)
                    pred_docs = torch.softmax(pred_docs, dim=1)
                    loss = loss_func_cel(pred_docs, y)

                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                    loss_train += loss.item()
                    total += len(y)
                    y_pred = pred_docs.argmax(axis=1)
                    correct += (y_pred == y).sum().item()
                    self._model.drop_ = (correct / total) * self.max_drop

                    pbar.update(len(y))
                    del doc_tids, TFs
                    del DFs, y, pred_docs
                    del loss, y_pred
                loss_train = loss_train / (i + 1)
                total = 0
                correct = 0
                self._model.eval()
                self._tokenizer.model = 'topk'
                with torch.no_grad():
                    loss_val = 0.
                    for i, (doc_tids, TFs, DFs, y) in enumerate(dl_val):
                        doc_tids = doc_tids.to(self._device)
                        TFs = TFs.to(self._device)
                        DFs = DFs.to(self._device)
                        y = y.to(self._device)

                        pred_docs, _, _ = self._model(doc_tids, TFs, DFs)
                        pred_docs = torch.softmax(pred_docs, dim=1)
                        loss = loss_func_cel(pred_docs, y)

                        loss_val += loss.item()
                        total += len(y)
                        y_pred = pred_docs.argmax(axis=1)
                        correct += (y_pred == y).sum().item()
                        pbar.update(len(y))
                        loss_val
                        del doc_tids, TFs, DFs, y
                        del pred_docs, loss
                    loss_val = (loss_val / (i + 1))
                    scheduler.step(loss_val)

                    if best - loss_val > 0.0001:
                        best = loss_val
                        counter = 1
                        best_acc = correct / total
                        best_model = copy.deepcopy(self._model).to('cpu')
                    elif counter > self.patience:
                        break
                    else:
                        counter += 1

        self._model = best_model.to(self._device)

        self._loss = best
        self._acc = best_acc

        return self
Example #38
    save_path = "output.csv"
    # initialize objects
    print('Initializing objects ...')
    print('Initializing word embeddings ...')
    t1 = time.time()
    # /media/reza/book/dataset/word2vec/GoogleNews-vectors-negative300.bin
    # word_embeddings = WordEmbeddings("/media/reza/book/dataset/word2vec/GoogleNews-vectors-negative300.bin")
    word_embeddings = loadWordModel("E:\\dataset\\glove\\glove.6B.50d.txt")
    # /media/reza/book/Py_Projects/Lample2016-tagger-master/model_tag2vec.txt
    pos_embeddings = loadWordModel(
        "E:\\Py_Projects\\Lample2016-tagger-master\\model_tag2vec.txt")

    t2 = time.time()
    print('\tTook %f seconds' % (t2 - t1))
    print('Initializing tokenizer ...')
    tokenizer = Tokenizer()
    print('Initializing vectorizer ...')
    vectorizer = Vectorizer(word_embeddings, tokenizer)
    vectorizer_pos = VectorizerPosTags(pos_embeddings)

    #### training dataset ####
    # vectorizing
    ids, train_a_vectors, train_b_vectors, train_gold = vectorizer.vectorize_df(
        train_df)
    train_a_pos_vectors, train_b_pos_vectors = vectorizer_pos.vectorize_sentence_pos_df(
        train_df)

    train_max_a_length = len(max(train_a_vectors, key=len))
    train_max_b_length = len(max(train_b_vectors, key=len))
    print('maximum number of tokens per sentence A in training set is %d' %
          train_max_a_length)
Example #39
def convertToLaTeX(string):
    string = transform_environment(string)
    tokenizer = Tokenizer(scanner=Scanner(string))
    parser = Parser(tokenizer=tokenizer)
    res = str(parser.parseCode())
    return res
Example #40
from tokenizer import Tokenizer
from scorer import WordOverlappingScorer, EmbeddingBasedScorer

if __name__ == "__main__":
    # Build segmentor provided
    print("Building tokenizer")
    tokenizer = Tokenizer()

    # Process sentence pairs
    hyps = [
        "映画を見ますか", "映画はどんなのを見ますか", "映画はどれくらい見ますか", "ゴルフは見ますか", "サッカーは見ますか",
        "ゴルフで好きな選手はいますか"
    ]
    refs = ["オリンピックは見ますか"] * len(hyps)

    hyps = [tokenizer.tokenize(sent) for sent in hyps]
    refs = [tokenizer.tokenize(sent) for sent in refs]

    # You can pass `vocab` as an argument to EmbeddingBasedScorer() to load in-vocab words only for accelerating embedding loading.
    # If `vocab` is None (by default), all the embeddings will be loaded, which will take a longer time.
    vocab = set()
    for sent in hyps + refs:
        for token in sent:
            vocab.add(token)

    # Build scorers
    print("Building scorer")
    word_overlap_scorer = WordOverlappingScorer()
    embedding_based_scorer = EmbeddingBasedScorer(vocab=vocab)

    # Calculate similarities between sentence pairs
Example #41
def _encode(t : Tokenizer, e : 'TermEncoder', s : str) -> EncodedTerm:
    return e(t.toTokenList(s))
Example #42
from tokenizer import Tokenizer
from parser2 import Parser

source_code = ''

with open('examples/main.stp') as stp:
    source_code = stp.read(1024)

tokenizer = Tokenizer(source_code, True)
parser = Parser(tokenizer)

syntax_tree = parser.parse()
if parser.current_level != 0:
    raise Exception('brackets error')

print(syntax_tree)
Example #43
import os
from tokenizer import Tokenizer

#directory where the C++ files are
dirname = "C++/"

#directories inside the C++ directory
for f in os.listdir(dirname):
    dirnameone = dirname + f + "/"

    dirlist = os.listdir(dirnameone)

    # individual files
    for indfiles in dirlist:
        #complete file path used for tokenization
        indtoken = dirnameone + indfiles

        tok = Tokenizer(indtoken)
        entire_token_stream = tok.full_tokenize()

        print(entire_token_stream)
Example #44
class LMFluencyFilter:
    def __init__(self, lm_type: LMType, language: str, tokenizer_command):
        """
            lm_type: LMType
            language: language code
            tokenizer_command: tokenizer full command (with flags if needed)
        """

        self.language = language
        self.tokenizer = Tokenizer(tokenizer_command, self.language)
        self.normalizer = MosesPunctNormalizer(lang=self.language)
        self.type = lm_type

    @classmethod
    def _ispunctuation(cls, t):
        return all(not c.isalnum() for c in t)

    @classmethod
    def _replace_placeholder(cls, t):
        if t.isalpha():
            unicodeGroup = UnicodeWordClassifier.classify_word(t)
            if t.islower():
                return "TOKEN:ALPHA:LOWER:" + unicodeGroup
            elif t.istitle():
                return "TOKEN:ALPHA:TITLE:" + unicodeGroup
            elif t.isupper():
                return "TOKEN:ALPHA:UPPER:" + unicodeGroup
            else:
                return "TOKEN:ALPHA:MIXED:" + unicodeGroup
        else:
            if t.isnumeric():
                return "TOKEN:NUMERIC"
            elif cls._ispunctuation(t):
                return t
            else:
                return "TOKEN:MIXED"

    @classmethod
    def _estimate_kenlm(cls, corpus: str, lm_file: str, params: str):
        output = subprocess.run("lmplz " + params + " < " + corpus + " > " +
                                lm_file + ".arpa",
                                shell=True,
                                stderr=PIPE,
                                stdout=PIPE)
        logging.debug(output.stderr.decode())
        logging.debug(output.stdout.decode())
        output = subprocess.run("build_binary " + lm_file + ".arpa " + lm_file,
                                shell=True,
                                stderr=PIPE,
                                stdout=PIPE)
        logging.debug(output.stderr.decode())
        logging.debug(output.stdout.decode())

    def load_lm(self, lm_path: str):
        self.lm_path = lm_path
        self.lm = kenlm.LanguageModel(self.lm_path)


#    def _sentence_split(self,sentence:str):
#        return self.splitter([sentence])

    def _tokenize(self, sentence):
        sentence = self.normalizer.normalize(sentence)

        if self.type != LMType.CHARACTER:
            tokline = " ".join(self.tokenizer.tokenize(sentence))
        else:
            tokline = " ".join(["SPACE" if c == " " else c for c in sentence])
        return tokline

    def _introduce_placeholders(self, sentence):
        if self.type != LMType.PLACEHOLDER:
            return sentence
        else:
            toks = [self._replace_placeholder(t) for t in sentence.split()]
            return " ".join(toks)

    def train_lm(self, text_path: str):
        tokenized_f = NamedTemporaryFile("w", delete=False)
        placeholderized_f = NamedTemporaryFile("w", delete=False)

        #Tokenize text
        with open(text_path) as input_f:
            for line in input_f:
                #line=line.rstrip("\n")
                tokline = self._tokenize(line)
                tokenized_f.write(tokline)
                tokenized_f.write("\n")
        tokenized_f.close()

        #Perform placeholder replacement if needed
        with open(tokenized_f.name) as tokenized_ff:
            for line in tokenized_ff:
                line = line.rstrip("\n")
                with_placeholders = self._introduce_placeholders(line)
                logging.debug(
                    "Processed training example: {}".format(with_placeholders))
                placeholderized_f.write(with_placeholders)
                placeholderized_f.write("\n")
        placeholderized_f.close()

        #Estimate LM
        lm_file = NamedTemporaryFile(delete=False)
        lm_file.close()

        if self.type == LMType.CHARACTER:
            params = "-o 7 --discount_fallback"
        else:
            params = "-o 7 --discount_fallback"

        self._estimate_kenlm(placeholderized_f.name, lm_file.name, params)
        self.lm_path = lm_file.name

        self.lm = kenlm.LanguageModel(self.lm_path)

        #Remove temporary files
        os.remove(tokenized_f.name)
        os.remove(placeholderized_f.name)

    def copy_lm(self, dst: str):
        shutil.copyfile(self.lm_path, dst)

    def cleanup(self):
        os.remove(self.lm_path)

    def _raw_score(self, sentence: str):
        return self.lm.score(sentence)

    @classmethod
    def estimate_threshold(cls, filter_a, filter_b, dev_corpus_a: str,
                           dev_corpus_b: str):
        scores = []
        with open(dev_corpus_a) as corpus_a_f, open(
                dev_corpus_b) as corpus_b_f:
            for linea, lineb in zip(corpus_a_f, corpus_b_f):
                linea = linea.rstrip("\n")
                lineb = lineb.rstrip("\n")
                scores.append(filter_a.score(linea) + filter_b.score(lineb))
        return numpy.mean(scores), numpy.std(scores)

    def score(self, sentence: str):
        #We need to preprocess the sentence in the same way as when training the LM
        #sents= self._sentence_split(sentence)
        #processed_sents=[self._introduce_placeholders(self._tokenize(s)) for s in sents]
        processed_sent = self._introduce_placeholders(self._tokenize(sentence))
        logging.debug("Scoring: {}".format(processed_sent))

        raw_score = self._raw_score(processed_sent)

        #Normalize score
        #return sum(raw_scores)/(sum([len(s.split()) for s in processed_sents]) + len(processed_sents) ) # We divide by total number of tokens + 1 for each sentence (taken from kenlm perplexity method)
        return raw_score / (sum([len(processed_sent.split())]) + 1
                            )  #the same, but assuming only 1 sentence
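A usage sketch of the whole filter; the language codes, tokenizer command, file paths and the mean-minus-two-sigma cutoff are assumptions, not taken from the original source:

src_filter = LMFluencyFilter(LMType.PLACEHOLDER, "en", "tokenizer_cmd")   # assumed arguments
trg_filter = LMFluencyFilter(LMType.PLACEHOLDER, "es", "tokenizer_cmd")
src_filter.train_lm("mono.en.txt")
trg_filter.train_lm("mono.es.txt")

mean, std = LMFluencyFilter.estimate_threshold(src_filter, trg_filter,
                                               "dev.en.txt", "dev.es.txt")
threshold = mean - 2 * std                   # assumed cutoff on the combined score
src_sent = "This is a fluent sentence."
trg_sent = "Esta es una frase fluida."
keep = (src_filter.score(src_sent) + trg_filter.score(trg_sent)) >= threshold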
Example #45
    writer = tf.io.TFRecordWriter(output_file)
    prev_text_a = None
    query_id = -1  # assuming continguous examples for same text_a.
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            tf.logging.info("Writing example %d of %d" %
                            (ex_index, len(examples)))
        tf_example = create_tf_example(example, tokenizer)
        writer.write(tf_example.SerializeToString())
    writer.close()
    tf.logging.info("Done write tfrecords to %s" % output_file)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_file', type=str)
    parser.add_argument('--output_file', type=str)
    parser.add_argument('--idx_text', type=int, default=1)
    parser.add_argument('--idx_label', type=int, default=6)
    args, _ = parser.parse_known_args()
    input_file = args.input_file
    output_file = args.output_file
    idx_text = args.idx_text
    idx_label = args.idx_label

    train_examples = get_train_examples(input_file, idx_text, idx_label)
    tf.logging.info("Number of train examples is %d" % len(train_examples))
    tokenizer = Tokenizer(Config.vocab_file)
    if not os.path.exists(output_file):
        file_based_convert_examples_to_tfrecord(train_examples, tokenizer,
                                                output_file)
Example #46
                   seed_everything, BalancedDataLoader,
                   make_train_data_from_txt, make_itf)

logging.basicConfig(level=logging.INFO)

if __name__ == '__main__':
    logging.info('*** Initializing ***')

    if not os.path.isdir(Config.data_dir):
        os.mkdir(Config.data_dir)

    seed_everything(Config.seed)
    device = torch.device(Config.device)

    start_epoch = 0
    tokenizer = Tokenizer.from_pretrained(Config.model_name)

    logging.info('Preparing training data')
    if Config.use_pickle:
        with open(f'{Config.pickle_path}', 'rb') as f:
            train_data = pickle.load(f)
    else:
        train_data = make_train_data_from_txt(Config, tokenizer)
    itf = make_itf(train_data, Config.vocab_size)
    dataset = DialogDataset(train_data, tokenizer)

    logging.info('Define Models')
    model = build_model(Config).to(device)
    state_dict = torch.load(f'{Config.data_dir}/{Config.fn}.pth')
    model.load_state_dict(state_dict['model'])
    model.unfreeze()
Пример #47
0
__author__ = 'Levon'
from interpreter import interpreter
from tokenizer import Tokenizer
from tree import parseTree

pT = parseTree()
tok = Tokenizer()
interp = interpreter()
Tree = pT.buildParseTree(tok.tokenize("1+2"))
assert (interp.evaluate(Tree) == 3)
Tree = pT.buildParseTree(tok.tokenize("(5+(2*3+2))-3*((5+6)/2-4)"))
assert (interp.evaluate(Tree) == 8.5)
Tree = pT.buildParseTree(tok.tokenize("x = 2"))
assert (interp.evaluate(Tree) == 2)
Tree = pT.buildParseTree(tok.tokenize("y = 4^3"))
assert (interp.evaluate(Tree) == 64)
Tree = pT.buildParseTree(tok.tokenize("y^x*2-3"))
assert (interp.evaluate(Tree) == 8189)
Tree = pT.buildParseTree(tok.tokenize("(x+(2*y+2))-y*((5+x)/2-4)"))
assert (interp.evaluate(Tree) == 164)
Tree = pT.buildParseTree(tok.tokenize("sin(10)"))
assert (interp.evaluate(Tree) == -0.5440211108893698)
Tree = pT.buildParseTree(tok.tokenize("2^(5+1)"))
assert (interp.evaluate(Tree) == 64)
Tree = pT.buildParseTree(tok.tokenize("(2+1)^(2+1)"))
assert (interp.evaluate(Tree) == 27)
Пример #48
0
class Parser():
    def __init__(self, origin):
        self.tokens = Tokenizer(origin)
        self.tokens.selectNext()

    def parseProgram(self):
        token = self.tokens.actual
        if token.type == "program":
            token = self.tokens.selectNext()
            if token.type == "IDE":
                name_program = token.value
                token = self.tokens.selectNext()
                if token.type == "SEMI_COLON":
                    self.tokens.selectNext()
                    variables = self.parseVariables()
                    functions = self.parseFunctions()
                    statements = self.parseStatements()
                    result = Program(name_program,
                                     [variables, functions, statements])
                    token = self.tokens.actual
                    if token.type == "END_PROGRAM":
                        pass
                    else:
                        raise ValueError(
                            "Invalid token, expecting a . on position \
                                         {}".format(self.tokens.position))
                else:
                    raise ValueError("Invalid token, expecting a semi colon \
                        or a end on position {}".format(self.tokens.position))
        else:
            raise ValueError("Invalid token, expecting a program on \
                             position {}".format(self.tokens.position))
        return result

    def parseFunctionCall(self):
        pass

    def parseFunctions(self):
        token = self.tokens.actual
        result = Funcs(None, [])
        while True:
            if token.type == "function":
                token = self.tokens.selectNext()
                if token.type == "IDE":
                    function_name = token.value
                    func = FuncDec(function_name, [])
                    self.tokens.selectNext()
                    arguments = self.parseArgumentsFunction(function_name)
                    self.tokens.selectNext()
                    variables = self.parseVariables()
                    functions = self.parseFunctions()
                    statements = self.parseStatements()
                    func.children.append(arguments)
                    func.children.append(variables)
                    func.children.append(functions)
                    func.children.append(statements)
                    result.children.append(func)
                    token = self.tokens.actual
                else:
                    raise ValueError(
                        "Invalid token, expecting a identifier on position \
                                         {}".format(self.tokens.position))
            elif token.type == "begin":
                return result
            else:
                raise ValueError(
                    "Invalid token, expecting a function on position \
                                     {}".format(self.tokens.position))

    def parseArgumentsFunction(self, function_name):
        token = self.tokens.actual
        if token.type == "OPEN_PAR":
            list_arguments = []
            while True:
                token = self.tokens.selectNext()
                if token.type == "IDE":
                    list_arguments.append(token.value)
                    token = self.tokens.selectNext()
                    if token.type == "VAR_DECLARATION":
                        break
                    elif token.type == "COMMA":
                        pass
                    else:
                        raise ValueError(
                            "Invalid token, expecting a : or , on position \
                             {}".format(self.tokens.position))
                else:
                    raise ValueError(
                        "Invalid token, expecting a identifier on position \
                             {}".format(self.tokens.position))
            token = self.tokens.selectNext()
            if token.type == "TYPE":
                arguments = VarDec(None, [])
                for var_name in list_arguments:
                    var_name = StrVal(var_name, [])
                    value = StrVal(token.value, [])
                    variable = BinOp(":", [var_name, value])
                    arguments.children.append(variable)
                token = self.tokens.selectNext()
                if token.type == "CLOSE_PAR":
                    token = self.tokens.selectNext()
                    if token.type == "VAR_DECLARATION":
                        token = self.tokens.selectNext()
                        if token.type == "TYPE":
                            return_var_name = StrVal(function_name, [])
                            return_type = StrVal(token.value, [])
                            variable = BinOp(":",
                                             [return_var_name, return_type])
                            arguments.children.append(variable)
                            token = self.tokens.selectNext()
                            if token.type == "SEMI_COLON":
                                return arguments
                            else:
                                raise ValueError(
                                    "Invalid token, expecting a ; on position \
                             {}".format(self.tokens.position))
                        else:
                            raise ValueError(
                                "Invalid token, expecting a type on position \
                             {}".format(self.tokens.position))
                    else:
                        raise ValueError(
                            "Invalid token, expecting a : on position \
                             {}".format(self.tokens.position))
                else:
                    raise ValueError(
                        "Invalid token, expecting a ) on position \
                             {}".format(self.tokens.position))
            else:
                raise ValueError("Invalid token, expecting a type on position \
                             {}".format(self.tokens.position))
        else:
            raise ValueError("Invalid token, expecting a ( on position \
                             {}".format(self.tokens.position))

    def parseVariables(self):
        token = self.tokens.actual
        result = VarDec(None, [])
        if token.type != "begin":
            if token.type == "var":
                token = self.tokens.selectNext()
                while True:
                    list_vars = []
                    while True:
                        if token.type == "IDE":
                            list_vars.append(token.value)
                            token = self.tokens.selectNext()
                            if token.type == "COMMA":
                                token = self.tokens.selectNext()
                            elif token.type == "VAR_DECLARATION":
                                break
                            else:
                                raise ValueError(
                                    "Invalid token, expecting a , or : on position \
                                     {}".format(self.tokens.position))
                        else:
                            raise ValueError(
                                "Invalid token, expecting a identifier on position \
                                     {}".format(self.tokens.position))
                    token = self.tokens.selectNext()
                    if token.type == "TYPE":
                        for var_name in list_vars:
                            var_name = StrVal(var_name, [])
                            value = StrVal(token.value, [])
                            variable = BinOp(":", [var_name, value])
                            result.children.append(variable)
                        token = self.tokens.selectNext()
                        if token.type == "SEMI_COLON":
                            token = self.tokens.selectNext()
                            if token.type == "begin":
                                break
                            elif token.type == "function":
                                break
                            elif token.type == "IDE":
                                pass
                            else:
                                raise ValueError(
                                    "Invalid token, expecting a begin \
                                                 or identifier on position {}".
                                    format(self.tokens.position))
                        else:
                            raise ValueError(
                                "Invalid token, expecting a ; on position \
                                             {}".format(self.tokens.position))
                    else:
                        raise ValueError(
                            "Invalid token, expecting a type on position \
                                 {}".format(self.tokens.position))
            else:
                raise ValueError("Invalid token, expecting a var on position \
                                 {}".format(self.tokens.position))
        return result

    def parseStatements(self):
        token = self.tokens.actual
        if token.type == "begin":
            result = Statements(None, [])
            while True:
                self.tokens.selectNext()
                result.children.append(self.parseStatement())
                token = self.tokens.actual
                if token.type == "SEMI_COLON":
                    pass
                elif token.type == "end":
                    break
            if self.tokens.actual.type == "end":
                self.tokens.selectNext()
                pass
            else:
                raise ValueError("Invalid token, expecting a end on \
                                 position {}".format(self.tokens.position))
        else:
            raise ValueError("Invalid token, expecting a begin on \
                                 position {}".format(self.tokens.position))
        return result

    def parseStatement(self):
        token = self.tokens.actual
        if token.type == "begin":
            result = self.parseStatements()
        elif token.type == "IDE":
            result = self.parseAtribution()
        elif token.type == "print":
            result = self.parsePrint()
        elif token.type == "if":
            result = self.parseIf()
        elif token.type == "while":
            result = self.parseWhile()
        else:
            raise ValueError(
                "Invalid token, expecting a begin,identifier, print, if or while \
                             on position {}".format(self.tokens.position))
        return result

    def parseAtribution(self):
        value1 = StrVal(self.tokens.actual.value, [])
        token = self.tokens.selectNext()
        if (token.type == "ATRIBUTE"):
            token = self.tokens.selectNext()
            if (token.type == "read"):
                value2 = self.parseRead()
            else:
                value2 = self.parseExpression()
            result = BinOp(":=", [value1, value2])
        else:
            raise ValueError(
                "Invalid token, expecting a := on position {}".format(
                    self.tokens.position))
        return result

    def parsePrint(self):
        token = self.tokens.selectNext()
        if token.type == "OPEN_PAR":
            self.tokens.selectNext()
            value = self.parseExpression()
            token = self.tokens.actual
            if token.type == "CLOSE_PAR":
                result = Print(value, [value])
                self.tokens.selectNext()
            else:
                raise ValueError(
                    "Invalid token, expecting a ) on position {}".format(
                        self.tokens.position))
        else:
            raise ValueError(
                "Invalid token, expecting a ( on position {}".format(
                    self.tokens.position))
        return result

    def parseRelExpression(self):
        self.tokens.selectNext()
        value1 = self.parseExpression()
        token = self.tokens.actual
        if token.type == 'COMP':
            self.tokens.selectNext()
            value2 = self.parseExpression()
            result = BinOp(token.value, [value1, value2])
        else:
            raise ValueError("Invalid token, expecting a <, >, = or != \
                             on position {}".format(self.tokens.position))
        return result

    def parseIf(self):
        comp = self.parseRelExpression()
        token = self.tokens.actual
        if (token.type == "then"):
            self.tokens.selectNext()
            statement1 = self.parseStatement()
            token = self.tokens.actual
            if (token.type == "else"):
                self.tokens.selectNext()
                statement2 = self.parseStatement()
            else:
                statement2 = NoOp(None, [])
            result = If(None, [comp, statement1, statement2])
        else:
            raise ValueError("Invalid token, expecting a then on \
                             position {}".format(self.tokens.position))
        return result

    def parseRead(self):
        token = self.tokens.selectNext()
        if token.type == "OPEN_PAR":
            self.tokens.selectNext()
            token = self.tokens.actual
            if token.type == "CLOSE_PAR":
                result = Read(None, [])
                self.tokens.selectNext()
            else:
                raise ValueError(
                    "Invalid token, expecting a ) on position {}".format(
                        self.tokens.position))
        else:
            raise ValueError(
                "Invalid token, expecting a ( on position {}".format(
                    self.tokens.position))
        return result

    def parseWhile(self):
        comp = self.parseRelExpression()
        token = self.tokens.actual
        if (token.type == "then"):
            self.tokens.selectNext()
            statement1 = self.parseStatement()
            token = self.tokens.actual
            result = While(None, [comp, statement1])
        else:
            raise ValueError("Invalid token, expecting a then on \
                             position {}".format(self.tokens.position))
        return result

    def parseExpression(self):
        result = self.parseTerm()
        while True:
            token = self.tokens.actual
            if token is None:
                break
            if token.type == "PLUS":
                self.tokens.selectNext()
                second_value = self.parseTerm()
                result = BinOp("+", [result, second_value])
            elif token.type == "MINUS":
                self.tokens.selectNext()
                second_value = self.parseTerm()
                result = BinOp("-", [result, second_value])
            elif token.type == "or":
                self.tokens.selectNext()
                second_value = self.parseTerm()
                result = BinOp("or", [result, second_value])
            else:
                break
        return result

    def parseTerm(self):
        result = self.parseFactor()
        while True:
            token = self.tokens.actual
            if token is None:
                break
            elif token.type == "MULT":
                self.tokens.selectNext()
                second_value = self.parseFactor()
                result = BinOp("*", [result, second_value])
            elif token.type == "DIV":
                self.tokens.selectNext()
                second_value = self.parseFactor()
                result = BinOp("/", [result, second_value])
            elif token.type == "and":
                self.tokens.selectNext()
                second_value = self.parseFactor()
                result = BinOp("and", [result, second_value])
            else:
                break
        return result

    def parseFactor(self):
        token = self.tokens.actual
        if token is None:
            raise ValueError(
                "Invalid token, expecting a number or opening parentesis on \
                position {}, got NULL".format(self.tokens.position))
        if token.type == "int":
            result = IntVal(token.value, [])
            self.tokens.selectNext()
        elif token.type == "boolean":
            result = BoolVal(token.value, [])
            self.tokens.selectNext()
        elif token.type == "OPEN_PAR":
            self.tokens.selectNext()
            result = self.parseExpression()
            token = self.tokens.actual
            if token.type != "CLOSE_PAR":
                raise ValueError("Invalid token, missing parentesis close on \
                    position {}".format(self.tokens.position))
        elif token.type == "MINUS":
            self.tokens.selectNext()
            result = self.parseFactor()
            result = UnOp("-", [result])
        elif token.type == "not":
            self.tokens.selectNext()
            result = self.parseFactor()
            result = UnOp("not", [result])
        elif token.type == "PLUS":
            self.tokens.selectNext()
            result = self.parseFactor()
        elif token.type == "IDE":
            identifier = token.value
            token = self.tokens.selectNext()
            if token.type == "OPEN_PAR":
                token = self.tokens.selectNext()
                args = []
                while True:
                    if token.type == "CLOSE_PAR":
                        break
                    else:
                        arg = self.parseExpression()
                        args.append(arg)
                        token = self.tokens.actual
                        if token.type == "COMMA":
                            self.tokens.selectNext()
                            pass
                        elif token.type == "CLOSE_PAR":
                            break
                        else:
                            raise ValueError(
                                "Invalid token, expecting a , or ) on \
                                    position {}".format(self.tokens.position))
                none_value = IntVal(None, [])
                args.append(none_value)
                result = FuncCall(identifier, args)
                self.tokens.selectNext()
            else:
                result = Identifier(identifier, [])
        else:
            raise ValueError(
                "Invalid token, expecting number or opening parentesis on \
                position {}".format(self.tokens.position))
        return result
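# A hypothetical driver for the Parser above (not from the original example).
# The exact keyword and token spellings ("program", ":=", "integer", the final
# "end.") are guesses inferred from the token types used in the error messages.
source = """
program demo;
var x : integer;
begin
    x := 2 + 3 * 4;
    print(x)
end.
"""
ast = Parser(source).parseProgram()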
Пример #49
0
class Transformer(chainer.Chain):
    def __init__(self, config):
        self.config = config
        self.label_smoothing = config.label_smoothing
        self.position_encoding = self._init_position_encoding(
            config.max_length, config.unit_num)
        self.tokenizer = Tokenizer(config.tokenizer_dir, config.dict_dir,
                                   config.augmentation)

        frequency = []
        with open(config.freq_dir) as f:
            for line in f:
                line = line.rstrip()
                frequency.append(line)
        self.itf = 1 / (np.array(frequency, dtype=np.float32) +
                        1)**config.itf_lambda

        super(Transformer, self).__init__()
        with self.init_scope():
            self.source_embed = L.EmbedID(config.vocab_size,
                                          config.unit_num,
                                          ignore_label=config.pad_id)
            self.enc = Encoder(config)
            self.target_embed = L.EmbedID(config.vocab_size,
                                          config.unit_num,
                                          ignore_label=config.pad_id)
            self.dec = Decoder(config)

    def forward(self, x_s, x_t, translate=False):
        """
            args
                x_s: array of padded source sentences.
                x_t: array of padded target sentences.
                translate: whether this function is used for translation or not.
            returns
                dec_out: encoder-decoder model's output.
                enc_out: encoder's output used for translation.
        """
        length_s, length_t = x_s.shape[1], x_t.shape[1]
        h_s = self.source_embed(x_s)
        h_t = self.target_embed(x_t)
        h_s += self.xp.array(self.position_encoding[None, :length_s])
        h_t += self.xp.array(self.position_encoding[None, :length_t])
        h_s = F.transpose(h_s, (0, 2, 1))
        h_t = F.transpose(h_t, (0, 2, 1))

        src_self_mask = self._get_padding_mask(x_s, x_s, self.config.pad_id)
        tgt_self_mask = self._get_padding_mask(x_t, x_t, self.config.pad_id)
        tgt_future_mask = self._get_future_mask(x_t)
        tgt_self_mask *= tgt_future_mask
        src_tgt_mask = self._get_padding_mask(x_s, x_t, self.config.pad_id)

        enc_out = self.enc(h_s, src_self_mask)
        dec_out = self.dec(h_t, enc_out, tgt_self_mask, src_tgt_mask)

        B, D, L = dec_out.shape
        dec_out = F.transpose(dec_out, (0, 2, 1)).reshape(B * L, D)
        dec_out = F.linear(dec_out, self.target_embed.W)

        if translate:
            return dec_out, enc_out
        else:
            return dec_out

    def __call__(self, x_s, x_t):
        """
            args
                x_s: list of source sentences
                    ["こんにちは", "あああああ", ...]
                x_t: list of target sentence
                    ["こんにちは", "アババババ", ...]
            returns
                loss: calculated loss (Variable)
        """
        x_s = self.tokenizer.tokenize_sentences(x_s)
        x_t = self.tokenizer.tokenize_sentences(x_t)
        x_s = self._get_padded_sentence(x_s, pad_id=self.config.pad_id)
        x_t = self._get_padded_sentence(x_t,
                                        pad_id=self.config.pad_id,
                                        eos_id=self.config.eos_id)

        batch_t, length_t = x_t.shape
        y_t = copy.deepcopy(x_t).reshape((batch_t * length_t))
        bos_ids = self.xp.repeat(self.xp.array([self.config.bos_id],
                                               dtype=np.int32),
                                 batch_t,
                                 axis=0)[..., None]
        x_t = self.xp.concatenate([bos_ids, x_t[:, :length_t - 1]], axis=1)

        y_pred = self.forward(x_s, x_t)

        if self.label_smoothing:
            loss = self._label_smoothed_sce(y_pred,
                                            y_t,
                                            eps=self.config.smooth_eps,
                                            itf=self.itf,
                                            ignore_label=self.config.pad_id)
        else:
            loss = F.softmax_cross_entropy(y_pred,
                                           y_t,
                                           ignore_label=self.config.pad_id)

        accuracy = F.accuracy(y_pred, y_t, ignore_label=self.config.pad_id)
        perplexity = self.xp.exp(loss.data)
        # print("loss: {}, perp: {}, acc: {}".format(loss.data, perplexity, accuracy.data))
        chainer.report(
            {
                "loss": loss.data,
                "perp": perplexity,
                "acc": accuracy.data
            }, self)
        return loss

    def translate(self, x_s, max_length=65, beam=None):
        """
            args
                x_s: list of source sentences.
                    ["こんにちは", "あああああ", ...]
                max_length: max times of auto-regression
                beam: beam breadth in beam-search
                    '0' or 'None' means 'don't use beam-search'.
            returns
                translated: list of generated sentences (String).
        """
        batch_size = len(x_s)
        x_s = self.tokenizer.tokenize_sentences(x_s)
        x_s = self._get_padded_sentence(x_s, self.config.pad_id)
        x_t = self.xp.array([self.config.bos_id] * batch_size,
                            dtype=np.int32).reshape(batch_size, 1)
        eos_flags = self.xp.zeros((batch_size, 1), dtype=np.int32)
        y_pred, enc_out = self.forward(x_s, x_t, translate=True)

        with chainer.no_backprop_mode():
            with chainer.using_config("train", False):
                if beam:
                    # first search
                    # x_t, x_s shape: (batch, length) -> (batch*beam, length)
                    x_t = self.xp.concatenate([x_t[:, None, :]] * beam,
                                              axis=1).reshape(
                                                  beam * batch_size, 1)
                    x_s = self.xp.concatenate([x_s[:, None, :]] * beam,
                                              axis=1).reshape(
                                                  beam * batch_size,
                                                  x_s.shape[1])
                    scores = self.xp.zeros((batch_size * beam),
                                           dtype=np.float32)
                    candidates, s = self._get_beam_results(
                        y_pred.data, beam, 1)
                    scores += s
                    x_t = self.xp.concatenate([x_t, candidates[..., None]],
                                              axis=1)

                    x_t = self._beam_translate(max_length - 2, x_s, x_t, None,
                                               scores, max_length, beam)

                else:
                    x_t = self.xp.concatenate(
                        [x_t,
                         self.xp.argmax(y_pred.data, axis=1)[..., None]],
                        axis=1)

                    for i in range(max_length - 1):
                        y_pred = self._translate_forward(enc_out, x_s, x_t)
                        #print(i, self.xp.mean(y_pred.data), self.xp.max(y_pred.data), self.xp.min(y_pred.data))
                        y_inds = self.xp.argmax(y_pred.data,
                                                axis=1)[i + 1::i + 2, None]
                        x_t = self.xp.concatenate([x_t, y_inds], axis=1)
                        eos_flags += (y_inds == self.config.eos_id)
                        if self.xp.all(eos_flags):
                            break

        translated = [[] for i in range(batch_size)]
        for b, sentence in enumerate(x_t[:, 1:]):
            for w in sentence:
                if w == self.config.eos_id:
                    break
                translated[b].append(w)

        translated = self.tokenizer.detokenize_sentences(translated)
        return translated

    def _beam_translate(self, depth, x_s, x_t, enc_out, scores, max_length,
                        beam):
        """recurrent beam search for translate.
            args
                depth: controls the inference depth.
                    (this function is applied recursively)
                x_s: array of source sentences. (batch*beam, length)
                    Note this x_s is not the same as arg of 'translate' function.
                x_t: array of target sentences. (batch*beam, length)
                    this arg changes gradually in auto-regression.
                enc_out: encoder's output (fixed after calculated once)
                scores: candidates scores for selecting good output.
                max_length: max times of auto-regression.
                beam: beam breadth in beam-search.
            returns
                x_t: predicted (intermediate) sentence.
        """
        batch_size = len(x_t)
        if depth == max_length - 2:
            # y_pred shapes (batch*beam*2, vocab_size), and get candidates from y_pred
            y_pred, enc_out = self.forward(x_s, x_t, translate=True)
        else:
            y_pred = self._translate_forward(enc_out, x_s, x_t)

        candidates, s = self._get_beam_results(y_pred.data, beam,
                                               max_length - depth)

        # x_t shape -> (batch*beam*beam, L) -> (batch, beam*beam, L)
        x_t = self.xp.concatenate([x_t[:, None, :]] * beam, axis=1)
        x_t = x_t.reshape(beam * batch_size, max_length - depth)
        x_t = self.xp.concatenate([x_t, candidates[..., None]], axis=1)
        x_t = x_t.reshape(batch_size // beam, beam * beam,
                          max_length - depth + 1)

        # score the same as x_t
        scores = self.xp.concatenate([scores[:, None]] * beam, axis=1)
        scores = scores.reshape(beam * batch_size, )
        scores += s
        scores = scores.reshape(batch_size // beam, beam * beam)

        if depth == 0:
            best_sentence_ind = self.xp.argmax(scores, axis=1)
            x_t = x_t[self.xp.arange(batch_size // beam), best_sentence_ind]
            return x_t

        # sorting by scores, getting sentence-candidates for next depth.
        beam_indeces = self.xp.argsort(scores, axis=1)[:, ::-1][:, :beam]
        beam_indeces = self.xp.concatenate(beam_indeces, axis=0)
        batch_indeces = self.xp.arange(batch_size // beam)
        batch_indeces = self.xp.concatenate([batch_indeces[..., None]] * beam,
                                            axis=1)
        batch_indeces = batch_indeces.reshape(batch_size, )
        x_t = x_t[batch_indeces, beam_indeces]

        scores = self.xp.sort(scores, axis=1)[:, ::-1][:, :beam]
        scores = self.xp.concatenate(scores, axis=0)

        if self.xp.all(self.xp.any(x_t == 2, axis=1)):
            scores = scores.reshape(batch_size // beam, beam)
            best_sentence_ind = self.xp.argmax(scores, axis=1)
            x_t = x_t.reshape(batch_size // beam, beam, x_t.shape[1])
            x_t = x_t[self.xp.arange(batch_size // beam), best_sentence_ind]
            return x_t

        x_t = self._beam_translate(depth - 1, x_s, x_t, enc_out, scores,
                                   max_length, beam)

        return x_t

    def _get_beam_results(self, y_pred, beam, position):
        """beam results should be (batch*beam, length).
            args
                y_pred: decoder's output in auto-regression.
                beam: number of candidates to keep (beam size).
                position: specifies where candidates should be taken from.
                    if position is 2, the <> positions below will be candidates.
                    [<batch_0>, batch_1, <batch_2>, batch_3, ..., <batch_2n>]
            returns
                candidates: top-beam candidates from y_pred.
                scores: top-beam scores from y_pred.
        """
        candidates = self.xp.argsort(y_pred)[:, ::-1][position -
                                                      1::position, :beam]
        candidates = self.xp.concatenate(candidates, axis=0)
        scores = self.xp.sort(y_pred)[:, ::-1][position - 1::position, :beam]
        scores = self.xp.concatenate(scores, axis=0)

        return candidates, scores

    def _translate_forward(self, enc_out, x_s, x_t):
        """reusing enc_out for efficient calculation.
            args
                enc_out: encoder's output (fixed after calculated once)
                x_s: array of source sentences.
                    Note this x_s is not the same as arg of 'translate' function.
                x_t: array of target sentences.
                    this arg changes gradually in auto-regression.
            returns
                dec_out: decoder's output
        """
        length_t = x_t.shape[1]
        h_t = self.target_embed(x_t)
        h_t += self.position_encoding[None, :length_t]
        h_t = F.transpose(h_t, (0, 2, 1))

        tgt_self_mask = self._get_padding_mask(x_t, x_t, self.config.pad_id)
        tgt_future_mask = self._get_future_mask(x_t)
        tgt_self_mask *= tgt_future_mask
        src_tgt_mask = self._get_padding_mask(x_s, x_t, self.config.pad_id)

        dec_out = self.dec(h_t, enc_out, tgt_self_mask, src_tgt_mask)

        B, D, L = dec_out.shape
        dec_out = F.transpose(dec_out, (0, 2, 1)).reshape(B * L, D)
        dec_out = F.linear(dec_out, self.target_embed.W)

        return dec_out

    def _init_position_encoding(self, max_length, unit_num):
        half_dim = unit_num // 2
        dim_positions = -(np.arange(half_dim) * 2 / unit_num)
        dim_positions = 10000**dim_positions

        word_positions = np.arange(max_length)
        general_encode = word_positions[..., None] * dim_positions[None, ...]
        even_dims = np.sin(general_encode)
        odd_dims = np.cos(general_encode)

        position_encoding = np.concatenate(
            [even_dims[..., None], odd_dims[..., None]], axis=2)
        position_encoding = position_encoding.reshape(max_length, unit_num)

        return position_encoding.astype(np.float32)
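        # (added note) This builds the standard sinusoidal position encoding:
        #   PE(pos, 2i)   = sin(pos / 10000^(2i/d_model))
        #   PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))
        # the concatenate + reshape above interleaves the sin/cos pairs along
        # the feature axis.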

    def _get_padded_sentence(self, xs, pad_id, eos_id=None):
        batch_size = len(xs)
        max_length = max([len(x) for x in xs])

        if eos_id:
            padded_sentence = self.xp.full((batch_size, max_length + 2),
                                           pad_id,
                                           dtype=np.int32)
            for i, x in enumerate(xs):
                x_eos = x + [eos_id]
                padded_sentence[i, :len(x_eos)] = self.xp.array(x_eos,
                                                                dtype=np.int32)
        else:
            padded_sentence = self.xp.full((batch_size, max_length),
                                           pad_id,
                                           dtype=np.int32)
            for i, x in enumerate(xs):
                padded_sentence[i, :len(x)] = self.xp.array(x, dtype=np.int32)

        return padded_sentence

    def _get_padding_mask(self, key, query, pad_id):
        """
            args
                key: key in attention.
                    in source-target attention, this means 'source'
                    shape is (batch, length).
                query: query in attention.
                    in source-target attention, this means 'target'
                    shape is (batch, length).
            returns
                mask: (batch, q-length, k-length) shape xp-array.
        """
        query_mask = query != pad_id
        key_mask = key != pad_id
        mask = key_mask[:, None, :] * query_mask[..., None]
        return mask

    def _get_future_mask(self, x):
        """
            args
                x: target's input array
                    shape is (batch, length)
            returns
                mask: mask for future-ignoring.
                    when batch is 1 and length is 4,
                    [[[ True, False, False, False],
                      [ True,  True, False, False],
                      [ True,  True,  True, False],
                      [ True,  True,  True,  True]]]
                    will be returned.
        """
        batch, length = x.shape
        arange = self.xp.arange(length)
        future_mask = (arange[None, ] <= arange[:, None])[None, ...]
        future_mask = self.xp.concatenate([future_mask] * batch, axis=0)
        return future_mask

    def _label_smoothed_sce(self, y, t, eps, itf, ignore_label=None):
        """note: variable 'batch_size' means batch*length of the task.
            args
                y: model output (batch*length, vocab_size)
                t: ground truth (batch*length, )
                    this value is index of truth word in vocab.
                eps: epsilon for label-smoothing.
                itf: array of inverse token frequency.
                ignore_label: word which should be ignored in the calculation.
            returns
                loss: loss (Variable) between y and label-smoothed-t.
        """
        xp = chainer.cuda.get_array_module(t)
        batch_size, vocab_size = y.shape
        func_u = eps / vocab_size

        smoothed_t = xp.zeros_like(y.data).astype(np.float32)
        smoothed_t[xp.arange(batch_size), t] = 1 - eps  # + func_u
        smoothed_t += func_u

        loss = F.log_softmax(y) * smoothed_t
        normalizer = batch_size
        if ignore_label:
            ignore_mask = t != ignore_label
            normalizer = xp.sum(ignore_mask)
            loss = ignore_mask[..., None] * loss

        loss = loss * self.xp.array(itf[None, ...], dtype=np.float32)
        loss = -F.sum(loss) / normalizer

        return loss
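# A hedged usage sketch (not from the original code): `config` is assumed to
# provide every field referenced in __init__ above (vocab_size, unit_num,
# pad_id, freq_dir, tokenizer_dir, ...); sentences are passed as raw strings.
model = Transformer(config)
optimizer = chainer.optimizers.Adam()
optimizer.setup(model)

loss = model(["こんにちは"], ["こんにちは"])   # forward pass + loss on a toy batch
model.cleargrads()
loss.backward()
optimizer.update()

print(model.translate(["こんにちは"], max_length=30, beam=None))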
Пример #50
0
 def __init__(self, origin):
     self.tokens = Tokenizer(origin)
     self.tokens.selectNext()
Пример #51
0
class AttentionTFIDFClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self,
                 hiddens=300,
                 mindf=2,
                 lan='english',
                 stopwords='nltk',
                 k=512,
                 max_drop=.85,
                 batch_size=64,
                 lr=5e-3,
                 weight_decay=5e-3,
                 nepochs=1000,
                 patience=10,
                 factor=.95,
                 vocab_max_size=300000,
                 n_jobs=cpu_count(),
                 _device=torch.device('cuda:0'),
                 _verbose=False):
        super(AttentionTFIDFClassifier, self).__init__()

        self._model = None
        self._tokenizer = None
        self.nepochs = int(nepochs)
        self.hiddens = int(hiddens)
        self.mindf = int(mindf)
        self.lan = lan
        self.stopwords = stopwords
        self.k = int(k)
        self.max_drop = max_drop
        self.vocab_max_size = vocab_max_size
        self._verbose = _verbose
        self._device = _device

        self.n_jobs = int(n_jobs)

        self.lr = lr
        self.weight_decay = weight_decay
        self.patience = int(patience)
        self.factor = factor
        self.batch_size = int(batch_size)

        def collate_train(param):
            X, y = zip(*param)
            y = self._tokenizer.le.transform(y)
            doc_tids, TFs, DFs = self._tokenizer.transform(X, verbose=False)

            doc_tids = pad_sequence(list(map(torch.LongTensor, doc_tids)),
                                    batch_first=True,
                                    padding_value=0)

            TFs = pad_sequence(list(map(torch.tensor, TFs)),
                               batch_first=True,
                               padding_value=0)
            TFs = torch.LongTensor(torch.log2(TFs + 1).round().long())

            DFs = pad_sequence(list(map(torch.tensor, DFs)),
                               batch_first=True,
                               padding_value=0)
            DFs = torch.LongTensor(torch.log2(DFs + 1).round().long())

            return doc_tids, TFs, DFs, torch.LongTensor(y)

        def collate_predict(X):
            doc_tids, TFs, DFs = self._tokenizer.transform(X, verbose=False)

            doc_tids = pad_sequence(list(map(torch.LongTensor, doc_tids)),
                                    batch_first=True,
                                    padding_value=0)

            TFs = pad_sequence(list(map(torch.tensor, TFs)),
                               batch_first=True,
                               padding_value=0)
            TFs = torch.LongTensor(torch.log2(TFs + 1).round().long())

            DFs = pad_sequence(list(map(torch.tensor, DFs)),
                               batch_first=True,
                               padding_value=0)
            DFs = torch.LongTensor(torch.log2(DFs + 1).round().long())

            return doc_tids, TFs, DFs

        self.collate_train = collate_train
        self.collate_predict = collate_predict

    def fit(self, X_train, y_train, X_val=None, y_val=None):
        if X_val is None or y_val is None:
            pass
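        # NOTE: no automatic train/validation split is performed here; X_val and
        # y_val must be provided, since dl_val below is built directly from them.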
        self._tokenizer = Tokenizer(mindf=self.mindf,
                                    lan=self.lan,
                                    stopwordsSet=self.stopwords,
                                    model='sample',
                                    k=self.k,
                                    verbose=self._verbose)
        self._tokenizer.fit(X_train, y_train)

        self.maxF = int(round(np.log2(self._tokenizer.maxF + 1)))

        self._model = AttentionTFIDF(vocab_size=self._tokenizer.vocab_size,
                                     hiddens=self.hiddens,
                                     nclass=self._tokenizer.n_class,
                                     maxF=self.maxF,
                                     drop=self.max_drop).to(self._device)

        optimizer = optim.AdamW(self._model.parameters(),
                                lr=self.lr,
                                weight_decay=self.weight_decay)
        loss_func_cel = nn.CrossEntropyLoss().to(self._device)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='min',
            factor=self.factor,
            patience=3,
            verbose=self._verbose)

        best = 99999.
        best_acc = 0.
        counter = 1
        dl_val = DataLoader(list(zip(X_val, y_val)),
                            batch_size=self.batch_size,
                            shuffle=False,
                            collate_fn=self.collate_train,
                            num_workers=self.n_jobs)

        for e in tqdm(range(self.nepochs),
                      total=self.nepochs,
                      disable=not self._verbose):
            dl_train = DataLoader(list(zip(X_train, y_train)),
                                  batch_size=self.batch_size,
                                  shuffle=True,
                                  collate_fn=self.collate_train,
                                  num_workers=self.n_jobs)
            loss_train = 0.
            with tqdm(total=len(y_train) + len(y_val),
                      smoothing=0.,
                      desc=f"ACC_val: {best_acc:.2} Epoch {e+1}",
                      disable=not self._verbose) as pbar:
                total = 0
                correct = 0
                self._model.train()
                self._tokenizer.model = 'sample'
                for i, (doc_tids, TFs, DFs, y) in enumerate(dl_train):

                    doc_tids = doc_tids.to(self._device)
                    TFs = TFs.to(self._device)
                    DFs = DFs.to(self._device)
                    y = y.to(self._device)

                    pred_docs, _, _ = self._model(doc_tids, TFs, DFs)
                    pred_docs = torch.softmax(pred_docs, dim=1)
                    loss = loss_func_cel(pred_docs, y)

                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                    loss_train += loss.item()
                    total += len(y)
                    y_pred = pred_docs.argmax(axis=1)
                    correct += (y_pred == y).sum().item()
                    self._model.drop_ = (correct / total) * self.max_drop

                    pbar.update(len(y))
                    del doc_tids, TFs
                    del DFs, y, pred_docs
                    del loss, y_pred
                loss_train = loss_train / (i + 1)
                total = 0
                correct = 0
                self._model.eval()
                self._tokenizer.model = 'topk'
                with torch.no_grad():
                    loss_val = 0.
                    for i, (doc_tids, TFs, DFs, y) in enumerate(dl_val):
                        doc_tids = doc_tids.to(self._device)
                        TFs = TFs.to(self._device)
                        DFs = DFs.to(self._device)
                        y = y.to(self._device)

                        pred_docs, _, _ = self._model(doc_tids, TFs, DFs)
                        pred_docs = torch.softmax(pred_docs, dim=1)
                        loss = loss_func_cel(pred_docs, y)

                        loss_val += loss.item()
                        total += len(y)
                        y_pred = pred_docs.argmax(axis=1)
                        correct += (y_pred == y).sum().item()
                        pbar.update(len(y))
                        del doc_tids, TFs, DFs, y
                        del pred_docs, loss
                    loss_val = (loss_val / (i + 1))
                    scheduler.step(loss_val)

                    if best - loss_val > 0.0001:
                        best = loss_val
                        counter = 1
                        best_acc = correct / total
                        best_model = copy.deepcopy(self._model).to('cpu')
                    elif counter > self.patience:
                        break
                    else:
                        counter += 1

        self._model = best_model.to(self._device)

        self._loss = best
        self._acc = best_acc

        return self

    def predict(self, X):
        if self._model is None or self._tokenizer is None:
            raise Exception("Not implemented yet!")
        self._model.eval()
        self._tokenizer.model = 'topk'
        dataloader = DataLoader(X,
                                batch_size=self.batch_size,
                                shuffle=False,
                                collate_fn=self.collate_predict,
                                num_workers=self.n_jobs)
        result = []
        with torch.no_grad():
            loss_val = 0.
            for i, (doc_tids, TFs, DFs) in enumerate(dataloader):
                doc_tids = doc_tids.to(self._device)
                TFs = TFs.to(self._device)
                DFs = DFs.to(self._device)

                pred_docs, _, _ = self._model(doc_tids, TFs, DFs)
                pred_docs = torch.softmax(
                    pred_docs, dim=1).argmax(axis=1).cpu().detach().numpy()
                result.extend(list(pred_docs))
        return self._tokenizer.le.inverse_transform(np.array(result))

    def to(self, device):
        self._device = device
        if self._model is not None:
            self._model.to(self._device)
        return self
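# A minimal usage sketch (the dataset variables are placeholders, not from the
# original): the estimator follows the scikit-learn fit/predict convention but
# requires an explicit validation split for early stopping.
clf = AttentionTFIDFClassifier(hiddens=300, k=512, nepochs=50, _verbose=True)
clf.fit(X_train, y_train, X_val, y_val)   # raw documents and their labels
y_pred = clf.predict(X_test)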
Пример #52
0
from tokenizer import Tokenizer
from transformer.frontend import NLP
import os

#os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

num_merges = 40000
tokenization_path = f"tokenization_{num_merges}.json"
input_path = 'input.txt'

tokenizer = Tokenizer()
if os.path.exists(tokenization_path):
    tokenizer.load(tokenization_path)
else:
    tokenizer.from_file(input_path, num_merges)
    tokenizer.save(tokenization_path)

nlp = NLP(tokenizer,
          maximum_position_encoding=1000,
          d_model=64,
          num_layers=5,
          dff=1024,
          num_heads=8)
with open(input_path, 'r', encoding='utf-8') as f:
    nlp.train(f.read(), prev_tokens=128, epochs=10, evaluate_str=[
        'XD',
    ])
prompt = input()
nlp.generate_text(prompt, length=200)
#while True:
#    output, in_tokens, translated_tokens, attention_weights = nlp.evaluate(str)
Пример #53
0
config = run.config

df = pd.read_csv('tweets.csv')
target = df['is_there_an_emotion_directed_at_a_brand_or_product']
text = df['tweet_text']

fixed_text = text[pd.notnull(text)]
fixed_target = target[pd.notnull(text)]

w2v = {}
with open("glove/glove.6B.50d.txt", "r") as lines:
    for line in lines:
        word, numbers = line.split(" ", 1)
        number_array = np.array(numbers.split()).astype(float)
        w2v[word] = number_array

text_clf = Pipeline([
    ('token', Tokenizer()),
    ('vect', MeanEmbeddingVectorizer(w2v)),
    ("extra trees", ExtraTreesClassifier(n_estimators=200)),
])

text_clf.fit(fixed_text, fixed_target)

scores = cross_val_score(text_clf, fixed_text, fixed_target)
print(scores)
print(scores.mean())

predictions = cross_val_predict(text_clf, fixed_text, fixed_target)
log(run, fixed_text, fixed_target, predictions)
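# MeanEmbeddingVectorizer is not shown in this example; a common implementation
# (this sketch is only a guess at its behaviour, not the original code) averages
# the GloVe vectors of the tokens in each document and is usable inside the
# Pipeline above.
import numpy as np

class MeanEmbeddingVectorizerSketch:
    def __init__(self, w2v):
        self.w2v = w2v
        self.dim = len(next(iter(w2v.values())))

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # X: iterable of token lists; unknown words fall back to a zero vector
        return np.array([
            np.mean([self.w2v[w] for w in doc if w in self.w2v]
                    or [np.zeros(self.dim)], axis=0)
            for doc in X
        ])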
Пример #54
0
 def __init__(self):
     self.src_vocab = Tokenizer.en_vocab_create()
     self.trg_vocab = Tokenizer.ja_vocab_create()
Пример #55
0
import numpy as np
from tqdm import tqdm
import librosa
from scipy.io import wavfile
from sklearn.model_selection import train_test_split

import hyperparameter as hp
from tokenizer import Tokenizer
from data_hdf5 import HDF5DatasetWriter
from audio import Audio

MAX_LEN_TEXT = 300
MAX_LEN_AUDIO = 1595

audio = Audio(hp)
tokenizer = Tokenizer(alphabet=hp.alphabet)

data_list = os.listdir(hp.data_path)
tokens = []

audio_links = []
label_links = []


def process_wav(wav_path):
    y, sr = audio.load_wav(wav_path)
    mel = audio.mel_spectrogram(y)
    assert len(mel.shape) == 2 and mel.shape[1] == audio.config.mel_channels
    start_token = np.ones((1, hp.mel_channels)) * hp.mel_start_value
    end_token = np.ones((1, hp.mel_channels)) * hp.mel_end_value
    mel = np.concatenate([start_token, mel], 0)
Пример #56
0
 def compute_token_ids(self):
     parser = Tokenizer(self.args.token_args)
     return parser.token2id()
Пример #57
0
    def __init__(self,
                 encoder_model_dimension: int,
                 decoder_model_dimension: int,
                 encoder_num_heads: list,
                 decoder_num_heads: list,
                 encoder_maximum_position_encoding: int,
                 decoder_maximum_position_encoding: int,
                 encoder_dense_blocks: int,
                 decoder_dense_blocks: int,
                 encoder_prenet_dimension: int,
                 decoder_prenet_dimension: int,
                 postnet_conv_filters: int,
                 postnet_conv_layers: int,
                 postnet_kernel_size: int,
                 dropout_rate: float,
                 mel_start_value: float,
                 mel_end_value: float,
                 mel_channels: int,
                 encoder_attention_conv_filters: int = None,
                 decoder_attention_conv_filters: int = None,
                 encoder_attention_conv_kernel: int = None,
                 decoder_attention_conv_kernel: int = None,
                 encoder_feed_forward_dimension: int = None,
                 decoder_feed_forward_dimension: int = None,
                 decoder_prenet_dropout=0.5,
                 max_r: int = 10,
                 **kwargs):
        super(AutoregressiveTransformer, self).__init__(**kwargs)
        self.start_vec = tf.ones((1, mel_channels), dtype=tf.float32) * mel_start_value
        self.end_vec = tf.ones((1, mel_channels), dtype=tf.float32) * mel_end_value
        self.stop_prob_index = 2
        self.max_r = max_r
        self.r = max_r
        self.mel_channels = mel_channels
        self.drop_n_heads = 0
        self.tokenizer = Tokenizer(alphabet=hp.alphabet)
        self.encoder_prenet = tf.keras.layers.Embedding(self.tokenizer.vocab_size,
                                                        encoder_prenet_dimension,
                                                        name='Embedding')
        self.encoder = SelfAttentionBlocks(model_dim=encoder_model_dimension,
                                           dropout_rate=dropout_rate,
                                           num_heads=encoder_num_heads,
                                           feed_forward_dimension=encoder_feed_forward_dimension,
                                           maximum_position_encoding=encoder_maximum_position_encoding,
                                           dense_blocks=encoder_dense_blocks,
                                           conv_filters=encoder_attention_conv_filters,
                                           kernel_size=encoder_attention_conv_kernel,
                                           conv_activation='relu',
                                           name='Encoder')
        self.decoder_prenet = DecoderPrenet(model_dim=decoder_model_dimension,
                                            dense_hidden_units=decoder_prenet_dimension,
                                            dropout_rate=decoder_prenet_dropout,
                                            name='DecoderPrenet')
        self.decoder = CrossAttentionBlocks(model_dim=decoder_model_dimension,
                                            dropout_rate=dropout_rate,
                                            num_heads=decoder_num_heads,
                                            feed_forward_dimension=decoder_feed_forward_dimension,
                                            maximum_position_encoding=decoder_maximum_position_encoding,
                                            dense_blocks=decoder_dense_blocks,
                                            conv_filters=decoder_attention_conv_filters,
                                            conv_kernel=decoder_attention_conv_kernel,
                                            conv_activation='relu',
                                            conv_padding='causal',
                                            name='Decoder')
        self.final_proj_mel = tf.keras.layers.Dense(self.mel_channels * self.max_r, name='FinalProj')
        self.decoder_postnet = Postnet(mel_channels=mel_channels,
                                       conv_filters=postnet_conv_filters,
                                       conv_layers=postnet_conv_layers,
                                       kernel_size=postnet_kernel_size,
                                       name='Postnet')

        self.training_input_signature = [
            tf.TensorSpec(shape=(None, None), dtype=tf.int32),
            tf.TensorSpec(shape=(None, None, mel_channels), dtype=tf.float32),
            tf.TensorSpec(shape=(None, None), dtype=tf.int32)
        ]
        self.forward_input_signature = [
            tf.TensorSpec(shape=(None, None), dtype=tf.int32),
            tf.TensorSpec(shape=(None, None, mel_channels), dtype=tf.float32),
        ]
        self.encoder_signature = [
            tf.TensorSpec(shape=(None, None), dtype=tf.int32)
        ]
        self.decoder_signature = [
            tf.TensorSpec(shape=(None, None, encoder_model_dimension), dtype=tf.float32),
            tf.TensorSpec(shape=(None, None, mel_channels), dtype=tf.float32),
            tf.TensorSpec(shape=(None, None, None, None), dtype=tf.float32),
        ]
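        # (hedged note, not from the original class) input-signature lists like
        # the ones above are typically used to wrap step functions with
        # tf.function so variable-length padded batches do not trigger
        # retracing, e.g. (hypothetical method name):
        #   self.forward_fn = tf.function(self._forward,
        #                                 input_signature=self.forward_input_signature)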
Пример #58
0
    for token_sequence in sequences:
        words.extend(token_sequence)

    word_counts = dict(Counter(words).most_common(max_words))

    most_common_words = list(word_counts.keys())
    word_ids = list(range(len(most_common_words)))

    vocabulary = dict(zip(most_common_words, word_ids))
    return vocabulary


sentences = np.genfromtxt('./tickets_QIT.txt', dtype=str, delimiter='\n')

prep = TextPreprocessor(sentences)
prep = QITEmailBodyCleaner(prep)
prep = Tokenizer(prep, language='italian')
tokens = prep.preprocess()
vocabulary = build_vocabulary(tokens)

unknown_token_id = max(vocabulary.values()) + 1
prep = IntegerEncoder(prep, vocabulary, unknown_token_id)
prep = WordContextPairsGenerator(prep, window_length=2)

word_context_pairs = prep.preprocess()
target_words = [tw for (tw, cw) in word_context_pairs]
context_words = [cw for (tw, cw) in word_context_pairs]

np.savetxt('target_words.txt', target_words, fmt='%d')
np.savetxt('context_words.txt', context_words, fmt='%d')
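# A small follow-up sketch (file names taken from the script above): the saved
# id pairs can be reloaded as skip-gram (target, context) training pairs.
reloaded_targets = np.loadtxt('target_words.txt', dtype=int)
reloaded_contexts = np.loadtxt('context_words.txt', dtype=int)
training_pairs = list(zip(reloaded_targets, reloaded_contexts))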
Пример #59
0
    f_xml = xml.dom.minidom.parseString(root_str)
    pretty_xml_as_string = f_xml.toprettyxml(encoding="utf-8")
    f = open(out_file, 'wb')
    f.write(pretty_xml_as_string)
    f.close()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('rsd_input',
                        type=str,
                        help='input rsd file path or directory.')
    parser.add_argument('ltf_output',
                        type=str,
                        help='output ltf file path or directory.')
    t = Tokenizer()
    parser.add_argument(
        '--seg_option',
        default='linebreak',
        help="segmentation options: %s (default is linebreak)" %
        ', '.join(t.segmenters.keys()))
    parser.add_argument('--tok_option',
                        default='unitok',
                        help="tokenization options: %s (default is unitok)" %
                        ', '.join(t.tokenizers.keys()))
    parser.add_argument('--extension',
                        default='.rsd.txt',
                        help="extension of rsd file")
    parser.add_argument('--re_segment',
                        action='store_true',
                        default=False,
Пример #60
0
        small_diff - the maximum allowed difference between the two most probable rule variants,
        """
        return [self.parse_sent(sentence, radius, suff_len, small_diff, process_cases) for sentence in self.make_sents(self.lemmatize(tokens))]

if __name__ == "__main__":

    filename = os.path.join(os.path.dirname(sys.argv[0]), "test/freview.txt")
    trainfile = os.path.join(os.path.dirname(sys.argv[0]),"dicts/ruscorpora.txt.lemma")
    prepsfile = os.path.join(os.path.dirname(sys.argv[0]),"corpora/preps_stat.txt")
    
    print "STARTED:", str(datetime.now())
    start = time.time()

    morph = get_morph(os.path.join(os.path.dirname(sys.argv[0]),"pydicts").decode("UTF8"))  # Load the Russian dictionary
    morph_simple = get_morph(os.path.join(os.path.dirname(sys.argv[0]),"pydicts").decode("UTF8"), check_prefixes=False) # Load the Russian dictionary (without prefix checking)
    tok = Tokenizer()   # Load the tokenizer
    dater = Dater() # Load the date processor
    tagger = Tagger(morph, morph_simple, dater)  # Load the tagger
    #t = time.time()
    #tagger.prepare_cases(trainfile)
    #print "Cases prepared! It took", time.time() - t
    #t = time.time()
    #tagger.train_cases(trainfile + ".cases") # Train the tagger on cases
    #print "Cases trained! It took", time.time() - t
    tagger.prepare_corpus(trainfile, 3)
    tagger.prepare_corpus(trainfile, 4)
    tagger.prepare_corpus(trainfile, 5)
    print "Corpus prepared!"
    tagger.train(trainfile + ".03.suffs", 3) # Train the tagger on suffixes
    tagger.load_statistics(trainfile, 3)   # Load the suffix statistics
    #tagger.dump_preps(prepsfile)   # Dump case rules conditioned on prepositions to a text file