def ext_json():
    rdfUrl = ''
    tok = Tokenizer()
    if request.method == 'POST':
        rdf = request.form['data']
        status_test = "0"  # request.form['status']
        filters = ""  # request.form['exculdeurls']
        # rdf = "http://jpp.no-ip.org/MAD_J.rdf"
        try:
            # r = requests.get(rdf)
            gg = Graph()
            # g.load(rdfUrl)
            rdf_content = StringIO.StringIO(rdf.encode('utf-8'))
            # print rdf_content.readline()
            gg.parse(rdf_content, format="xml")
            ext = Extractor(gg)
            uris = ext.getUris()
            mapping = MapFactory()
            for uri in uris:
                term = tok.tokenized_url(uri)
                uri_status = ""
                if status_test == "1":
                    uri_status = ext.testUri(uri)
                else:
                    uri_status = "N/A"
                uri_lookup = str(uri) + "\""
                lnum = ext.get_lines(rdf_content, uri_lookup)
                ent = MapEntry(uri, term, "", lnum, uri_status)
                mapping.add(ent)
            jsonized_result = json.dumps(mapping.get())
            return Response(jsonized_result, mimetype='application/json')
        except requests.exceptions.ConnectionError:
            X2Rwarning = 'X2R Warning: The requested URL raises ConnectionError~!!!'
            return X2Rwarning
def main():
    ## args
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', '--reviews', required=True, help='Review data file')
    parser.add_argument('-o', '--out', required=True, help='Inverted index output file')
    parser.add_argument('-s', '--stop', required=True, help='Stopword list')
    opts = parser.parse_args()

    ## Output file
    csv_writer = csv.writer(open(opts.out, 'w'), delimiter="\t")
    csv_writer.writerow(['token', 'business_id', 'review_id', 'position', '...'])

    ## Tokenizer
    tk = Tokenizer(opts.stop)
    token_map = defaultdict(list)

    ## Tokenize review texts
    # for each word in the vocabulary (in this case all words found in all reviews):
    # business id, review id, and position of each term occurrence
    # instead of using the review id, uses the line on which the review occurs as a unique identifier
    reviews = open(opts.reviews)
    for review_num, line in enumerate(reviews):
        review = json.loads(line)
        business_id = review['business_id'].encode('utf-8')
        tokens = tk.tokenize(review['text'])
        for position, word in enumerate(tokens):
            token_map[word].append((business_id, review_num, position))

    ## Print sorted inverted index
    for token in sorted(token_map):
        row = [token]
        row.extend(token_map[token])
        csv_writer.writerow(row)
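Not part of the original script: a small stdlib-only illustration of the row layout the inverted-index writer above produces. Since `csv.writer` stringifies non-string cells, each `(business_id, review_num, position)` posting tuple is written out with its `str()` form; the token and business id values below are made up.

import csv
import sys

# Hypothetical illustration of one inverted-index row: csv.writer converts each
# posting tuple to its string form and separates cells with the tab delimiter.
csv_writer = csv.writer(sys.stdout, delimiter="\t")
postings = [("b_001", 0, 3), ("b_002", 17, 42)]
csv_writer.writerow(["pizza"] + postings)
# pizza	('b_001', 0, 3)	('b_002', 17, 42)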
def _tokenize_tweet(self, tweet):
    """
    Input: tweet (String)
    Output: List of tokens
    """
    tok = Tokenizer(preserve_case=False)
    return tok.tokenize(tweet)
def main(args):
    try:
        (opts, args) = getopt(args, "o:TPX")
    except GetoptError:
        usage()
    if len(args) != 1:
        usage()
    from tokenizer import Tokenizer
    from parser import Parser
    from error import JtError
    import context
    from os.path import abspath
    filename = abspath(args[0])
    stdin = file(filename, "r")
    target = "P"
    stdout = sys.stdout
    for (ok, ov) in opts:
        if ok in ("-T", "-P", "-X"):
            target = ok[1]
        elif ok == "-o":
            stdout = file(ov, "w")
    contents = stdin.read()
    tokenizer = Tokenizer()
    tokenizer.build()
    tokenizer.input(contents)
    parser = Parser(tokenizer)
    result_tree = None
    try:
        result_tree = parser.parse()
    except JtError, error:
        failure(error)
def execute(self):
    if len(self.proj_paths) > 0:
        logging.info('Starting tokenizer. Producibles (logs, output, etc) can be found under the name ' + self.target_folders)
        tokenizer = Tokenizer(self.proj_paths, self.DB_user, self.DB_pass, self.DB_name, logging,
                              self.logs_folder, self.output_folder, self.N_PROCESSES,
                              self.BATCH_SIZE, self.PROJECTS_CONFIGURATION)
        tokenizer.execute()
    else:
        logging.warning('The list of new projects is empty (or these are already on the DB).')
def tokenize(self, **kwargs):
    """
    Returns the tokenized string using a parser.
    """
    string_tokenizer = Tokenizer()
    return string_tokenizer.tokenize(kwargs.get("text"), kwargs.get("parser"))
def main():
    tok = Tokenizer()
    mapping = MapFactory()
    uris = ["http://abc.ee.ntu/alf_123", "http://sc.e.ncli.ABCdefGU"]
    for uri in uris:
        term = tok.tokenized_url(uri)
        ent = MapEntry(uri, term, "", "", "")
        mapping.add(ent)
    jsonized_result = json.dumps(mapping.get())
    print jsonized_result
def interpret_line(self, line):
    tokenizer = Tokenizer()
    tokenizer.parse(line)
    first_token = tokenizer.getNextToken()
    if first_token.type == Token.NUMBER:
        self.lines[int(first_token.value)] = tokenizer.prog[tokenizer.pos:]
        self.sort_lines()
    else:
        self.run_line(line)
def testExecutionTreeWithItemAssignment(self):
    c = ExpressionCompiler()
    tokenizer = Tokenizer()
    tokenizer.tokenize("A[B]= 1 + R")
    tokenizer.next()
    expr = c.compile(tokenizer)
    exec_tree = expr.get_execution_tree()
    print "Expression Tree %s\n" % (exec_tree)
    self.assertEqual(
        "( = ( item_assign ( literal A ) ( index ( literal B ) ) ) ( + ( literal 1.0 ) ( literal R ) ) )",
        exec_tree
    )
    # a little bit more complex
    tokenizer.tokenize("A[B+(C*3)+1]= 1 + R")
    tokenizer.next()
    expr = c.compile(tokenizer)
    exec_tree = expr.get_execution_tree()
    print "Expression Tree %s\n" % (exec_tree)
    self.assertEqual(
        "( = ( item_assign ( literal A ) ( index ( + ( + ( literal B ) ( * ( literal C ) ( literal 3.0 ) ) ) ( literal 1.0 ) ) ) ) ( + ( literal 1.0 ) ( literal R ) ) )",
        exec_tree,
    )
def testEvaluateFactors(self):
    c = ExpressionCompiler()
    tokenizer = Tokenizer()
    tokenizer.tokenize("7*7")
    tokenizer.next()
    expr = c.compile(tokenizer)
    result = expr.evaluate()
    print "result = %s\n" % (result)
    self.assertEqual(49.0, result)
    tokenizer.tokenize("7*7/7")
    tokenizer.next()
    expr = c.compile(tokenizer)
    result = expr.evaluate()
    print "result = %s\n" % (result)
    self.assertEqual(7.0, result)
def main(): # first read in the inverted index file parser = argparse.ArgumentParser() parser.add_argument('-index', required=True, help='Path to inverted index file') parser.add_argument('-business', required=False, help='Path to yelp business data json file', default="/course/cs1951a/pub/final/data/extracted/yelp_academic_dataset_business.json") opts = parser.parse_args() # Pre-processing f_index = open(opts.index,'r') print "loading index file..." wordsmap = {} # count = 0 # for line in f_index: # count += 1 # j_obj = json.load(line) # for k, v in j_obj.items(): # wordsmap[k] = v # j_obj = None # if count % 100 == 0: # print count wordsmap = json.load(f_index) print "done" f_index.close() b_map = {} print "loading business file..." f_b = open(opts.business, 'r') line_num = 0 for line in f_b: b_json = json.loads(line) b_map[str(line_num)]={"business_id":b_json['business_id'],"review_count":int(b_json['review_count']), "stars":float(b_json['stars'])} line_num += 1 print "done" tokenizer = Tokenizer() # TODO: need to check error input # Bug: c-d exit situation for line in sys.stdin: result = [] line = line.strip('\n') if len(line)==0: continue elif line[0]=='"': line = line.strip('"') words = tokenizer.process_review(line) result = phrase_query(words, wordsmap) elif len(line.split())==1: words = tokenizer.process_review(line) result = one_word_query(words[0], wordsmap) else: words = tokenizer.process_review(line) result = free_text_query(words, wordsmap) rank_res = rank(words,result,b_map,wordsmap) print rank_res
def __init__(self, string_to_tokenize='', prefix_chars='-=<>!+*&|/%^', suffix_chars='=<>&|'):
    Tokenizer.__init__(self, string_to_tokenize)
    self.prefix = prefix_chars
    self.suffix = suffix_chars

    ### Setup JavaScriptTokenizer-specific regexen
    self.PREFIX = re.compile("[%s]" % self.prefix)
    self.SUFFIX = re.compile("[%s]" % self.suffix)
    self.BEGIN_IDENTIFIER = self.CHARACTER
    self.MULTILINE_COMMENT = re.compile("[\*]")
    self.END_COMMENT = re.compile("[/]")
    self.ESCAPE = re.compile("[\\\\]")
def correct_macro_syntax_test():
    macro_string = """
    !config {
        output: pdf, html
        table_of_contents: true
    }"""
    tokenizer = Tokenizer(macro_string)
    for token in tokenizer:
        if token[0] == "!":
            open_brackets = tokenizer.next()
            if open_brackets != "{":
                raise DMLSyntaxError(open_brackets, "{")
def test_ast_opts(self):
    a = AST()
    t = Tokenizer()
    opts = {}
    opts['get-me'] = 'I am superman'

    a.parse(t.parse('{{ opts.get("get-me") }}'))
    c = a.traverse(opts=opts)
    self.assertEqual(c.buffer, 'I am superman')

    a.parse(t.parse('{@ if opts.get("get-me"): @}I am superman{@ end @}'))
    c = a.traverse(opts=opts)
    self.assertEqual(c.buffer, 'I am superman')
def __init__(self, _what, _who, _when, _where, _why, _how, _text):
    self.what = Tokenizer.removeNonAscii(_what).replace(".\"", ". \"")
    self.who = Tokenizer.removeNonAscii(_who).replace(".\"", ". \"")
    self.when = Tokenizer.removeNonAscii(_when).replace(".\"", ". \"")
    self.where = Tokenizer.removeNonAscii(_where).replace(".\"", ". \"")
    self.why = Tokenizer.removeNonAscii(_why).replace(".\"", ". \"")
    self.how = Tokenizer.removeNonAscii(_how).replace(".\"", ". \"")
    self.text = Tokenizer.removeNonAscii(_text).replace(".\"", ". \"")
    self.sentences = Tokenizer.getSentences(self.text)
    self.tokenized_sentences = [Tokenizer.getTokens(sentence) for sentence in self.sentences]
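Not from the original class: a stdlib-only illustration of what the repeated `.replace(".\"", ". \"")` normalization above does. It inserts a space between a sentence-final period and the closing double quote that follows it, so the sentence splitter can see a boundary there; the example string is made up.

# Illustration of the '."' -> '. "' normalization used above.
raw = 'She left."He stayed.'
normalized = raw.replace('."', '. "')
print(normalized)  # She left. "He stayed.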
def analyze(string):
    scanner = Tokenizer()
    list_of_tokens = scanner.tokenize(string)
    print "-------------"
    print "TOKEN LIST:"
    print list_of_tokens
    parser = QueryParser()
    print "----------------"
    print "PARSING RESULT"
    print "----------------"
    print parser.parse(list_of_tokens)
    semparser = QuerySemanticParser(parser.parse(list_of_tokens))
    semparser.parse()
def __init__( self, string_to_tokenize = '' ): Tokenizer.__init__( self, string_to_tokenize ) ### Setup CSSTokenizer-specific regexen ### Throwing everything away after reading through the CSS spec. ### I ought be using the specified tokens, so I will. # IDENT {ident} # ATKEYWORD @{ident} # STRING {string} # INVALID {invalid} # HASH #{name} # NUMBER {num} # PERCENTAGE {num}% # DIMENSION {num}{ident} # URI url\({w}{string}{w}\) # |url\({w}([!#$%&*-~]|{nonascii}|{escape})*{w}\) # UNICODE-RANGE U\+[0-9a-f?]{1,6}(-[0-9a-f]{1,6})? # CDO <!-- # CDC --> # ; ; # { \{ # } \} # ( \( # ) \) # [ \[ # ] \] # S [ \t\r\n\f]+ # COMMENT \/\*[^*]*\*+([^/*][^*]*\*+)*\/ # FUNCTION {ident}\( # INCLUDES ~= # DASHMATCH |= # DELIM any other character not matched by the above rules, and neither a single nor a double quote # # # ident [-]?{nmstart}{nmchar}* # name {nmchar}+ # nmstart [_a-z]|{nonascii}|{escape} # nonascii [^\0-\177] # unicode \\[0-9a-f]{1,6}(\r\n|[ \n\r\t\f])? # escape {unicode}|\\[^\n\r\f0-9a-f] # nmchar [_a-z0-9-]|{nonascii}|{escape} # num [0-9]+|[0-9]*\.[0-9]+ # string {string1}|{string2} # string1 \"([^\n\r\f\\"]|\\{nl}|{escape})*\" # string2 \'([^\n\r\f\\']|\\{nl}|{escape})*\' # invalid {invalid1}|{invalid2} # invalid1 \"([^\n\r\f\\"]|\\{nl}|{escape})* # invalid2 \'([^\n\r\f\\']|\\{nl}|{escape})* # nl \n|\r\n|\r|\f # w [ \t\r\n\f]*
def testEvaluateNegation(self):
    c = ExpressionCompiler()
    tokenizer = Tokenizer()
    tokenizer.tokenize("not 0")
    tokenizer.next()
    expr = c.compile(tokenizer)
    result = expr.evaluate()
    print "result = %s\n" % (result)
    self.assertEqual(1, result)
def interpretStatement(self):
    tokens = Tokenizer(self.IR)
    instr = tokens.next().lower()
    stmt = ""
    while tokens.peek() is not None:
        stmt += tokens.next()
    if instr[0] == 's':
        self.interpretSet(stmt)
    elif instr[0] == 'j':
        if len(instr) == 5:
            self.interpretJumpt(stmt)
        elif len(instr) == 4:
            self.interpretJump(stmt)
    elif instr[0] == 'h':
        self.halt(tokens)
class Parser(object): def __init__(self, stmt): # We always wrap with ()'s self.tnz = Tokenizer('(' + stmt + ')') def pop(self): return self.tnz.pop() def peek(self): return self.tnz.peek() def top(self): return self.tnz.top() def parse(self, indent=0): indent = deepcopy(indent) indent += 1 if istype(self.top(), 'Lparen'): self.pop() # Open paren n = self.parse(indent) cp = self.pop() # Close paren if istype(self.top(), 'Bop'): bopr = Node(self.pop(), indent) bopr.l_child = n bopr.r_child = self.parse(indent) return bopr else: return n if istype(self.top(), 'Term'): if istype(self.peek(), 'Bop'): t1 = Node(self.pop(), indent) bopr = Node(self.pop(), indent) bopr.l_child = t1 if istype(self.top(), 'Term'): bopr.r_child = self.parse(indent) elif istype(self.top(), 'Lparen'): bopr.r_child = self.parse(indent) else: raise SyntaxError("Expected Term or (") return bopr elif istype(self.peek(), 'Rparen'): t1 = Node(self.pop(), indent) return t1 elif istype(self.peek(), 'Term'): t1 = Node(self.pop(), indent) return t1 else: raise SyntaxError("Expecting term or (")
def _classify(self, tokens, languages):
    """
    Internal: Guess language of data

    data      - Array of tokens or String data to analyze.
    languages - Array of language name Strings to restrict to.

    Returns sorted Array of result pairs. Each pair contains the String
    language name and a Float score.
    """
    if tokens is None:
        return []
    if isinstance(tokens, basestring):
        tokens = Tokenizer.tokenize(tokens)

    scores = {}
    if self.verbosity >= 2:
        self.dump_all_tokens(tokens, languages)
    for language in languages:
        scores[language] = self.tokens_probability(tokens, language) + self.language_probability(language)
        if self.verbosity >= 1:
            print '%10s = %10.3f + %7.3f = %10.3f\n' % (
                language,
                self.tokens_probability(tokens, language),
                self.language_probability(language),
                scores[language])

    return sorted(scores.iteritems(), key=lambda t: t[1], reverse=True)
def train(cls, db, language, data): """ Set LINGUIST_DEBUG=1 or =2 to see probabilities per-token, per-language. See also dump_all_tokens, below. Public: Train classifier that data is a certain language. db - Hash classifier database object language - String language of data data - String contents of file Examples Classifier.train(db, 'Ruby', "def hello; end") Returns nothing. """ tokens = Tokenizer.tokenize(data) db['tokens_total'] = db.get('tokens_total', 0) db['languages_total'] = db.get('languages_total', 0) db['tokens'] = db.get('tokens', {}) db['language_tokens'] = db.get('language_tokens', {}) db['languages'] = db.get('languages', {}) for token in tokens: db['tokens'][language] = db['tokens'].get(language, {}) db['tokens'][language][token] = db['tokens'][language].get(token, 0) db['tokens'][language][token] += 1 db['language_tokens'][language] = db['language_tokens'].get(language, 0) db['language_tokens'][language] += 1 db['tokens_total'] += 1 db['languages'][language] = db['languages'].get(language, 0) db['languages'][language] += 1 db['languages_total'] += 1
def getOtherTaggedText(info): taggedtext = TextMarker.getTaggedText(info) # print taggedtext # print '' btags2 = ['B_WHAT', 'B_WHO', 'B_WHEN', 'B_WHERE', 'B_WHY', 'B_HOW'] etags2 = ['E_WHAT', 'E_WHO', 'E_WHEN', 'E_WHERE', 'E_WHY', 'E_HOW'] for i, tag in enumerate(btags2): taggedtext = taggedtext.replace(TextMarker.btags[i], tag) for i, tag in enumerate(etags2): taggedtext = taggedtext.replace(TextMarker.etags[i], tag) text = "" state = 0 for token in Tokenizer.getTokens(taggedtext): if (reduce( (lambda x, y: x or y), list(map((lambda x: x in token), btags2)) )): state += len([item for item in list(map((lambda x: x in token), btags2)) if item]) if (state==0): # print "%s\t%s" % (state, TextMarker.othertags[0] + token + TextMarker.othertags[1]) text += TextMarker.othertags[0] + token + TextMarker.othertags[1] else: # print "%s\t%s" % (state, token) text += token + " " if (reduce( (lambda x, y: x or y), list(map((lambda x: x in token), etags2)) )): state -= len([item for item in list(map((lambda x: x in token), etags2)) if item]) for i, tag in enumerate(TextMarker.btags): text = text.replace(btags2[i], tag) for i, tag in enumerate(TextMarker.etags): text = text.replace(etags2[i], tag) return text
class Preprocesser: def __init__(self, lower=True, punctuation=True, digits=True, stop=True, min_length=3, pos_tag=False, lemmatization=True): self.lemma = lemmatization self.pos_tag = pos_tag self.tokenizer = Tokenizer(lower, punctuation, digits) self.token_filter = TokenFilter(stop, min_length) if pos_tag or lemmatization: self.postagger = Postagger() print dir(self.postagger) if lemmatization: self.Lemmatizer = Lemmatizer() def process(self, text): words = self.tokenizer.tokenize(text) words = self.token_filter.filter(words) if self.lemma: tags = self.postagger.tags2lemmatags(self.postagger.tags(words)) result = self.Lemmatizer.lemma(words, tags) if self.pos_tag: tags = self.postagger.tags(words) result = tags return result
def train(cls, db, language, data): """ Public: Train classifier that data is a certain language. db - Hash classifier database object language - String language of data data - String contents of file Examples Classifier.train(db, 'Ruby', "def hello; end") Returns nothing. """ tokens = Tokenizer.tokenize(data) db['tokens_total'] = db.get('tokens_total', 0) db['languages_total'] = db.get('languages_total', 0) db['tokens'] = db.get('tokens', {}) db['language_tokens'] = db.get('language_tokens', {}) db['languages'] = db.get('languages', {}) for token in tokens: db['tokens'][language] = db['tokens'].get(language, {}) db['tokens'][language][token] = db['tokens'][language].get(token, 0) db['tokens'][language][token] += 1 db['language_tokens'][language] = db['language_tokens'].get(language, 0) db['language_tokens'][language] += 1 db['tokens_total'] += 1 db['languages'][language] = db['languages'].get(language, 0) db['languages'][language] += 1 db['languages_total'] += 1
def __init__(self, expression):
    """Initialize the parser by generating the token sequence"""
    self.sc = Scanner(expression)
    self.tok = Tokenizer(self.sc)
    self.tokens = None
    self.tokens = self.get_token_sequence()
    self.root = None
def simple_english_tokenizer(tokenizer=None):
    if not tokenizer:
        tokenizer = Tokenizer()

    word = tokenizer.type['Word'] = RegexTokenType(r'(\w+\'\w+|\w+)', priority=0)
    punctuation = tokenizer.type['Punctuation'] = RegexTokenType(r'([^\w\s%s]+)', priority=1)

    tokenizer.joins = {
        (punctuation, word, '\''): '',
        (punctuation, word): ' ',
        (punctuation, punctuation): '',
        (word, word): ' ',
        None: ''
    }

    return tokenizer
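Not part of the original function: a stdlib-only, hypothetical sketch of how a joins table like the one above might be consulted when gluing tokens back together. The `'word'`/`'punct'` string tags, the `(prev_type, next_type, prev_text)` key shape, and the most-specific-key-first lookup order are assumptions made for illustration, not the library's actual semantics.

# Hypothetical sketch of consulting a joins table when reassembling tokens.
joins = {
    ('word', 'word'): ' ',
    ('punct', 'word'): ' ',
    ('punct', 'word', "'"): '',
    ('punct', 'punct'): '',
    None: '',
}

def rejoin(tagged_tokens):
    # Join (type, text) pairs, preferring the most specific rule available.
    out = ''
    prev_type = prev_text = None
    for ttype, text in tagged_tokens:
        if prev_type is not None:
            sep = joins.get((prev_type, ttype, prev_text),
                            joins.get((prev_type, ttype), joins[None]))
            out += sep
        out += text
        prev_type, prev_text = ttype, text
    return out

print(rejoin([('word', 'Hello'), ('punct', ','), ('word', 'world'), ('punct', '!')]))
# Hello, world!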
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-review_file', required=True, help='Path to review data')
    parser.add_argument('-business_file', required=True, help='Path to business data')
    parser.add_argument('-output', required=True, help='Path to output index file')
    opts = parser.parse_args()

    f_reviews = open(opts.review_file, 'r')
    f_business = open(opts.business_file, 'r')

    # Map each business_id to the line number it occupies in the business file.
    line_num = 0
    b_map = {}
    for line in f_business:
        b_obj = json.loads(line)
        b_map[b_obj['business_id']] = line_num
        line_num += 1

    tokenizer = Tokenizer()

    # wordsmap: word -> business line id -> review line id -> list of positions
    wordsmap = {}
    line_num = 0
    for line in f_reviews:
        r = json.loads(line)
        words = tokenizer.process_review(r['text'])
        w_idx = 0
        for w in words:
            if w == "":
                continue
            b_id = b_map[r['business_id']]
            if w in wordsmap:
                if b_id in wordsmap[w]:
                    # Use a separate name here; reusing b_map would clobber the
                    # business-id lookup table built above.
                    line_map = wordsmap[w][b_id]
                    if line_num in line_map:
                        line_map[line_num].append(w_idx)
                    else:
                        line_map[line_num] = [w_idx]
                else:
                    wordsmap[w][b_id] = {line_num: [w_idx]}
            else:
                wordsmap[w] = {b_id: {line_num: [w_idx]}}
            w_idx += 1
        line_num += 1
        if line_num % 1000 == 0:
            print line_num
        # if line_num == 1000:
        #     break

    with open(opts.output, 'w') as f_out:
        json.dump(wordsmap, f_out)
class InvertedIndex(): def __init__(self): self.invertedindex = {} self.lexicon = Lexicon() self.tokenizer = Tokenizer() self.doc_reader = DocReader() self.build_index() def build_index(self): #comments? cache = self.doc_reader.get_cache() docs = self.doc_reader.read_docs(cache) print "\nINVERTEDINDEX : Indexing %d documents..\n" % len(docs) for d in range(len(docs)): print "Indexing document '%s'" % (settings.PATH_DOCS + str(d)) self.add_document(docs[d], d) print "Indexed total %d unique terms" % self.lexicon.size() def get_postinglist(self, lex_id): return self.invertedindex[lex_id] def add_document(self, doc, document_id): """FIXME: -Needs doc -Too slow? -Remove stop words -Reduce number of tokens """ tokens = self.tokenizer.tokenize(doc) for t in tokens: lex_id = self.lexicon.lookup(t.get_value()) if(lex_id == settings.INVALID): lex_id = self.lexicon.add_value(t.get_value()) pl = PostingList() pl.append_posting(Posting(document_id, t.get_position())) self.invertedindex[lex_id] = pl else: pl = self.get_postinglist(lex_id) if pl.get_last_posting().get_document_id() != document_id: pl.append_posting(Posting(document_id, t.get_position())) else: p = pl.get_last_posting() p.append_position(t.get_position()) def size(self): return len(self.invertedindex) def debugprint(self): voc = self.lexicon.get_vocabulary() for v in voc: lid = self.lexicon.lookup(v) pl = self.get_postinglist(lid) print "[%s]" % v pl.info()
def process(a, s):
    infilename = a['src_filename']
    outfilename = Tokenizer.batch_tokenise(config['src_lang'],
                                           config['moses_installation_dir'],
                                           infilename,
                                           config['src_tokenisation_dir'])
    return {'tokenised_src_filename': outfilename}
def rsd2ltf(rsd_str, doc_id, seg_option='linebreak', tok_option='unitok', re_segment=False): tokenizer = Tokenizer(seg_option, tok_option) if re_segment: # running segmentation and tokenization, then re-segment the tokenized # sentences (use space to concatenate tokens. this solves segmentation # problem, e.g. How are you?I'm fine.). # print('=> running segmentation...') sents = tokenizer.run_segmenter(rsd_str) # print('=> running tokenization...') raw_tokens = tokenizer.run_tokenizer(sents) # re-segment tokenized sentence num_sent_reseg = 0 tokens = [] for i, t in enumerate(raw_tokens): reseg = [ item.split() for item in tokenizer.run_segmenter(' '.join(t)) ] if len(reseg) > 1: num_sent_reseg += 1 tokens += reseg # compute offset for each token indexer = 0 token_offset = [] for i, t in enumerate(itertools.chain(*tokens)): while not rsd_str[indexer:].startswith(t) and \ indexer < len(rsd_str): indexer += 1 if indexer < len(rsd_str): t_start = indexer t_end = t_start + len(t) - 1 assert rsd_str[t_start:t_end + 1] == t, \ "re_segment token offset not match %s-%d" % (doc_id, i) token_offset.append((t_start, t_end)) indexer = t_end + 1 assert len(token_offset) == len(list(itertools.chain(*tokens))), \ "re_segment tokenization offset error in: %s" % doc_id # recover sent using tokens sents = [] prev_token_end = token_offset[0][0] - 1 token_index = 0 for i, t in enumerate(tokens): sent = '' for j, item in enumerate(t): if j == 0: prev_token_end = token_offset[token_index][0] - 1 sent += ' ' * (token_offset[token_index][0] - prev_token_end - 1) + item prev_token_end = token_offset[token_index][1] token_index += 1 assert sent in rsd_str, \ 're_segment sentence offset error.' sents.append(sent) else: # running segmentation and tokenization # print('=> running segmentation...') sents = tokenizer.run_segmenter(rsd_str) # print('=> running tokenization...') tokens = tokenizer.run_tokenizer(sents) # generate offset for sentences and tokens # print('=> generating offset...') indexer = 0 sent_offset = [] for i, s in enumerate(sents): while not rsd_str[indexer:].startswith(s) and indexer < len(rsd_str): indexer += 1 if indexer < len(rsd_str): sent_start = indexer sent_end = sent_start + len(s) - 1 assert rsd_str[sent_start:sent_end+1] == s, \ "sentence offset not match %s-%d" % (doc_id, i) sent_offset.append((sent_start, sent_end)) indexer = sent_end + 1 assert len(sent_offset) == len(sents), \ "sentence segmentation offset error in: %s" % doc_id token_offsets = [] for i, tok in enumerate(tokens): sent_text = sents[i] indexer = 0 t_offset = [] for j, t in enumerate(tok): while not sent_text[indexer:].startswith(t) and \ indexer < len(sent_text): indexer += 1 if indexer < len(sent_text): t_start = indexer t_end = t_start + len(t) - 1 assert sent_text[t_start:t_end+1] == t, \ "token offset not match %s-%d-%d" % (doc_id, i, j) t_offset.append((t_start, t_end)) indexer = t_end + 1 token_offsets.append(t_offset) assert len(t_offset) == len(tok), \ "tokenization offset error in: %s-%d" % (doc_id, i) # convert seg/tok result to ltf root = ET.Element('LCTL_TEXT') doc_element = ET.Element('DOC', {'id': doc_id}) text_element = ET.Element('TEXT') root.append(doc_element) doc_element.append(text_element) for i in range(len(sents)): seg_text = sents[i] seg_start_char = sent_offset[i][0] seg_end_char = sent_offset[i][1] seg_id = '%s-%s' % (doc_id, str(i)) seg_element = ET.Element( 'SEG', { 'id': seg_id, 'start_char': str(seg_start_char), 'end_char': str(seg_end_char) }) original_text_element = 
ET.Element('ORIGINAL_TEXT') original_text_element.text = seg_text seg_element.append(original_text_element) for j in range(len(tokens[i])): token_id = 'token-%d-%d' % (i, j) tok_text = tokens[i][j] if not tok_text: continue tok_start_char = int(token_offsets[i][j][0]) + seg_start_char tok_end_char = int(token_offsets[i][j][1]) + seg_start_char assert rsd_str[tok_start_char:tok_end_char + 1] == tok_text token_element = ET.Element( 'TOKEN', { 'id': token_id, 'start_char': str(tok_start_char), 'end_char': str(tok_end_char) }) token_element.text = tok_text seg_element.append(token_element) text_element.append(seg_element) return root
from collections import Counter
import numpy as np

sentences = np.genfromtxt('../upsampled/x_QIT.txt', delimiter='\n', dtype=str)
language = 'italian'
max_words = None
max_length = 25

# Text preprocessor with no functionalities whatsoever
prep = TextPreprocessor(sentences)
# Add decorator to clean email bodies
prep = QITEmailBodyCleaner(prep)
# Add tokenizer decorator
prep = Tokenizer(prep, language)

# Load vocabulary
with open('vocabulary_wikipedia', 'r') as vocabulary_file:
    vocabulary = eval(vocabulary_file.read())

# Add integer encoding decorator
unknown_token_id = max(vocabulary.values()) + 1
prep = IntegerEncoder(prep, vocabulary, unknown_token_id)
# Add padding decorator
padding_token_id = max(vocabulary.values()) + 2
prep = Padder(prep, padding_token_id, max_length)

# Get final tokens
final_tokens = prep.preprocess()
class TokenizerTest(unittest.TestCase): """Unit test case suite for our tokenizers in our Tokenizer class.""" def setUp(self): """General setup for configuration files.""" # configuration for human readable Tokenizer human_readable_config = config_pb2.Config() human_readable_config.clusterer.tokenizer.token_min_length = 2 human_readable_config.clusterer.tokenizer.mode = config_pb2.Tokenizer.TokenizerMode.HUMAN_READABLE human_readable_config.clusterer.tokenizer.split_on.extend(['=']) human_readable_config.clusterer.tokenizer.punctuation.extend( [':', '/', '\n', '\t']) self.human_readable_tokenizer = Tokenizer(human_readable_config) # configuration for stack trace Tokenizer stack_trace_config = config_pb2.Config() stack_trace_config.clusterer.tokenizer.token_min_length = 0 stack_trace_config.clusterer.tokenizer.mode = config_pb2.Tokenizer.TokenizerMode.STACK_TRACE_LINES self.stack_trace_tokenizer = Tokenizer(stack_trace_config) ignore_test_config = config_pb2.Config() ignore_test_config.clusterer.tokenizer.token_min_length = 2 ignore_test_config.clusterer.tokenizer.mode = config_pb2.Tokenizer.TokenizerMode.HUMAN_READABLE ignore_test_config.clusterer.tokenizer.ignore_token_matcher.extend( ['uselessInfo']) self.ignore_test_config = Tokenizer(ignore_test_config) super(TokenizerTest, self).setUp() def test_human_readable_tokenizer(self): """Test suite for human_readable_tokenizer.""" # our tokenizer gets rid of sequences of numbers and keeps 'words' simple_string = 'subscription id 11444512 failed because it was cancelled' simple_tokens = [ 'subscription', 'id', 'failed', 'because', 'it', 'was', 'cancelled' ] self.assertEqual( self.human_readable_tokenizer.human_readable_tokenizer(simple_string), simple_tokens) # our configured tokenizer also splits on '=', # extracting subscription=1114125 to subscription, # 1114125 the later of which is removed extra_split_test = 'subscription=1114125 failed because of id=1124125 from client=STADIA' split_tokens = [ 'subscription', 'failed', 'because', 'of', 'id', 'from', 'client', 'stadia' ] self.assertEqual( self.human_readable_tokenizer.human_readable_tokenizer( extra_split_test), split_tokens) # Extracting useful text & removing stack lines test stack_trace = open('testdata/tokenizer/human_readable_trace.txt').read() stack_trace_tokens = [ 'some', 'hopefully', 'useful', 'english', 'text', 'here' ] self.assertEqual( self.human_readable_tokenizer.human_readable_tokenizer(stack_trace), stack_trace_tokens) def test_stack_trace_line_tokenizer(self): """Test suite for stack_trace_line_tokenizer.""" # example stack trace we would want to extract from sample_stack_trace = open( 'testdata/tokenizer/sample_stack_trace.txt').read() sample_extracted_lines = [ 'com.google.moneta.purchaseorder.monetizer.PurchaseOrderUsageTransaction.lambda$getMovementCode$0', 'java.util.Optional.orElseThrow', 'com.google.moneta.purchaseorder.monetizer.PurchaseOrderUsageTransaction.getMovementCode', 'com.google.moneta.purchaseorder.monetizer.PurchaseOrderUsageTransaction.createRevenueMovement', 'com.google.moneta.purchaseorder.monetizer.PurchaseOrderUsageTransaction.addLineItem', 'com.google.moneta.purchaseorder.monetizer.PurchaseOrderUsageTransaction.addAllLineItems', 'com.google.moneta.purchaseorder.service.purchaseorder.purchaseorderinternal.ChargeAction.charge' ] self.assertEqual( self.stack_trace_tokenizer.stack_trace_line_tokenizer( sample_stack_trace), sample_extracted_lines) def test_token_ignore(self): """Test suite to test functionality of ignoring specific tokens.""" 
sample_string = 'this is useful info, but this is uselessInfo' sample_tokens = ['this', 'is', 'useful', 'info', 'but', 'this', 'is'] self.assertEqual( self.ignore_test_config.human_readable_tokenizer(sample_string), sample_tokens)
def execute_system(p_datafilesdir, p_tokenizefiles=False, p_createindex=True, p_createsnippets=False): print "Executing Task 3A..." startTime = time.time() ldatafilesdir = CACM_DATA if is_string_valid(p_datafilesdir): ldatafilesdir = p_datafilesdir # Variable for no of documents NoOfDocuments = get_no_of_files_in_dir(ldatafilesdir) # Create output directory create_directory(DIR_FOR_OUTPUT_FILES) ldirpathfortask = DIR_FOR_OUTPUT_FILES + "/" + TASK3A_CONST create_directory(ldirpathfortask) ltokenizedfilesdir = DIR_FOR_OUTPUT_FILES + "/" + DIR_FOR_TOKENIZED_FILES if p_tokenizefiles: t1 = time.time() # create tokenizer and generate token documents ltokenizer = Tokenizer() ltokenizer.setTokenizedFilesOutputDir(ltokenizedfilesdir) ltokenizer.tokenizedir(ldatafilesdir) t2 = time.time() # print "Time for Tokenizer Module: " + str(t2-t1) if p_createindex: t1 = time.time() # create instance of Indexer class and create indexes lindexer = Indexer() lindexer.set_tokenized_files_dir(ltokenizedfilesdir) lindexer.setOutputDirectory(TASK3A_CONST) lindexer.startIndexing(True, False) lindexer.printAll() t2 = time.time() # print "Time for Indexer Module: " + str(t2-t1) # Convert query list to query dict lquerydict = get_given_queries_in_dict(CACM_QUERY_FILE + FILE_EXT) lquerydict = get_sorted_dict(lquerydict) t1 = time.time() lrm = RMBase() # Set the no. of documents with the retrieval module lindexfilename = ldirpathfortask + "/" + FILE_FOR_STOPPED_INDEX + FILE_EXT ldocfreqtableforunigramfilename = ldirpathfortask + "/" + FILE_FOR_DOC_FREQ_TABLE + CONSTS_FOR_UNIGRAM + FILE_EXT lwordcountsbyfilefilename = ldirpathfortask + "/" + FILE_FOR_WORD_COUNTS_BY_FILE_FOR + FILE_FOR_STOPPED_INDEX + FILE_EXT lrm.setNoOfDocuments(NoOfDocuments) lrm.setOutputDirectory(TASK3A_CONST) lrm.setCanUseRelevanceInfo(True) lrm.setIndexFileName(lindexfilename) lrm.setDocFreqDictFileName(ldocfreqtableforunigramfilename) lrm.setWordCountsByFileDictFileName(lwordcountsbyfilefilename) lrm.initializeRM() # Process all the queries for the retrieval module lrm.processQueriesFromFile(lquerydict, True) t2 = time.time() # print "Time for Retrieval Module: " + str(t2-t1) endTime = time.time() print "Task 3A execution completed in " + str(endTime - startTime) if p_createsnippets: # Generate snippets for BM25 output generate_snippet(CACM_QUERY_FILE + FILE_EXT, ldirpathfortask + "/" + DIR_FOR_BM25_OUTPUT, ldirpathfortask + "/" + SNIPPET_GEN_RESULTS_FOLDER + CONST_FOR_BM25) # Generate snippets for TF-IDF output generate_snippet(CACM_QUERY_FILE + FILE_EXT, ldirpathfortask + "/" + DIR_FOR_TFIDF_OUTPUT, ldirpathfortask + "/" + SNIPPET_GEN_RESULTS_FOLDER + CONST_FOR_TFIDF)
    acc = average_precision_score(y_test, y_)
    rec = recall_score(y_test, y_)
    f1 = f1_score(y_test, y_)
    print('----evaluation done----')
    return acc, rec, f1


if __name__ == '__main__':
    # Positive and negative reviews, pre-tokenized with stopwords removed, 5000 of each
    with open('words_pos') as words_pos, open('words_neg') as words_neg:
        pos, neg = words_pos.readlines(), words_neg.readlines()
    stopwords = [
        word.strip() for word in open('chinese_stopwords.txt').readlines()
    ]
    tokenizer = Tokenizer(stopwords)
    # Wrapped naive Bayes classifier; its argument is the corpus used to build the vocabulary.
    # Training and test data are combined here so the test set contains no out-of-vocabulary words.
    naive_bayes = NaiveBayes(tokenizer, pos + neg)
    # The test set keeps 500 positive and 500 negative reviews
    x_test = pos[-500:] + neg[-500:]
    y_test = [1] * 500 + [0] * 500
    # Everything else is used for training
    x_train = pos[:4500] + neg[:4500]
    y_train = [1] * 4500 + [0] * 4500
    naive_bayes.fit(x_train, y_train)
    # 3-fold cross-validation was considered, but with only 4500 training examples each fold
    # would leave just 3000 for training, which hurt test-set performance, so it was not used.
    # P, R, F1 = [], [], []
def tokenize(self):
    tokenizer = Tokenizer()
    self.tokens = tokenizer.tokenize(self.source)
def fit(self, X_train, y_train, X_val=None, y_val=None): if X_val is None or y_val is None: pass self._tokenizer = Tokenizer(mindf=self.mindf, lan=self.lan, stopwordsSet=self.stopwords, model='sample', k=self.k, verbose=self._verbose) self._tokenizer.fit(X_train, y_train) self.maxF = int(round(np.log2(self._tokenizer.maxF + 1))) self._model = AttentionTFIDF(vocab_size=self._tokenizer.vocab_size, hiddens=self.hiddens, nclass=self._tokenizer.n_class, maxF=self.maxF, drop=self.max_drop).to(self._device) optimizer = optim.AdamW(self._model.parameters(), lr=self.lr, weight_decay=self.weight_decay) loss_func_cel = nn.CrossEntropyLoss().to(self._device) scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( optimizer, mode='min', factor=self.factor, patience=3, verbose=self._verbose) best = 99999. best_acc = 0. counter = 1 dl_val = DataLoader(list(zip(X_val, y_val)), batch_size=self.batch_size, shuffle=False, collate_fn=self.collate_train, num_workers=self.n_jobs) for e in tqdm(range(self.nepochs), total=self.nepochs, disable=not self._verbose): dl_train = DataLoader(list(zip(X_train, y_train)), batch_size=self.batch_size, shuffle=True, collate_fn=self.collate_train, num_workers=self.n_jobs) loss_train = 0. with tqdm(total=len(y_train) + len(y_val), smoothing=0., desc=f"ACC_val: {best_acc:.2} Epoch {e+1}", disable=not self._verbose) as pbar: total = 0 correct = 0 self._model.train() self._tokenizer.model = 'sample' for i, (doc_tids, TFs, DFs, y) in enumerate(dl_train): doc_tids = doc_tids.to(self._device) TFs = TFs.to(self._device) DFs = DFs.to(self._device) y = y.to(self._device) pred_docs, _, _ = self._model(doc_tids, TFs, DFs) pred_docs = torch.softmax(pred_docs, dim=1) loss = loss_func_cel(pred_docs, y) optimizer.zero_grad() loss.backward() optimizer.step() loss_train += loss.item() total += len(y) y_pred = pred_docs.argmax(axis=1) correct += (y_pred == y).sum().item() self._model.drop_ = (correct / total) * self.max_drop pbar.update(len(y)) del doc_tids, TFs del DFs, y, pred_docs del loss, y_pred loss_train = loss_train / (i + 1) total = 0 correct = 0 self._model.eval() self._tokenizer.model = 'topk' with torch.no_grad(): loss_val = 0. for i, (doc_tids, TFs, DFs, y) in enumerate(dl_val): doc_tids = doc_tids.to(self._device) TFs = TFs.to(self._device) DFs = DFs.to(self._device) y = y.to(self._device) pred_docs, _, _ = self._model(doc_tids, TFs, DFs) pred_docs = torch.softmax(pred_docs, dim=1) loss = loss_func_cel(pred_docs, y) loss_val += loss.item() total += len(y) y_pred = pred_docs.argmax(axis=1) correct += (y_pred == y).sum().item() pbar.update(len(y)) loss_val del doc_tids, TFs, DFs, y del pred_docs, loss loss_val = (loss_val / (i + 1)) scheduler.step(loss_val) if best - loss_val > 0.0001: best = loss_val counter = 1 best_acc = correct / total best_model = copy.deepcopy(self._model).to('cpu') elif counter > self.patience: break else: counter += 1 self._model = best_model.to(self._device) self._loss = best self._acc = best_acc return self
save_path = "output.csv" # initialize objects print('Initializing objects ...') print('Initializing word embeddings ...') t1 = time.time() # /media/reza/book/dataset/word2vec/GoogleNews-vectors-negative300.bin # word_embeddings = WordEmbeddings("/media/reza/book/dataset/word2vec/GoogleNews-vectors-negative300.bin") word_embeddings = loadWordModel("E:\\dataset\\glove\\glove.6B.50d.txt") # /media/reza/book/Py_Projects/Lample2016-tagger-master/model_tag2vec.txt pos_embeddings = loadWordModel( "E:\\Py_Projects\\Lample2016-tagger-master\\model_tag2vec.txt") t2 = time.time() print('\tTook %f seconds' % (t2 - t1)) print('Initializing tokenizer ...') tokenizer = Tokenizer() print('Initializing vectorizer ...') vectorizer = Vectorizer(word_embeddings, tokenizer) vectorizer_pos = VectorizerPosTags(pos_embeddings) #### training dataset #### # vectorizing ids, train_a_vectors, train_b_vectors, train_gold = vectorizer.vectorize_df( train_df) train_a_pos_vectors, train_b_pos_vectors = vectorizer_pos.vectorize_sentence_pos_df( train_df) train_max_a_length = len(max(train_a_vectors, key=len)) train_max_b_length = len(max(train_b_vectors, key=len)) print('maximum number of tokens per sentence A in training set is %d' % train_max_a_length)
def convertToLaTeX(string):
    string = transform_environment(string)
    tokenizer = Tokenizer(scanner=Scanner(string))
    parser = Parser(tokenizer=tokenizer)
    res = str(parser.parseCode())
    return res
from tokenizer import Tokenizer
from scorer import WordOverlappingScorer, EmbeddingBasedScorer

if __name__ == "__main__":
    # Build the tokenizer
    print("Building tokenizer")
    tokenizer = Tokenizer()

    # Process sentence pairs
    hyps = [
        "映画を見ますか", "映画はどんなのを見ますか", "映画はどれくらい見ますか",
        "ゴルフは見ますか", "サッカーは見ますか", "ゴルフで好きな選手はいますか"
    ]
    refs = ["オリンピックは見ますか"] * len(hyps)
    hyps = [tokenizer.tokenize(sent) for sent in hyps]
    refs = [tokenizer.tokenize(sent) for sent in refs]

    # You can pass `vocab` as an argument to EmbeddingBasedScorer() to load in-vocab words only,
    # which speeds up embedding loading. If `vocab` is None (the default), all the embeddings
    # will be loaded, which takes longer.
    vocab = set()
    for sent in hyps + refs:
        for token in sent:
            vocab.add(token)

    # Build scorers
    print("Building scorer")
    word_overlap_scorer = WordOverlappingScorer()
    embedding_based_scorer = EmbeddingBasedScorer(vocab=vocab)

    # Calculate similarities between sentence pairs
def _encode(t: Tokenizer, e: 'TermEncoder', s: str) -> EncodedTerm:
    return e(t.toTokenList(s))
from tokenizer import Tokenizer
from parser2 import Parser

source_code = ''
with open('examples/main.stp') as stp:
    source_code = stp.read(1024)

tokenizer = Tokenizer(source_code, True)
parser = Parser(tokenizer)
syntax_tree = parser.parse()

if parser.current_level != 0:
    raise Exception('brackets error')

print(syntax_tree)
import os
from tokenizer import Tokenizer

# directory where the C++ files are
dirname = "C++/"

# directories inside the C++ directory
for f in os.listdir(dirname):
    dirnameone = dirname + f + "/"
    dirlist = os.listdir(dirnameone)
    # individual files
    for indfiles in dirlist:
        # complete file path used for tokenization
        indtoken = dirnameone + indfiles
        tok = Tokenizer(indtoken)
        entire_token_stream = tok.full_tokenize()
        print(entire_token_stream)
class LMFluencyFilter: def __init__(self, lm_type: LMType, language: str, tokenizer_command): """ lm_type: LMType language: language code tokenizer_command: tokenizer full command (with flags if needed) """ self.language = language self.tokenizer = Tokenizer(tokenizer_command, self.language) self.normalizer = MosesPunctNormalizer(lang=self.language) self.type = lm_type @classmethod def _ispunctuation(cls, t): return all(not c.isalnum() for c in t) @classmethod def _replace_placeholder(cls, t): if t.isalpha(): unicodeGroup = UnicodeWordClassifier.classify_word(t) if t.islower(): return "TOKEN:ALPHA:LOWER:" + unicodeGroup elif t.istitle(): return "TOKEN:ALPHA:TITLE:" + unicodeGroup elif t.isupper(): return "TOKEN:ALPHA:UPPER:" + unicodeGroup else: return "TOKEN:ALPHA:MIXED:" + unicodeGroup else: if t.isnumeric(): return "TOKEN:NUMERIC" elif cls._ispunctuation(t): return t else: return "TOKEN:MIXED" @classmethod def _estimate_kenlm(cls, corpus: str, lm_file: str, params: str): output = subprocess.run("lmplz " + params + " < " + corpus + " > " + lm_file + ".arpa", shell=True, stderr=PIPE, stdout=PIPE) logging.debug(output.stderr.decode()) logging.debug(output.stdout.decode()) output = subprocess.run("build_binary " + lm_file + ".arpa " + lm_file, shell=True, stderr=PIPE, stdout=PIPE) logging.debug(output.stderr.decode()) logging.debug(output.stdout.decode()) def load_lm(self, lm_path: str): self.lm_path = lm_path self.lm = kenlm.LanguageModel(self.lm_path) # def _sentence_split(self,sentence:str): # return self.splitter([sentence]) def _tokenize(self, sentence): sentence = self.normalizer.normalize(sentence) if self.type != LMType.CHARACTER: tokline = " ".join(self.tokenizer.tokenize(sentence)) else: tokline = " ".join(["SPACE" if c == " " else c for c in sentence]) return tokline def _introduce_placeholders(self, sentence): if self.type != LMType.PLACEHOLDER: return sentence else: toks = self._replace_placeholder(sentence) return " ".join(toks) def train_lm(self, text_path: str): tokenized_f = NamedTemporaryFile("w", delete=False) placeholderized_f = NamedTemporaryFile("w", delete=False) #Tokenize text with open(text_path) as input_f: for line in input_f: #line=line.rstrip("\n") tokline = self._tokenize(line) tokenized_f.write(tokline) tokenized_f.write("\n") tokenized_f.close() #Perform placeholder replacement if needed with open(tokenized_f.name) as tokenized_ff: for line in tokenized_ff: line = line.rstrip("\n") with_placeholders = self._introduce_placeholders(line) logging.debug( "Processed training example: {}".format(with_placeholders)) placeholderized_f.write(with_placeholders) placeholderized_f.write("\n") placeholderized_f.close() #Estimate LM lm_file = NamedTemporaryFile(delete=False) lm_file.close() if self.type == LMType.CHARACTER: params = "-o 7 --discount_fallback" else: params = "-o 7 --discount_fallback" self._estimate_kenlm(placeholderized_f.name, lm_file.name, params) self.lm_path = lm_file.name self.lm = kenlm.LanguageModel(self.lm_path) #Remove temporary files os.remove(tokenized_f.name) os.remove(placeholderized_f.name) def copy_lm(self, dst: str): shutil.copyfile(self.lm_path, dst) def cleanup(self): os.remove(self.lm_path) def _raw_score(self, sentence: str): return self.lm.score(sentence) @classmethod def estimate_threshold(cls, filter_a, filter_b, dev_corpus_a: str, dev_corpus_b: str): scores = [] with open(dev_corpus_a) as corpus_a_f, open( dev_corpus_b) as corpus_b_f: for linea, lineb in zip(corpus_a_f, corpus_b_f): linea = linea.rstrip("\n") lineb = 
lineb.rstrip("\n") scores.append(filter_a.score(linea) + filter_b.score(lineb)) return numpy.mean(scores), numpy.std(scores) def score(self, sentence: str): #We need to preprocess the sentence in the same way as when training the LM #sents= self._sentence_split(sentence) #processed_sents=[self._introduce_placeholders(self._tokenize(s)) for s in sents] processed_sent = self._introduce_placeholders(self._tokenize(sentence)) logging.debug("Scoring: {}".format(processed_sent)) raw_score = self._raw_score(processed_sent) #Normalize score #return sum(raw_scores)/(sum([len(s.split()) for s in processed_sents]) + len(processed_sents) ) # We divide by total number of tokens + 1 for each sentence (taken from kenlm perplexity method) return raw_score / (sum([len(processed_sent.split())]) + 1 ) #the same, but assuming only 1 sentence
    writer = tf.io.TFRecordWriter(output_file)
    prev_text_a = None
    query_id = -1
    # assuming contiguous examples for same text_a.
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
        tf_example = create_tf_example(example, tokenizer)
        writer.write(tf_example.SerializeToString())
    writer.close()
    tf.logging.info("Done writing tfrecords to %s" % output_file)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_file', type=str)
    parser.add_argument('--output_file', type=str)
    parser.add_argument('--idx_text', type=int, default=1)
    parser.add_argument('--idx_label', type=int, default=6)
    args, _ = parser.parse_known_args()

    input_file = args.input_file
    output_file = args.output_file
    idx_text = args.idx_text
    idx_label = args.idx_label

    train_examples = get_train_examples(input_file, idx_text, idx_label)
    tf.logging.info("Number of train examples is %d" % len(train_examples))

    tokenizer = Tokenizer(Config.vocab_file)
    if not os.path.exists(output_file):
        file_based_convert_examples_to_tfrecord(train_examples, tokenizer, output_file)
    seed_everything, BalancedDataLoader, make_train_data_from_txt, make_itf)

logging.basicConfig(level=logging.INFO)

if __name__ == '__main__':
    logging.info('*** Initializing ***')

    if not os.path.isdir(Config.data_dir):
        os.mkdir(Config.data_dir)

    seed_everything(Config.seed)
    device = torch.device(Config.device)
    start_epoch = 0

    tokenizer = Tokenizer.from_pretrained(Config.model_name)

    logging.info('Preparing training data')
    if Config.use_pickle:
        with open(f'{Config.pickle_path}', 'rb') as f:
            train_data = pickle.load(f)
    else:
        train_data = make_train_data_from_txt(Config, tokenizer)
    itf = make_itf(train_data, Config.vocab_size)
    dataset = DialogDataset(train_data, tokenizer)

    logging.info('Define Models')
    model = build_model(Config).to(device)
    state_dict = torch.load(f'{Config.data_dir}/{Config.fn}.pth')
    model.load_state_dict(state_dict['model'])
    model.unfreeze()
__author__ = 'Levon'

from interpreter import interpreter
from tokenizer import Tokenizer
from tree import parseTree

pT = parseTree()
tok = Tokenizer()
interp = interpreter()

Tree = pT.buildParseTree(tok.tokenize("1+2"))
assert (interp.evaluate(Tree) == 3)
Tree = pT.buildParseTree(tok.tokenize("(5+(2*3+2))-3*((5+6)/2-4)"))
assert (interp.evaluate(Tree) == 8.5)
Tree = pT.buildParseTree(tok.tokenize("x = 2"))
assert (interp.evaluate(Tree) == 2)
Tree = pT.buildParseTree(tok.tokenize("y = 4^3"))
assert (interp.evaluate(Tree) == 64)
Tree = pT.buildParseTree(tok.tokenize("y^x*2-3"))
assert (interp.evaluate(Tree) == 8189)
Tree = pT.buildParseTree(tok.tokenize("(x+(2*y+2))-y*((5+x)/2-4)"))
assert (interp.evaluate(Tree) == 164)
Tree = pT.buildParseTree(tok.tokenize("sin(10)"))
assert (interp.evaluate(Tree) == -0.5440211108893698)
Tree = pT.buildParseTree(tok.tokenize("2^(5+1)"))
assert (interp.evaluate(Tree) == 64)
Tree = pT.buildParseTree(tok.tokenize("(2+1)^(2+1)"))
assert (interp.evaluate(Tree) == 27)
class Parser(): def __init__(self, origin): self.tokens = Tokenizer(origin) self.tokens.selectNext() def parseProgram(self): token = self.tokens.actual if token.type == "program": token = self.tokens.selectNext() if token.type == "IDE": name_program = token.value token = self.tokens.selectNext() if token.type == "SEMI_COLON": self.tokens.selectNext() variables = self.parseVariables() functions = self.parseFunctions() statements = self.parseStatements() result = Program(name_program, [variables, functions, statements]) token = self.tokens.actual if token.type == "END_PROGRAM": pass else: raise ValueError( "Invalid token, expecting a . on position \ {}".format(self.tokens.position)) else: raise ValueError("Invalid token, expecting a semi colon \ or a end on position {}".format(self.tokens.position)) else: raise ValueError("Invalid token, expecting a program on \ position {}".format(self.tokens.position)) return result def parseFunctionCall(self): pass def parseFunctions(self): token = self.tokens.actual result = Funcs(None, []) while True: if token.type == "function": token = self.tokens.selectNext() if token.type == "IDE": function_name = token.value func = FuncDec(function_name, []) self.tokens.selectNext() arguments = self.parseArgumentsFunction(function_name) self.tokens.selectNext() variables = self.parseVariables() functions = self.parseFunctions() statements = self.parseStatements() func.children.append(arguments) func.children.append(variables) func.children.append(functions) func.children.append(statements) result.children.append(func) token = self.tokens.actual else: raise ValueError( "Invalid token, expecting a identifier on position \ {}".format(self.tokens.position)) elif token.type == "begin": return result else: raise ValueError( "Invalid token, expecting a function on position \ {}".format(self.tokens.position)) def parseArgumentsFunction(self, function_name): token = self.tokens.actual if token.type == "OPEN_PAR": list_arguments = [] while True: token = self.tokens.selectNext() if token.type == "IDE": list_arguments.append(token.value) token = self.tokens.selectNext() if token.type == "VAR_DECLARATION": break elif token.type == "COMMA": pass else: raise ValueError( "Invalid token, expecting a : or , on position \ {}".format(self.tokens.position)) else: raise ValueError( "Invalid token, expecting a identifier on position \ {}".format(self.tokens.position)) token = self.tokens.selectNext() if token.type == "TYPE": arguments = VarDec(None, []) for var_name in list_arguments: var_name = StrVal(var_name, []) value = StrVal(token.value, []) variable = BinOp(":", [var_name, value]) arguments.children.append(variable) token = self.tokens.selectNext() if token.type == "CLOSE_PAR": token = self.tokens.selectNext() if token.type == "VAR_DECLARATION": token = self.tokens.selectNext() if token.type == "TYPE": return_var_name = StrVal(function_name, []) return_type = StrVal(token.value, []) variable = BinOp(":", [return_var_name, return_type]) arguments.children.append(variable) token = self.tokens.selectNext() if token.type == "SEMI_COLON": return arguments else: raise ValueError( "Invalid token, expecting a ; on position \ {}".format(self.tokens.position)) else: raise ValueError( "Invalid token, expecting a type on position \ {}".format(self.tokens.position)) else: raise ValueError( "Invalid token, expecting a : on position \ {}".format(self.tokens.position)) else: raise ValueError( "Invalid token, expecting a ) on position \ {}".format(self.tokens.position)) else: raise 
ValueError("Invalid token, expecting a type on position \ {}".format(self.tokens.position)) else: raise ValueError("Invalid token, expecting a ( on position \ {}".format(self.tokens.position)) def parseVariables(self): token = self.tokens.actual result = VarDec(None, []) if token.type != "begin": if token.type == "var": token = self.tokens.selectNext() while True: list_vars = [] while True: if token.type == "IDE": list_vars.append(token.value) token = self.tokens.selectNext() if token.type == "COMMA": token = self.tokens.selectNext() elif token.type == "VAR_DECLARATION": break else: raise ValueError( "Invalid token, expecting a , or : on position \ {}".format(self.tokens.position)) else: raise ValueError( "Invalid token, expecting a identifier on position \ {}".format(self.tokens.position)) token = self.tokens.selectNext() if token.type == "TYPE": for var_name in list_vars: var_name = StrVal(var_name, []) value = StrVal(token.value, []) variable = BinOp(":", [var_name, value]) result.children.append(variable) token = self.tokens.selectNext() if token.type == "SEMI_COLON": token = self.tokens.selectNext() if token.type == "begin": break elif token.type == "function": break elif token.type == "IDE": pass else: raise ValueError( "Invalid token, expecting a begin \ or identifier on position {}". format(self.tokens.position)) else: raise ValueError( "Invalid token, expecting a ; on position \ {}".format(self.tokens.position)) else: raise ValueError( "Invalid token, expecting a type on position \ {}".format(self.tokens.position)) else: raise ValueError("Invalid token, expecting a var on position \ {}".format(self.tokens.position)) return result def parseStatements(self): token = self.tokens.actual if token.type == "begin": result = Statements(None, []) while True: self.tokens.selectNext() result.children.append(self.parseStatement()) token = self.tokens.actual if token.type == "SEMI_COLON": pass elif token.type == "end": break if self.tokens.actual.type == "end": self.tokens.selectNext() pass else: raise ValueError("Invalid token, expecting a end on \ position {}".format(self.tokens.position)) else: raise ValueError("Invalid token, expecting a begin on \ position {}".format(self.tokens.position)) return result def parseStatement(self): token = self.tokens.actual if token.type == "begin": result = self.parseStatements() elif token.type == "IDE": result = self.parseAtribution() elif token.type == "print": result = self.parsePrint() elif token.type == "if": result = self.parseIf() elif token.type == "while": result = self.parseWhile() else: raise ValueError( "Invalid token, expecting a begin,identifier, print, if or while \ on position {}".format(self.tokens.position)) return result def parseAtribution(self): value1 = StrVal(self.tokens.actual.value, []) token = self.tokens.selectNext() if (token.type == "ATRIBUTE"): token = self.tokens.selectNext() if (token.type == "read"): value2 = self.parseRead() else: value2 = self.parseExpression() result = BinOp(":=", [value1, value2]) else: raise ValueError( "Invalid token, expecting a := on position {}".format( self.tokens.position)) return result def parsePrint(self): token = self.tokens.selectNext() if token.type == "OPEN_PAR": self.tokens.selectNext() value = self.parseExpression() token = self.tokens.actual if token.type == "CLOSE_PAR": result = Print(value, [value]) self.tokens.selectNext() else: raise ValueError( "Invalid token, expecting a ) on position {}".format( self.tokens.position)) else: raise ValueError( "Invalid token, expecting a ( on 
position {}".format( self.tokens.position)) return result def parseRelExpression(self): self.tokens.selectNext() value1 = self.parseExpression() token = self.tokens.actual if token.type == 'COMP': self.tokens.selectNext() value2 = self.parseExpression() result = BinOp(token.value, [value1, value2]) else: raise ValueError("Invalid token, expecting a <, >, = or != \ on position {}".format(self.tokens.position)) return result def parseIf(self): comp = self.parseRelExpression() token = self.tokens.actual if (token.type == "then"): self.tokens.selectNext() statement1 = self.parseStatement() token = self.tokens.actual if (token.type == "else"): self.tokens.selectNext() statement2 = self.parseStatement() else: statement2 = NoOp(None, []) result = If(None, [comp, statement1, statement2]) else: raise ValueError("Invalid token, expecting a then on \ position {}".format(self.tokens.position)) return result def parseRead(self): token = self.tokens.selectNext() if token.type == "OPEN_PAR": self.tokens.selectNext() token = self.tokens.actual if token.type == "CLOSE_PAR": result = Read(None, []) self.tokens.selectNext() else: raise ValueError( "Invalid token, expecting a ) on position {}".format( self.tokens.position)) else: raise ValueError( "Invalid token, expecting a ( on position {}".format( self.tokens.position)) return result def parseWhile(self): comp = self.parseRelExpression() token = self.tokens.actual if (token.type == "then"): self.tokens.selectNext() statement1 = self.parseStatement() token = self.tokens.actual result = While(None, [comp, statement1]) else: raise ValueError("Invalid token, expecting a then on \ position {}".format(self.tokens.position)) return result def parseExpression(self): result = self.parseTerm() while True: token = self.tokens.actual if token is None: break if token.type == "PLUS": self.tokens.selectNext() second_value = self.parseTerm() result = BinOp("+", [result, second_value]) elif token.type == "MINUS": self.tokens.selectNext() second_value = self.parseTerm() result = BinOp("-", [result, second_value]) elif token.type == "or": self.tokens.selectNext() second_value = self.parseTerm() result = BinOp("or", [result, second_value]) else: break return result def parseTerm(self): result = self.parseFactor() while True: token = self.tokens.actual if token is None: break elif token.type == "MULT": self.tokens.selectNext() second_value = self.parseFactor() result = BinOp("*", [result, second_value]) elif token.type == "DIV": self.tokens.selectNext() second_value = self.parseFactor() result = BinOp("/", [result, second_value]) elif token.type == "and": self.tokens.selectNext() second_value = self.parseFactor() result = BinOp("and", [result, second_value]) else: break return result def parseFactor(self): token = self.tokens.actual if token is None: raise ValueError( "Invalid token, expecting a number or opening parentesis on \ position {}, got NULL".format(self.tokens.position)) if token.type == "int": result = IntVal(token.value, []) self.tokens.selectNext() elif token.type == "boolean": result = BoolVal(token.value, []) self.tokens.selectNext() elif token.type == "OPEN_PAR": self.tokens.selectNext() result = self.parseExpression() token = self.tokens.actual if token.type != "CLOSE_PAR": raise ValueError("Invalid token, missing parentesis close on \ position {}".format(self.tokens.position)) elif token.type == "MINUS": self.tokens.selectNext() result = self.parseFactor() result = UnOp("-", [result]) elif token.type == "not": self.tokens.selectNext() result = 
self.parseFactor() result = UnOp("not", [result]) elif token.type == "PLUS": self.tokens.selectNext() result = self.parseFactor() elif token.type == "IDE": identifier = token.value token = self.tokens.selectNext() if token.type == "OPEN_PAR": token = self.tokens.selectNext() args = [] while True: if token.type == "CLOSE_PAR": break else: arg = self.parseExpression() args.append(arg) token = self.tokens.actual if token.type == "COMMA": self.tokens.selectNext() pass elif token.type == "CLOSE_PAR": break else: raise ValueError( "Invalid token, expecting a , or ) on \ position {}".format(self.tokens.position)) none_value = IntVal(None, []) args.append(none_value) result = FuncCall(identifier, args) self.tokens.selectNext() else: result = Identifier(identifier, []) else: raise ValueError( "Invalid token, expecting number or opening parentesis on \ position {}".format(self.tokens.position)) return result
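# Illustrative aside (not part of the original parser): parseExpression calls
# parseTerm before consuming '+'/'-', and parseTerm calls parseFactor before
# consuming '*'/'/', so multiplication binds tighter than addition. The
# simplified _Node class below is only a stand-in for BinOp/IntVal, used to
# show the tree shape the parser would build for "1 + 2 * 3".
class _Node:
    def __init__(self, value, children=None):
        self.value = value
        self.children = children or []


def _evaluate(node):
    if not node.children:  # leaf, analogous to IntVal
        return node.value
    left, right = (_evaluate(c) for c in node.children)
    return {"+": left + right, "-": left - right,
            "*": left * right, "/": left // right}[node.value]


# Analogous to BinOp("+", [IntVal(1), BinOp("*", [IntVal(2), IntVal(3)])])
_tree = _Node("+", [_Node(1), _Node("*", [_Node(2), _Node(3)])])
assert _evaluate(_tree) == 7  # 1 + (2 * 3), not (1 + 2) * 3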
class Transformer(chainer.Chain): def __init__(self, config): self.config = config self.label_smoothing = config.label_smoothing self.position_encoding = self._init_position_encoding( config.max_length, config.unit_num) self.tokenizer = Tokenizer(config.tokenizer_dir, config.dict_dir, config.augmentation) frequency = [] with open(config.freq_dir) as f: for line in f: line = line.rstrip() frequency.append(line) self.itf = 1 / (np.array(frequency, dtype=np.float32) + 1)**config.itf_lambda super(Transformer, self).__init__() with self.init_scope(): self.source_embed = L.EmbedID(config.vocab_size, config.unit_num, ignore_label=config.pad_id) self.enc = Encoder(config) self.target_embed = L.EmbedID(config.vocab_size, config.unit_num, ignore_label=config.pad_id) self.dec = Decoder(config) def forward(self, x_s, x_t, translate=False): """ args x_s: array of padded source sentences. x_t: array of padded target sentences. translate: whether this function used for translate or not. returns dec_out: encoder-decoder model's output. enc_out: encoder's output used for translation. """ length_s, length_t = x_s.shape[1], x_t.shape[1] h_s = self.source_embed(x_s) h_t = self.target_embed(x_t) h_s += self.xp.array(self.position_encoding[None, :length_s]) h_t += self.xp.array(self.position_encoding[None, :length_t]) h_s = F.transpose(h_s, (0, 2, 1)) h_t = F.transpose(h_t, (0, 2, 1)) src_self_mask = self._get_padding_mask(x_s, x_s, self.config.pad_id) tgt_self_mask = self._get_padding_mask(x_t, x_t, self.config.pad_id) tgt_future_mask = self._get_future_mask(x_t) tgt_self_mask *= tgt_future_mask src_tgt_mask = self._get_padding_mask(x_s, x_t, self.config.pad_id) enc_out = self.enc(h_s, src_self_mask) dec_out = self.dec(h_t, enc_out, tgt_self_mask, src_tgt_mask) B, D, L = dec_out.shape dec_out = F.transpose(dec_out, (0, 2, 1)).reshape(B * L, D) dec_out = F.linear(dec_out, self.target_embed.W) if translate: return dec_out, enc_out else: return dec_out def __call__(self, x_s, x_t): """ args x_s: list of source sentences ["こんにちは", "あああああ", ...] x_t: list of target sentence ["こんにちは", "アババババ", ...] returns loss: calculated loss (Variable) """ x_s = self.tokenizer.tokenize_sentences(x_s) x_t = self.tokenizer.tokenize_sentences(x_t) x_s = self._get_padded_sentence(x_s, pad_id=self.config.pad_id) x_t = self._get_padded_sentence(x_t, pad_id=self.config.pad_id, eos_id=self.config.eos_id) batch_t, length_t = x_t.shape y_t = copy.deepcopy(x_t).reshape((batch_t * length_t)) bos_ids = self.xp.repeat(self.xp.array([self.config.bos_id], dtype=np.int32), batch_t, axis=0)[..., None] x_t = self.xp.concatenate([bos_ids, x_t[:, :length_t - 1]], axis=1) y_pred = self.forward(x_s, x_t) if self.label_smoothing: loss = self._label_smoothed_sce(y_pred, y_t, eps=self.config.smooth_eps, itf=self.itf, ignore_label=self.config.pad_id) else: loss = F.softmax_cross_entropy(y_pred, y_t, ignore_label=self.config.pad_id) accuracy = F.accuracy(y_pred, y_t, ignore_label=self.config.pad_id) perplexity = self.xp.exp(loss.data) # print("loss: {}, perp: {}, acc: {}".format(loss.data, perplexity, accuracy.data)) chainer.report( { "loss": loss.data, "perp": perplexity, "acc": accuracy.data }, self) return loss def translate(self, x_s, max_length=65, beam=None): """ args x_s: list of source sentences. ["こんにちは", "あああああ", ...] max_length: max times of auto-regression beam: beam breadth in beam-search '0' or 'None' means 'don't use beam-search'. returns translated: list of inferenced sentence(type:String) list. 
""" batch_size = len(x_s) x_s = self.tokenizer.tokenize_sentences(x_s) x_s = self._get_padded_sentence(x_s, self.config.pad_id) x_t = self.xp.array([self.config.bos_id] * batch_size, dtype=np.int32).reshape(batch_size, 1) eos_flags = self.xp.zeros((batch_size, 1), dtype=np.int32) y_pred, enc_out = self.forward(x_s, x_t, translate=True) with chainer.no_backprop_mode(): with chainer.using_config("train", False): if beam: # first search # x_t, x_s shape: (batch, length) -> (batch*beam, length) x_t = self.xp.concatenate([x_t[:, None, :]] * beam, axis=1).reshape( beam * batch_size, 1) x_s = self.xp.concatenate([x_s[:, None, :]] * beam, axis=1).reshape( beam * batch_size, x_s.shape[1]) scores = self.xp.zeros((batch_size * beam), dtype=np.float32) candidates, s = self._get_beam_results( y_pred.data, beam, 1) scores += s x_t = self.xp.concatenate([x_t, candidates[..., None]], axis=1) x_t = self._beam_translate(max_length - 2, x_s, x_t, None, scores, max_length, beam) else: x_t = self.xp.concatenate( [x_t, self.xp.argmax(y_pred.data, axis=1)[..., None]], axis=1) for i in range(max_length - 1): y_pred = self._translate_forward(enc_out, x_s, x_t) #print(i, self.xp.mean(y_pred.data), self.xp.max(y_pred.data), self.xp.min(y_pred.data)) y_inds = self.xp.argmax(y_pred.data, axis=1)[i + 1::i + 2, None] x_t = self.xp.concatenate([x_t, y_inds], axis=1) eos_flags += (y_inds == self.config.eos_id) if self.xp.all(eos_flags): break translated = [[] for i in range(batch_size)] for b, sentence in enumerate(x_t[:, 1:]): for w in sentence: if w == self.config.eos_id: break translated[b].append(w) translated = self.tokenizer.detokenize_sentences(translated) return translated def _beam_translate(self, depth, x_s, x_t, enc_out, scores, max_length, beam): """recurrent beam search for translate. args depth: controll inferencing depth. (this function perform recurrently) x_s: array of source sentences. (batch*beam, length) Note this x_s is not the same as arg of 'translate' function. x_t: array of target sentences. (batch*beam, length) this arg changes gradually in auto-regression. enc_out: encoder's output (fixed after calculated once) scores: candidates scores for selecting good output. max_length: max times of auto-regression. beam: beam breadth in beam-search. returns x_t: predicted (intermediate) sentence. """ batch_size = len(x_t) if depth == max_length - 2: # y_pred shapes (batch*beam*2, vocab_size), and get candidates from y_pred y_pred, enc_out = self.forward(x_s, x_t, translate=True) else: y_pred = self._translate_forward(enc_out, x_s, x_t) candidates, s = self._get_beam_results(y_pred.data, beam, max_length - depth) # x_t shape -> (batch*beam*beam, L) -> (batch, beam*beam, L) x_t = self.xp.concatenate([x_t[:, None, :]] * beam, axis=1) x_t = x_t.reshape(beam * batch_size, max_length - depth) x_t = self.xp.concatenate([x_t, candidates[..., None]], axis=1) x_t = x_t.reshape(batch_size // beam, beam * beam, max_length - depth + 1) # score the same as x_t scores = self.xp.concatenate([scores[:, None]] * beam, axis=1) scores = scores.reshape(beam * batch_size, ) scores += s scores = scores.reshape(batch_size // beam, beam * beam) if depth == 0: best_sentence_ind = self.xp.argmax(scores, axis=1) x_t = x_t[self.xp.arange(batch_size // beam), best_sentence_ind] return x_t # sorting by scores, getting sentence-candidates for next depth. 
beam_indeces = self.xp.argsort(scores, axis=1)[:, ::-1][:, :beam] beam_indeces = self.xp.concatenate(beam_indeces, axis=0) batch_indeces = self.xp.arange(batch_size // beam) batch_indeces = self.xp.concatenate([batch_indeces[..., None]] * beam, axis=1) batch_indeces = batch_indeces.reshape(batch_size, ) x_t = x_t[batch_indeces, beam_indeces] scores = self.xp.sort(scores, axis=1)[:, ::-1][:, :beam] scores = self.xp.concatenate(scores, axis=0) if self.xp.all(self.xp.any(x_t == 2, axis=1)): scores = scores.reshape(batch_size // beam, beam) best_sentence_ind = self.xp.argmax(scores, axis=1) x_t = x_t.reshape(batch_size // beam, beam, x_t.shape[1]) x_t = x_t[self.xp.arange(batch_size // beam), best_sentence_ind] return x_t x_t = self._beam_translate(depth - 1, x_s, x_t, enc_out, scores, max_length, beam) return x_t def _get_beam_results(self, y_pred, beam, position): """beam results should be (batch*beam, length). args y_pred: decoder's output in auto-regression. beam: beam size of candidate getting position: specify where candidates should be get from. if position is 2, <> position below will be candidates. [<batch_0>, batch_1, <batch_2>, batch_3, ..., <batch_2n>] returns candidates: top beam-th candidates on y_pred. scores: top beam-th scores on y_pred. """ candidates = self.xp.argsort(y_pred)[:, ::-1][position - 1::position, :beam] candidates = self.xp.concatenate(candidates, axis=0) scores = self.xp.sort(y_pred)[:, ::-1][position - 1::position, :beam] scores = self.xp.concatenate(scores, axis=0) return candidates, scores def _translate_forward(self, enc_out, x_s, x_t): """reusing enc_out for efficient calculation. args enc_out: encoder's output (fixed after calculated once) x_s: array of source sentences. Note this x_s is not the same as arg of 'translate' function. x_t: array of target sentences. this arg changes gradually in auto-regression. returns dec_out: decoder's output """ length_t = x_t.shape[1] h_t = self.target_embed(x_t) h_t += self.position_encoding[None, :length_t] h_t = F.transpose(h_t, (0, 2, 1)) tgt_self_mask = self._get_padding_mask(x_t, x_t, self.config.pad_id) tgt_future_mask = self._get_future_mask(x_t) tgt_self_mask *= tgt_future_mask src_tgt_mask = self._get_padding_mask(x_s, x_t, self.config.pad_id) dec_out = self.dec(h_t, enc_out, tgt_self_mask, src_tgt_mask) B, D, L = dec_out.shape dec_out = F.transpose(dec_out, (0, 2, 1)).reshape(B * L, D) dec_out = F.linear(dec_out, self.target_embed.W) return dec_out def _init_position_encoding(self, max_length, unit_num): half_dim = unit_num // 2 dim_positions = -(np.arange(half_dim) * 2 / unit_num) dim_positions = 10000**dim_positions word_positions = np.arange(max_length) general_encode = word_positions[..., None] * dim_positions[None, ...] 
even_dims = np.sin(general_encode) odd_dims = np.cos(general_encode) position_encoding = np.concatenate( [even_dims[..., None], odd_dims[..., None]], axis=2) position_encoding = position_encoding.reshape(max_length, unit_num) return position_encoding.astype(np.float32) def _get_padded_sentence(self, xs, pad_id, eos_id=None): batch_size = len(xs) max_length = max([len(x) for x in xs]) if eos_id: padded_sentence = self.xp.full((batch_size, max_length + 2), pad_id, dtype=np.int32) for i, x in enumerate(xs): x_eos = x + [eos_id] padded_sentence[i, :len(x_eos)] = self.xp.array(x_eos, dtype=np.int32) else: padded_sentence = self.xp.full((batch_size, max_length), pad_id, dtype=np.int32) for i, x in enumerate(xs): padded_sentence[i, :len(x)] = self.xp.array(x, dtype=np.int32) return padded_sentence def _get_padding_mask(self, key, query, pad_id): """ args key: key in attention. in source-target attention, this means 'source' shape is (batch, length). query: query in attention. in source-target attention, this means 'target' shape is (batch, length). returns mask: (batch, q-length, k-length) shape xp-array. """ query_mask = query != pad_id key_mask = key != pad_id mask = key_mask[:, None, :] * query_mask[..., None] return mask def _get_future_mask(self, x): """ args x: target's input array shape is (batch, length) returns mask: mask for future-ignoring. when batch is 1 and length is 4, [[[ True, False, False, False], [ True, True, False, False], [ True, True, True, False], [ True, True, True, True]]] will be return. """ batch, length = x.shape arange = self.xp.arange(length) future_mask = (arange[None, ] <= arange[:, None])[None, ...] future_mask = self.xp.concatenate([future_mask] * batch, axis=0) return future_mask def _label_smoothed_sce(self, y, t, eps, itf, ignore_label=None): """note: variable 'batch_size' means batch*length of the task. args y: model output (batch*length, vocab_size) t: ground truth (batch*length, ) this value is index of truth word in vocab. eps: epsilon for label-smoothing. itf: array of inverse token frequency. ignore_label: word whitch should be ignored for calculation. returns loss: loss (Variable) between y and label-smoothed-t. """ xp = chainer.cuda.get_array_module(t) batch_size, vocab_size = y.shape func_u = eps / vocab_size smoothed_t = xp.zeros_like(y.data).astype(np.float32) smoothed_t[xp.arange(batch_size), t] = 1 - eps # + func_u smoothed_t += func_u loss = F.log_softmax(y) * smoothed_t normalizer = batch_size if ignore_label: ignore_mask = t != ignore_label normalizer = xp.sum(ignore_mask) loss = ignore_mask[..., None] * loss loss = loss * self.xp.array(itf[None, ...], dtype=np.float32) loss = -F.sum(loss) / normalizer return loss
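# Reference note (not from the original file): _init_position_encoding above
# builds the standard interleaved sinusoidal encoding,
#   PE[pos, 2i]   = sin(pos / 10000**(2i / unit_num))
#   PE[pos, 2i+1] = cos(pos / 10000**(2i / unit_num)).
# A minimal NumPy-only sketch of the same construction, with illustrative names:
import numpy as np


def sinusoid_position_encoding(max_length, unit_num):
    positions = np.arange(max_length)[:, None]                        # (L, 1)
    inv_freq = 10000.0 ** (-np.arange(unit_num // 2) * 2 / unit_num)  # (D/2,)
    angles = positions * inv_freq[None, :]                            # (L, D/2)
    pe = np.stack([np.sin(angles), np.cos(angles)], axis=2)           # (L, D/2, 2)
    return pe.reshape(max_length, unit_num).astype(np.float32)        # sin/cos interleaved


_pe = sinusoid_position_encoding(max_length=8, unit_num=4)
assert _pe.shape == (8, 4)
assert np.isclose(_pe[0, 0], 0.0) and np.isclose(_pe[0, 1], 1.0)      # sin(0)=0, cos(0)=1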
def __init__(self, origin):
    self.tokens = Tokenizer(origin)
    self.tokens.selectNext()
class AttentionTFIDFClassifier(BaseEstimator, ClassifierMixin): def __init__(self, hiddens=300, mindf=2, lan='english', stopwords='nltk', k=512, max_drop=.85, batch_size=64, lr=5e-3, weight_decay=5e-3, nepochs=1000, patience=10, factor=.95, vocab_max_size=300000, n_jobs=cpu_count(), _device=torch.device('cuda:0'), _verbose=False): super(AttentionTFIDFClassifier, self).__init__() self._model = None self._tokenizer = None self.nepochs = int(nepochs) self.hiddens = int(hiddens) self.mindf = int(mindf) self.lan = lan self.stopwords = stopwords self.k = int(k) self.max_drop = max_drop self.vocab_max_size = vocab_max_size self._verbose = _verbose self._device = _device self.n_jobs = int(n_jobs) self.lr = lr self.weight_decay = weight_decay self.patience = int(patience) self.factor = factor self.batch_size = int(batch_size) def collate_train(param): X, y = zip(*param) y = self._tokenizer.le.transform(y) doc_tids, TFs, DFs = self._tokenizer.transform(X, verbose=False) doc_tids = pad_sequence(list(map(torch.LongTensor, doc_tids)), batch_first=True, padding_value=0) TFs = pad_sequence(list(map(torch.tensor, TFs)), batch_first=True, padding_value=0) TFs = torch.LongTensor(torch.log2(TFs + 1).round().long()) DFs = pad_sequence(list(map(torch.tensor, DFs)), batch_first=True, padding_value=0) DFs = torch.LongTensor(torch.log2(DFs + 1).round().long()) return doc_tids, TFs, DFs, torch.LongTensor(y) def collate_predict(X): doc_tids, TFs, DFs = self._tokenizer.transform(X, verbose=False) doc_tids = pad_sequence(list(map(torch.LongTensor, doc_tids)), batch_first=True, padding_value=0) TFs = pad_sequence(list(map(torch.tensor, TFs)), batch_first=True, padding_value=0) TFs = torch.LongTensor(torch.log2(TFs + 1).round().long()) DFs = pad_sequence(list(map(torch.tensor, DFs)), batch_first=True, padding_value=0) DFs = torch.LongTensor(torch.log2(DFs + 1).round().long()) return doc_tids, TFs, DFs self.collate_train = collate_train self.collate_predict = collate_predict def fit(self, X_train, y_train, X_val=None, y_val=None): if X_val is None or y_val is None: pass self._tokenizer = Tokenizer(mindf=self.mindf, lan=self.lan, stopwordsSet=self.stopwords, model='sample', k=self.k, verbose=self._verbose) self._tokenizer.fit(X_train, y_train) self.maxF = int(round(np.log2(self._tokenizer.maxF + 1))) self._model = AttentionTFIDF(vocab_size=self._tokenizer.vocab_size, hiddens=self.hiddens, nclass=self._tokenizer.n_class, maxF=self.maxF, drop=self.max_drop).to(self._device) optimizer = optim.AdamW(self._model.parameters(), lr=self.lr, weight_decay=self.weight_decay) loss_func_cel = nn.CrossEntropyLoss().to(self._device) scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( optimizer, mode='min', factor=self.factor, patience=3, verbose=self._verbose) best = 99999. best_acc = 0. counter = 1 dl_val = DataLoader(list(zip(X_val, y_val)), batch_size=self.batch_size, shuffle=False, collate_fn=self.collate_train, num_workers=self.n_jobs) for e in tqdm(range(self.nepochs), total=self.nepochs, disable=not self._verbose): dl_train = DataLoader(list(zip(X_train, y_train)), batch_size=self.batch_size, shuffle=True, collate_fn=self.collate_train, num_workers=self.n_jobs) loss_train = 0. 
with tqdm(total=len(y_train) + len(y_val), smoothing=0., desc=f"ACC_val: {best_acc:.2} Epoch {e+1}", disable=not self._verbose) as pbar: total = 0 correct = 0 self._model.train() self._tokenizer.model = 'sample' for i, (doc_tids, TFs, DFs, y) in enumerate(dl_train): doc_tids = doc_tids.to(self._device) TFs = TFs.to(self._device) DFs = DFs.to(self._device) y = y.to(self._device) pred_docs, _, _ = self._model(doc_tids, TFs, DFs) pred_docs = torch.softmax(pred_docs, dim=1) loss = loss_func_cel(pred_docs, y) optimizer.zero_grad() loss.backward() optimizer.step() loss_train += loss.item() total += len(y) y_pred = pred_docs.argmax(axis=1) correct += (y_pred == y).sum().item() self._model.drop_ = (correct / total) * self.max_drop pbar.update(len(y)) del doc_tids, TFs del DFs, y, pred_docs del loss, y_pred loss_train = loss_train / (i + 1) total = 0 correct = 0 self._model.eval() self._tokenizer.model = 'topk' with torch.no_grad(): loss_val = 0. for i, (doc_tids, TFs, DFs, y) in enumerate(dl_val): doc_tids = doc_tids.to(self._device) TFs = TFs.to(self._device) DFs = DFs.to(self._device) y = y.to(self._device) pred_docs, _, _ = self._model(doc_tids, TFs, DFs) pred_docs = torch.softmax(pred_docs, dim=1) loss = loss_func_cel(pred_docs, y) loss_val += loss.item() total += len(y) y_pred = pred_docs.argmax(axis=1) correct += (y_pred == y).sum().item() pbar.update(len(y)) loss_val del doc_tids, TFs, DFs, y del pred_docs, loss loss_val = (loss_val / (i + 1)) scheduler.step(loss_val) if best - loss_val > 0.0001: best = loss_val counter = 1 best_acc = correct / total best_model = copy.deepcopy(self._model).to('cpu') elif counter > self.patience: break else: counter += 1 self._model = best_model.to(self._device) self._loss = best self._acc = best_acc return self def predict(self, X): if self._model is None or self._tokenizer is None: raise Exception("Not implemented yet!") self._model.eval() self._tokenizer.model = 'topk' dataloader = DataLoader(X, batch_size=self.batch_size, shuffle=False, collate_fn=self.collate_predict, num_workers=self.n_jobs) result = [] with torch.no_grad(): loss_val = 0. for i, (doc_tids, TFs, DFs) in enumerate(dataloader): doc_tids = doc_tids.to(self._device) TFs = TFs.to(self._device) DFs = DFs.to(self._device) pred_docs, _, _ = self._model(doc_tids, TFs, DFs) pred_docs = torch.softmax( pred_docs, dim=1).argmax(axis=1).cpu().detach().numpy() result.extend(list(pred_docs)) return self._tokenizer.le.inverse_transform(np.array(result)) def to(self, device): self._device = device if self._model is not None: self._model.to(self._device) return self
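# Hypothetical usage sketch (toy data, CPU device); since the class follows the
# scikit-learn estimator convention, fit/predict usage would presumably look like
# this. Note that fit() needs an explicit validation split: passing X_val=None is
# currently a no-op and building dl_val would then fail.
import torch

_texts = ["great product, loved it", "terrible support", "works fine", "awful quality"]
_labels = ["pos", "neg", "pos", "neg"]

_clf = AttentionTFIDFClassifier(hiddens=32, mindf=1, k=16, nepochs=5, batch_size=2,
                                _device=torch.device("cpu"), _verbose=True)
_clf.fit(_texts[:2], _labels[:2], X_val=_texts[2:], y_val=_labels[2:])
print(_clf.predict(["support was awful"]))  # e.g. array(['neg'], dtype=object)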
from tokenizer import Tokenizer
from transformer.frontend import NLP
import os

# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

num_merges = 40000
tokenization_path = f"tokenization_{num_merges}.json"
input_path = 'input.txt'

# Load a cached tokenization if present, otherwise build it and save it.
tokenizer = Tokenizer()
if os.path.exists(tokenization_path):
    tokenizer.load(tokenization_path)
else:
    tokenizer.from_file(input_path, num_merges)
    tokenizer.save(tokenization_path)

nlp = NLP(tokenizer,
          maximum_position_encoding=1000,
          d_model=64,
          num_layers=5,
          dff=1024,
          num_heads=8)

with open(input_path, 'r', encoding='utf-8') as f:
    nlp.train(f.read(), prev_tokens=128, epochs=10, evaluate_str=['XD'])

prompt = input()  # renamed from `str` to avoid shadowing the builtin
nlp.generate_text(prompt, length=200)
# while True:
#     output, in_tokens, translated_tokens, attention_weights = nlp.evaluate(prompt)
config = run.config

df = pd.read_csv('tweets.csv')
target = df['is_there_an_emotion_directed_at_a_brand_or_product']
text = df['tweet_text']
fixed_text = text[pd.notnull(text)]
fixed_target = target[pd.notnull(text)]

# Load pre-trained GloVe vectors into a word -> np.ndarray dict.
w2v = {}
with open("glove/glove.6B.50d.txt", "r") as lines:
    for line in lines:
        word, numbers = line.split(" ", 1)
        number_array = np.array(numbers.split()).astype(float)  # np.float is removed in recent NumPy
        w2v[word] = number_array

text_clf = Pipeline([
    ('token', Tokenizer()),
    ('vect', MeanEmbeddingVectorizer(w2v)),
    ("extra trees", ExtraTreesClassifier(n_estimators=200)),
])
text_clf.fit(fixed_text, fixed_target)

scores = cross_val_score(text_clf, fixed_text, fixed_target)
print(scores)
print(scores.mean())

predictions = cross_val_predict(text_clf, fixed_text, fixed_target)
log(run, fixed_text, fixed_target, predictions)
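# The MeanEmbeddingVectorizer used in the pipeline above is not shown in this
# snippet. A common minimal implementation (an assumption, not necessarily the
# project's own) averages the GloVe vectors of each already-tokenized document:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin


class MeanEmbeddingVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = len(next(iter(word2vec.values())))  # 50 for glove.6B.50d

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # One row per document: mean of known word vectors, zeros if none are known.
        return np.array([
            np.mean([self.word2vec[w] for w in doc if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for doc in X
        ])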
def __init__(self):
    self.src_vocab = Tokenizer.en_vocab_create()
    self.trg_vocab = Tokenizer.ja_vocab_create()
import os  # needed for os.listdir below

import numpy as np
from tqdm import tqdm
import librosa
from scipy.io import wavfile
from sklearn.model_selection import train_test_split

import hyperparameter as hp
from tokenizer import Tokenizer
from data_hdf5 import HDF5DatasetWriter
from audio import Audio

MAX_LEN_TEXT = 300
MAX_LEN_AUDIO = 1595

audio = Audio(hp)
tokenizer = Tokenizer(alphabet=hp.alphabet)
data_list = os.listdir(hp.data_path)
tokens = []
audio_links = []
label_links = []


def process_wav(wav_path):
    y, sr = audio.load_wav(wav_path)
    mel = audio.mel_spectrogram(y)
    # Originally written as `assert cond, len(mel.shape) == 2`, which turned the
    # second check into the assertion message; both conditions are asserted here.
    assert len(mel.shape) == 2
    assert mel.shape[1] == audio.config.mel_channels
    # Sentinel frames marking the start and end of the spectrogram.
    start_token = np.ones((1, hp.mel_channels)) * hp.mel_start_value
    end_token = np.ones((1, hp.mel_channels)) * hp.mel_end_value
    mel = np.concatenate([start_token, mel], 0)
def compute_token_ids(self):
    parser = Tokenizer(self.args.token_args)
    return parser.token2id()
def __init__(self, encoder_model_dimension: int, decoder_model_dimension: int, encoder_num_heads: list, decoder_num_heads: list, encoder_maximum_position_encoding: int, decoder_maximum_position_encoding: int, encoder_dense_blocks: int, decoder_dense_blocks: int, encoder_prenet_dimension: int, decoder_prenet_dimension: int, postnet_conv_filters: int, postnet_conv_layers: int, postnet_kernel_size: int, dropout_rate: float, mel_start_value: float, mel_end_value: float, mel_channels: int, encoder_attention_conv_filters: int = None, decoder_attention_conv_filters: int = None, encoder_attention_conv_kernel: int = None, decoder_attention_conv_kernel: int = None, encoder_feed_forward_dimension: int = None, decoder_feed_forward_dimension: int = None, decoder_prenet_dropout=0.5, max_r: int = 10, **kwargs): super(AutoregressiveTransformer, self).__init__(**kwargs) self.start_vec = tf.ones((1, mel_channels), dtype=tf.float32) * mel_start_value self.end_vec = tf.ones((1, mel_channels), dtype=tf.float32) * mel_end_value self.stop_prob_index = 2 self.max_r = max_r self.r = max_r self.mel_channels = mel_channels self.drop_n_heads = 0 self.tokenizer = Tokenizer(alphabet=hp.alphabet) self.encoder_prenet = tf.keras.layers.Embedding(self.tokenizer.vocab_size, encoder_prenet_dimension, name='Embedding') self.encoder = SelfAttentionBlocks(model_dim=encoder_model_dimension, dropout_rate=dropout_rate, num_heads=encoder_num_heads, feed_forward_dimension=encoder_feed_forward_dimension, maximum_position_encoding=encoder_maximum_position_encoding, dense_blocks=encoder_dense_blocks, conv_filters=encoder_attention_conv_filters, kernel_size=encoder_attention_conv_kernel, conv_activation='relu', name='Encoder') self.decoder_prenet = DecoderPrenet(model_dim=decoder_model_dimension, dense_hidden_units=decoder_prenet_dimension, dropout_rate=decoder_prenet_dropout, name='DecoderPrenet') self.decoder = CrossAttentionBlocks(model_dim=decoder_model_dimension, dropout_rate=dropout_rate, num_heads=decoder_num_heads, feed_forward_dimension=decoder_feed_forward_dimension, maximum_position_encoding=decoder_maximum_position_encoding, dense_blocks=decoder_dense_blocks, conv_filters=decoder_attention_conv_filters, conv_kernel=decoder_attention_conv_kernel, conv_activation='relu', conv_padding='causal', name='Decoder') self.final_proj_mel = tf.keras.layers.Dense(self.mel_channels * self.max_r, name='FinalProj') self.decoder_postnet = Postnet(mel_channels=mel_channels, conv_filters=postnet_conv_filters, conv_layers=postnet_conv_layers, kernel_size=postnet_kernel_size, name='Postnet') self.training_input_signature = [ tf.TensorSpec(shape=(None, None), dtype=tf.int32), tf.TensorSpec(shape=(None, None, mel_channels), dtype=tf.float32), tf.TensorSpec(shape=(None, None), dtype=tf.int32) ] self.forward_input_signature = [ tf.TensorSpec(shape=(None, None), dtype=tf.int32), tf.TensorSpec(shape=(None, None, mel_channels), dtype=tf.float32), ] self.encoder_signature = [ tf.TensorSpec(shape=(None, None), dtype=tf.int32) ] self.decoder_signature = [ tf.TensorSpec(shape=(None, None, encoder_model_dimension), dtype=tf.float32), tf.TensorSpec(shape=(None, None, mel_channels), dtype=tf.float32), tf.TensorSpec(shape=(None, None, None, None), dtype=tf.float32), ]
    # Tail of build_vocabulary(sequences, max_words): flatten the token
    # sequences, keep the most frequent words and assign consecutive ids.
    for token_sequence in sequences:
        words.extend(token_sequence)
    word_counts = dict(Counter(words).most_common(max_words))
    most_common_words = list(word_counts.keys())
    word_ids = list(range(len(most_common_words)))
    vocabulary = dict(zip(most_common_words, word_ids))
    return vocabulary


sentences = np.genfromtxt('./tickets_QIT.txt', dtype=str, delimiter='\n')

# Decorator-style preprocessing chain: each step wraps the previous one.
prep = TextPreprocessor(sentences)
prep = QITEmailBodyCleaner(prep)
prep = Tokenizer(prep, language='italian')
tokens = prep.preprocess()

vocabulary = build_vocabulary(tokens)
unknown_token_id = max(vocabulary.values()) + 1

prep = IntegerEncoder(prep, vocabulary, unknown_token_id)
prep = WordContextPairsGenerator(prep, window_length=2)
word_context_pairs = prep.preprocess()

target_words = [tw for (tw, cw) in word_context_pairs]
context_words = [cw for (tw, cw) in word_context_pairs]
np.savetxt('target_words.txt', target_words, fmt='%d')
np.savetxt('context_words.txt', context_words, fmt='%d')
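# Worked toy example (not the project's implementation): with window_length=2,
# WordContextPairsGenerator is expected to emit (target, context) pairs for every
# neighbour within two positions of each token, as sketched below.
def _context_pairs(tokens, window_length=2):
    pairs = []
    for i, target in enumerate(tokens):
        lo, hi = max(0, i - window_length), min(len(tokens), i + window_length + 1)
        pairs.extend((target, tokens[j]) for j in range(lo, hi) if j != i)
    return pairs


print(_context_pairs(["apro", "un", "ticket", "urgente"], window_length=2))
# [('apro', 'un'), ('apro', 'ticket'), ('un', 'apro'), ('un', 'ticket'), ('un', 'urgente'), ...]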
    # Tail of the LTF writer: pretty-print the XML tree and write it to disk.
    f_xml = xml.dom.minidom.parseString(root_str)
    pretty_xml_as_string = f_xml.toprettyxml(encoding="utf-8")
    f = open(out_file, 'wb')
    f.write(pretty_xml_as_string)
    f.close()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('rsd_input', type=str,
                        help='input rsd file path or directory.')
    parser.add_argument('ltf_output', type=str,
                        help='output ltf file path or directory.')
    t = Tokenizer()
    parser.add_argument('--seg_option', default='linebreak',
                        help="segmentation options: %s (default is linebreak)" %
                             ', '.join(t.segmenters.keys()))
    parser.add_argument('--tok_option', default='unitok',
                        help="tokenization options: %s (default is unitok)" %
                             ', '.join(t.tokenizers.keys()))
    parser.add_argument('--extension', default='.rsd.txt',
                        help="extension of rsd file")
    parser.add_argument('--re_segment', action='store_true', default=False,
    small_diff - the maximum allowed difference between the two most probable
                 variants of a rule,
    """
    return [self.parse_sent(sentence, radius, suff_len, small_diff, process_cases)
            for sentence in self.make_sents(self.lemmatize(tokens))]


if __name__ == "__main__":
    filename = os.path.join(os.path.dirname(sys.argv[0]), "test/freview.txt")
    trainfile = os.path.join(os.path.dirname(sys.argv[0]), "dicts/ruscorpora.txt.lemma")
    prepsfile = os.path.join(os.path.dirname(sys.argv[0]), "corpora/preps_stat.txt")
    print "STARTED:", str(datetime.now())
    start = time.time()
    # Load the Russian dictionary
    morph = get_morph(os.path.join(os.path.dirname(sys.argv[0]), "pydicts").decode("UTF8"))
    # Load the Russian dictionary again, without prefix checking
    morph_simple = get_morph(os.path.join(os.path.dirname(sys.argv[0]), "pydicts").decode("UTF8"),
                             check_prefixes=False)
    tok = Tokenizer()  # Load the tokenizer
    dater = Dater()  # Load the date handler
    tagger = Tagger(morph, morph_simple, dater)  # Load the tagger
    # t = time.time()
    # tagger.prepare_cases(trainfile)
    # print "Cases prepared! It took", time.time() - t
    # t = time.time()
    # tagger.train_cases(trainfile + ".cases")  # Train the tagger on grammatical cases
    # print "Cases trained! It took", time.time() - t
    tagger.prepare_corpus(trainfile, 3)
    tagger.prepare_corpus(trainfile, 4)
    tagger.prepare_corpus(trainfile, 5)
    print "Corpus prepared!"
    tagger.train(trainfile + ".03.suffs", 3)  # Train the tagger on suffixes
    tagger.load_statistics(trainfile, 3)  # Load the suffix statistics
    # tagger.dump_preps(prepsfile)  # Dump case rules conditioned on prepositions to a text file