def main(argv):
    if len(argv) < 3:
        usage()
    model_dir = argv[1]
    data_file = argv[2]

    # Load tokenizer
    tokenizer = WordTokenizer()
    tokenizer.load(os.path.join(model_dir, 'tokenizer'))

    # Load classifier
    classifier = ProductClassifier()
    classifier.load(os.path.join(model_dir, 'classifier'))

    # Load named entity recognizer
    ner = ProductNER()
    ner.load(os.path.join(model_dir, 'ner'))

    # Write a copy of the input CSV with 'category' and 'brand' columns added
    with open(data_file, 'r', newline='') as f:
        reader = csv.DictReader(f)
        out_path = '.'.join(data_file.split('.')[:-1] + ['processed', 'csv'])
        with open(out_path, 'w', newline='') as outfile:
            writer = csv.DictWriter(
                outfile, fieldnames=reader.fieldnames + ['category', 'brand'])
            writer.writeheader()
            count = 0
            for row in reader:
                count += 1
                processed_row = process(row, tokenizer, classifier, ner)
                print(processed_row)
                writer.writerow(processed_row)
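# main() above calls a process() helper that is not shown here. A minimal
# sketch of what it might look like, assuming classifier.classify(data)
# returns one {category: score} dict per input (as the demo script below
# suggests) and that ner exposes a hypothetical extract() returning entity
# dicts with a 'brand' key -- neither is a confirmed API of this repo:
def process(row, tokenizer, classifier, ner):
    text = row['title'] + ' ' + row['description']
    data = tokenizer.tokenize([text])
    # Pick the highest-scoring category for this row
    scores = classifier.classify(data)[0]
    row['category'] = max(scores, key=scores.get)
    # Pull a brand from the NER output, if any (extract() is hypothetical)
    entities = ner.extract(data)[0]
    row['brand'] = entities.get('brand', '')
    return row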
def load_models(model_dir):
    # Load tokenizer
    tokenizer = WordTokenizer()
    tokenizer.load(os.path.join(model_dir, 'tokenizer'))

    # Load classifier
    classifier = ProductClassifier()
    classifier.load(os.path.join(model_dir, 'classifier'))

    # Load named entity recognizer
    ner = ProductNER()
    ner.load(os.path.join(model_dir, 'ner'))

    return tokenizer, classifier, ner
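# A sketch of wiring load_models() into an entry point, assuming the module
# imports sys and defines a usage() helper like the scripts here do:
if __name__ == '__main__':
    if len(sys.argv) < 2:
        usage()
    tokenizer, classifier, ner = load_models(sys.argv[1])
    # Smoke-test the loaded models on a single product title
    data = tokenizer.tokenize(["Cambridge wall calendar"])
    print(classifier.classify(data)[0])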
def main(argv):
    if len(argv) < 2:
        usage()

    # Fetch data
    texts, categories = [], []
    with open(argv[1], 'r', newline='') as f:
        reader = csv.DictReader(
            f, fieldnames=["title", "brand", "description", "categories"])
        count = 0
        for row in reader:
            count += 1
            # Train on title + description; label with the top-level category
            text = row['title'] + ' ' + row['description']
            category = row['categories'].split(' / ')[0]
            texts.append(text)
            categories.append(category)
            if count >= MAX_TEXTS:
                break
    print('Processed %s texts.' % len(texts))

    # Tokenize texts
    tokenizer = WordTokenizer()
    tokenizer.load()
    data = tokenizer.tokenize(texts)

    # Get labels from classifier
    classifier = ProductClassifier()
    labels = classifier.get_labels(categories)

    # Compile classifier network and train
    classifier.compile(tokenizer)
    classifier.train(data, labels)
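# main() above relies on a usage() helper and a MAX_TEXTS cap that are not
# shown. A minimal sketch under those assumptions -- the cap value, message,
# and script name are illustrative, not the repo's actual values:
import sys

MAX_TEXTS = 100000  # hypothetical limit on the number of training rows read

def usage():
    print('Usage: python train_classifier.py <data.csv>')
    sys.exit(1)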
from collections import Counter  # the frequency filter below relies on this

def main(argv):
    if len(argv) < 2:
        usage()

    # Fetch data
    texts, categories = [], []
    with open(argv[1], 'r', newline='') as f:
        reader = csv.DictReader(
            f, fieldnames=["title", "brand", "description", "categories"])
        count = 0
        for row in reader:
            count += 1
            # TODO change here what we train on, and what categories are used
            text, category = row['title'], row['categories'].split(' / ')[0]
            texts.append(text)
            categories.append(category)
            if count >= MAX_TEXTS:
                break
    print('Processed %s texts.' % len(texts))

    # Keep only examples whose category occurs more than 200 times,
    # so the classifier trains on reasonably frequent classes only
    tmpx, tmpy = [], []
    c = Counter(categories)
    for x, y in zip(texts, categories):
        if c[y] > 200:
            tmpx.append(x)
            tmpy.append(y)
    texts = tmpx
    categories = tmpy
    print(Counter(tmpy))

    # Tokenize texts
    tokenizer = WordTokenizer()
    tokenizer.load()
    data = tokenizer.tokenize(texts)

    # Get labels from classifier
    classifier = ProductClassifier()
    labels = classifier.get_labels(categories)

    # Compile classifier network and train
    classifier.compile(tokenizer)
    classifier.train(data, labels)
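# A tiny self-contained demonstration of the Counter-based filter used above,
# with a threshold of 1 instead of 200 (toy data, not from the repo):
from collections import Counter

texts = ['a', 'b', 'c', 'd']
categories = ['Books', 'Books', 'Toys', 'Garden']
c = Counter(categories)
kept = [(x, y) for x, y in zip(texts, categories) if c[y] > 1]
print(kept)  # [('a', 'Books'), ('b', 'Books')] -- only the frequent class survives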
import os

import numpy as np

from tokenizer import WordTokenizer
from classifier import ProductClassifier

model_dir = './models'

# Load tokenizer
tokenizer = WordTokenizer()
tokenizer.load(os.path.join(model_dir, 'tokenizer'))

# Load classifier
classifier = ProductClassifier()
classifier.load(os.path.join(model_dir, 'classifier'))

# Classify a single example and report the highest-scoring category
data = tokenizer.tokenize(["Cambridge wall calendar"])
class_scores = classifier.classify(data)[0]
print(class_scores)

# dict views are not indexable in Python 3, so materialize them as lists
values = list(class_scores.values())
best_val_idx = int(np.argmax(values))
best_val = values[best_val_idx]
best_class = list(class_scores)[best_val_idx]
print(best_val, best_class)
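# An equivalent, arguably cleaner way to get the top category without numpy:
# max() over the (category, score) pairs, keyed on the score.
from operator import itemgetter

best_class, best_val = max(class_scores.items(), key=itemgetter(1))
print(best_val, best_class)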