Example #1
def main(argv):
    if len(argv) < 2:
        usage()

    # Fetch data
    texts, categories = [], []
    with open(argv[1], newline='') as f:
        reader = csv.DictReader(
            f, fieldnames=["title", "brand", "description", "categories"])
        count = 0
        for row in reader:
            count += 1
            # Train on title + description; label with the top-level category
            text = row['title'] + ' ' + row['description']
            category = row['categories'].split(' / ')[0]
            texts.append(text)
            categories.append(category)
            if count >= MAX_TEXTS:
                break
    print('Processed %s texts.' % len(texts))

    # Tokenize texts
    tokenizer = WordTokenizer()
    tokenizer.load()
    data = tokenizer.tokenize(texts)

    # Get labels from classifier
    classifier = ProductClassifier()
    labels = classifier.get_labels(categories)

    # Compile classifier network and train
    classifier.compile(tokenizer)
    classifier.train(data, labels)
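
usage() and MAX_TEXTS are defined elsewhere in the script. A minimal sketch of the surrounding scaffolding, assuming usage() just prints a hint and exits (the script name and MAX_TEXTS value below are placeholders, not the actual ones):

import sys
import csv

MAX_TEXTS = 100000  # placeholder cap on the number of training rows

def usage():
    print('Usage: python train_classifier.py <data.csv>')  # hypothetical script name
    sys.exit(1)

if __name__ == '__main__':
    main(sys.argv)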
Example #2
def main(argv):
    if len(argv) < 2:
        usage()

    # Fetch data
    texts, tags = [], []
    with open(argv[1], newline='') as f:
        reader = csv.DictReader(
            f,
            fieldnames=["title", "brand", "description", "categories", "tags"])
        count = 0
        for row in reader:
            count += 1
            # Title is the training text; tags are space-separated, and the
            # final split element is dropped (empty when the string ends in a space)
            text = row['title']
            tag_set = row['tags'].split(' ')[:-1]
            texts.append(text)
            tags.append(tag_set)
            if count >= MAX_TEXTS:
                break
    print('Processed %s texts.' % len(texts))

    # Tokenize texts
    tokenizer = WordTokenizer()
    tokenizer.load()
    data = tokenizer.tokenize(texts)

    # Get labels from NER
    ner = ProductNER()
    labels = ner.get_labels(tags)

    # Compile NER network and train
    ner.compile(tokenizer)
    ner.train(data, labels)
Example #3
def main(argv):
    if len(argv) < 3:
        usage()
    model_dir = sys.argv[1]
    data_file = sys.argv[2]

    # Load tokenizer
    tokenizer = WordTokenizer()
    tokenizer.load(os.path.join(model_dir, 'tokenizer'))

    # Load classifier
    classifier = ProductClassifier()
    classifier.load(os.path.join(model_dir, 'classifier'))

    # Load named entity recognizer
    ner = ProductNER()
    ner.load(os.path.join(model_dir, 'ner'))

    with open(data_file, newline='') as f:
        reader = csv.DictReader(f)
        out_path = '.'.join(data_file.split('.')[:-1] + ['processed', 'csv'])
        with open(out_path, 'w', newline='') as outfile:
            writer = csv.DictWriter(
                outfile, fieldnames=reader.fieldnames + ['category', 'brand'])
            writer.writeheader()
            for row in reader:
                processed_row = process(row, tokenizer, classifier, ner)
                print(processed_row)
                writer.writerow(processed_row)
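
The process() helper is not shown here. A plausible sketch, assuming classify() returns a dict of category scores per input (as in Example #6) and assuming a hypothetical ner.tag() that returns per-token labels (that method name and return shape are guesses, not the actual API):

def process(row, tokenizer, classifier, ner):
    # Sketch: annotate one CSV row with a predicted category and brand.
    data = tokenizer.tokenize([row['title']])
    scores = classifier.classify(data)[0]          # {category: score}, as in Example #6
    row['category'] = max(scores, key=scores.get)  # highest-scoring category
    token_labels = ner.tag(data)[0]                # hypothetical: [(token, label), ...]
    row['brand'] = ' '.join(tok for tok, label in token_labels if label == 'BRAND')
    return row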
Example #4
def load_models(model_dir):
    # Load tokenizer
    tokenizer = WordTokenizer()
    tokenizer.load(os.path.join(model_dir, 'tokenizer'))

    # Load classifier
    classifier = ProductClassifier()
    classifier.load(os.path.join(model_dir, 'classifier'))

    # Load named entity recognizer
    ner = ProductNER()
    ner.load(os.path.join(model_dir, 'ner'))

    return tokenizer, classifier, ner
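
A quick usage sketch for load_models(), reusing the classify() call shape from Example #6 (the './models' path mirrors that example):

tokenizer, classifier, ner = load_models('./models')
data = tokenizer.tokenize(["Cambridge wall calendar"])
print(classifier.classify(data)[0])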
Example #5
def main(argv):
    if len(argv) < 2:
        usage()

    # Fetch data
    texts, categories = [], []
    with open(argv[1], newline='') as f:
        reader = csv.DictReader(
            f, fieldnames=["title", "brand", "description", "categories"])
        count = 0
        for row in reader:
            count += 1
            # TODO change here what we train on, and what categories are used
            text, category = row['title'], row['categories'].split(' / ')[0]
            texts.append(text)
            categories.append(category)
            if count >= MAX_TEXTS:
                break
    print('Processed %s texts.' % len(texts))

    # Keep only examples whose category occurs more than 200 times
    category_counts = Counter(categories)
    kept_texts, kept_categories = [], []
    for text, category in zip(texts, categories):
        if category_counts[category] > 200:
            kept_texts.append(text)
            kept_categories.append(category)

    texts = kept_texts
    categories = kept_categories

    print(Counter(categories))

    # Tokenize texts
    tokenizer = WordTokenizer()
    tokenizer.load()
    data = tokenizer.tokenize(texts)

    # Get labels from classifier
    classifier = ProductClassifier()
    labels = classifier.get_labels(categories)

    # Compile classifier network and train
    classifier.compile(tokenizer)
    classifier.train(data, labels)
Example #6
import sys, os, csv
import numpy as np
from operator import itemgetter
from tokenizer import WordTokenizer
from classifier import ProductClassifier
model_dir = './models'

# Load tokenizer
tokenizer = WordTokenizer()
tokenizer.load(os.path.join(model_dir, 'tokenizer'))

# Load classifier
classifier = ProductClassifier()
classifier.load(os.path.join(model_dir, 'classifier'))

data = tokenizer.tokenize(["Cambridge wall calender"])
classScores = classifier.classify(data)[0]
print(classScores)
bestValIdx = np.argmax(classScores.values())
bestVal = classScores.values()[bestValIdx]
bestClass = list(classScores)[bestValIdx]
print(bestVal, bestClass)
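
Since itemgetter is already imported, the same argmax over the score dict can be written without NumPy; a drop-in replacement for the last four lines above:

bestClass, bestVal = max(classScores.items(), key=itemgetter(1))
print(bestVal, bestClass)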