Exemplo n.º 1
0
def computeSkillIndexes(profile, skills):
    """Compute per-category skill index values for a profile.

    Every entry of ``profile['skills']`` is worth an equal share,
    ``100.0 / len(profile['skills'])``. A skill already listed under one or
    more categories in ``skills`` credits each of those categories. A skill
    found in no list is handed to ``categorizer.categorize``; it is appended
    to the list of every distinct category returned and those categories are
    credited instead.

    Args:
        profile: Mapping whose ``'skills'`` entry holds the skill names.
        skills: Mapping of category -> list of known skill names. It is
            updated in place with newly categorized skills, so a repeated
            skill is matched directly the next time it is seen.

    Returns:
        dict: category -> accumulated index. Empty when the profile has no
        skills.
    """
    profile_skills = profile['skills']
    if not profile_skills:
        return {}

    # Loop-invariant: each skill contributes the same share of 100.
    share = 100.0 / len(profile_skills)
    skillindex = {}
    for item in profile_skills:
        # Categories whose known-skill list already contains this skill.
        matched = [category for category, topics in skills.items()
                   if item in topics]
        if not matched:
            # Unknown skill: ask the categorizer and remember the answer.
            # NOTE: a related-skills lookup via scraper.extractRelatedSkills
            # was sketched here at one point but is not active.
            matched = set(categorizer.categorize(item))
            for cat in matched:
                # setdefault avoids a KeyError when the categorizer names a
                # category that has no list in ``skills`` yet.
                skills.setdefault(cat, []).append(item)
        for cat in matched:
            skillindex[cat] = skillindex.get(cat, 0) + share
    return skillindex
Exemplo n.º 2
0
def doc_categorize(fsort):
    """Accumulate weighted category scores over a collection of keywords.

    Each entry of ``fsort`` is indexed as ``entry[0]`` (the term) and
    ``entry[1]`` (its weight). All terms are printed first; then every term is
    passed to ``categorizer.categorize``, each value it returns is multiplied
    by the weight, and the products are added element-wise to a running total
    that is printed after every keyword.

    Args:
        fsort: Re-iterable sequence of ``(term, weight)`` entries. It is
            walked twice, so a one-shot generator would skip the scoring
            pass.

    Returns:
        list: The accumulated scores; ``[0, 0, 0, 0]`` for empty input.
        NOTE(review): assumes ``categorize`` yields four values per term, to
        match the four-slot accumulator -- confirm against the categorizer.
    """
    cat_prob = [0, 0, 0, 0]
    for term in fsort:
        print(term[0])
    for kw in fsort:
        term_corr = [kw[1] * score for score in categorizer.categorize(kw[0])]
        # list(...) keeps the accumulator a real list on Python 3, where
        # map() is lazy; on Python 2 it is a harmless copy.
        cat_prob = list(map(operator.add, cat_prob, term_corr))
        # Single pre-formatted argument prints the same text on Python 2
        # and Python 3.
        print('cumulative probability : %s' % (cat_prob,))
    return cat_prob
Exemplo n.º 3
0
def run_server(kw_list, log):
    """Refresh the 'general' source probabilities, then fetch URIs per keyword.

    The function:
      1. updates ``source_probs['general']`` through
         ``phase_one.phase1_update`` using the first element of every
         keyword entry,
      2. pickles the whole ``source_probs`` mapping to
         ``source_probabilities.sp``,
      3. for each keyword, calls ``get_uri`` against the general sources and,
         when the lower-cased category returned by ``cat_mod.categorize`` is
         a key of the local source table, against that category's sources,
      4. calls ``print_uri()``.

    Args:
        kw_list: Sequence of keyword entries; ``entry[0]`` is the keyword
            text handed to the categorizer.
        log: Passed through unchanged to ``phase_one.phase1_update``.
    """
    global source_probs
    cat_list = {
        'general': [
            ("Wikipedia", "site:wikipedia.org"),
            ("Citizendium", "site:citizendium.org"),
            ("Britannica", "site:britannica.com"),
        ],
        'technology': [
            ("gizmodo", "gizmodo"),
            ("theverge", "theverge"),
            ("engadget", "engadget"),
        ],
    }

    source_probs['general'] = phase_one.phase1_update(
        source_probs['general'], cat_list['general'],
        [item[0] for item in kw_list], 1, True, log)

    # Context manager guarantees the file is flushed and closed, even if
    # pickling raises part-way through.
    with open("source_probabilities.sp", "wb") as sp_file:
        pickle.dump(source_probs, sp_file)

    for kw in kw_list:
        # str.lower() replaces string.lower(), which exists only on Python 2.
        kw_category = cat_mod.categorize(kw[0]).lower()
        print('%s %s' % (kw, kw_category))
        get_uri(kw, cat_list['general'], source_probs['general'], 2)
        if kw_category in cat_list:
            # NOTE(review): the probabilities are always taken from
            # 'technology', even when kw_category is 'general' -- confirm
            # whether source_probs[kw_category] was intended.
            get_uri(kw, cat_list[kw_category], source_probs['technology'], 2)
    print_uri()
Exemplo n.º 4
0
def process_davivienda_message(message):
    """Extract amount, category and description from a Davivienda message.

    The text of ``message.string`` is split with ``lines``; entries 6, 7 and
    8 (as indexed by ``nth``) supply the amount, the movement type and the
    description line. The description is the trimmed part after the first
    ``':'`` of that line.

    Returns:
        ``(None, None, None)`` when the upper-cased description contains
        ``'PSE'``; otherwise ``(amount, {'type': ..., 'item': ...}, desc)``
        where ``amount`` is the amount line stripped of everything except
        digits and dots.
    """
    print('Davivienda: processing message')
    rows = lines(message.string)
    amount = re.sub(r'[^\d.]', '', nth(rows, 6))
    movement = nth(rows, 7)
    desc = trim(nth(split(nth(rows, 8), ':'), 1))
    desc_upper = upper_case(desc)

    # PSE payments are skipped here.
    if has_substr(desc_upper, 'PSE'):
        print('Davivienda: Ignored PSE payment')
        return None, None, None

    movement_lower = lower_case(movement)
    is_income = (has_substr(movement_lower, 'deposito')
                 or has_substr(movement_lower, 'abono'))
    if is_income:
        kind = moneylover.CATEGORY_TYPE['income']
        if has_substr(desc_upper, 'ACH GNB SUDAMERIS'):
            category_item = 'Salary'
        else:
            category_item = 'Others'
    else:
        kind = moneylover.CATEGORY_TYPE['expense']
        category_item = categorize(desc)

    category = {'type': kind, 'item': category_item}
    return amount, category, desc
Exemplo n.º 5
0
def process_pse_message(message):
    """Extract amount, category and description from a PSE message.

    The last ``span`` found under ``message.table.table`` is split with
    ``lines``. Its first entry, with tag-like ``<...>`` runs removed, is the
    description; its second entry, reduced to digits and commas with commas
    turned into dots, is the amount.

    Returns:
        A 4-tuple ``(amount, expense_category, desc, visa_category)``.
        ``visa_category`` is an income/'Payment' dict when the lower-cased
        description contains ``'credito visa'`` and ``None`` otherwise.
    """
    print('PSE(davivienda): processing message')
    data = lines(last(message.table.table.find_all('span')))
    desc = re.sub(r'<[^<>]*>', '', nth(data))
    digits_and_commas = re.sub(r'[^\d,]', '', nth(data, 1))
    amount = digits_and_commas.replace(',', '.')

    # Optional second entry, only present for Visa credit payments.
    visa_entry = None
    if has_substr(lower_case(desc), 'credito visa'):
        visa_entry = {
            'type': moneylover.CATEGORY_TYPE['income'],
            'item': 'Payment',
        }

    expense_entry = {
        'type': moneylover.CATEGORY_TYPE['expense'],
        'item': categorize(desc),
    }
    return amount, expense_entry, desc, visa_entry
Exemplo n.º 6
0
def computeSkillIndexes(profile, skills):
    """Compute per-category skill index values for a profile.

    Each skill in ``profile['skills']`` carries the same weight,
    ``100.0 / len(profile['skills'])``, and that weight is added to every
    category the skill belongs to. Membership is looked up in ``skills``
    first; a skill present in none of the lists is classified with
    ``categorizer.categorize`` and appended to the list of each distinct
    category returned before being credited.

    Args:
        profile: Mapping whose ``'skills'`` entry holds the skill names.
        skills: Mapping of category -> list of known skill names. Mutated in
            place: newly classified skills are appended to their categories.

    Returns:
        dict: category -> accumulated index. Empty when the profile lists no
        skills.
    """
    candidate_skills = profile['skills']
    skillindex = {}
    if not candidate_skills:
        return skillindex

    # Hoisted out of the loop: the per-skill weight never changes.
    weight = 100.0 / len(candidate_skills)

    for item in candidate_skills:
        categories = [cat for cat, topics in skills.items() if item in topics]
        if not categories:
            # Skill is in none of the known lists: classify it and record it
            # so later occurrences are matched directly.
            # NOTE: a related-skills lookup via scraper.extractRelatedSkills
            # was sketched here at one point but is not active.
            categories = set(categorizer.categorize(item))
            for cat in categories:
                # setdefault avoids a KeyError for a category that has no
                # list in ``skills`` yet.
                skills.setdefault(cat, []).append(item)
        for cat in categories:
            skillindex[cat] = skillindex.get(cat, 0) + weight
    return skillindex
Exemplo n.º 7
0
from categorizer import categorize
from datetime import datetime


def Average(lst):
    """Return the arithmetic mean of the values in ``lst``.

    Raises ZeroDivisionError when ``lst`` is empty.
    """
    total = sum(lst)
    count = len(lst)
    return total / count


if __name__ == '__main__':
    # Sample tags used to exercise and time a single categorize() call.
    tags = [
        'XMR/BTC', 'BTC Options OI held', 'DOGEBTC (TradingView)', 'BTCB',
        'get-more-btc', 'filbtc', 'BTCADA (TradingView)',
        'bitcoin cash prices', 'bitcoin btc/btcp makes bitcoin cash increase',
        'bitcoin cash increase'
    ]
    start_time = datetime.now()
    result = categorize(tags)
    # total_seconds() covers the whole elapsed interval. The previous
    # timedelta.microseconds attribute is only the sub-second component, so
    # any run of one second or longer was under-reported.
    result_time = (datetime.now() - start_time).total_seconds() * 1000
    print(f"Time to categorize {len(tags)} Tags : ", result_time,
          "milliseconds")
    print("Time per Tag : ", result_time / len(tags), "milliseconds")
    print("tags :\n", tags)
    print("categories :\n", result)
Exemplo n.º 8
0
#all in one place
from categorizer import categorize

# Review archives to process, each paired with the prefix passed to
# categorize() for it. Order matches the order the calls are made in.
_REVIEW_ARCHIVES = (
    ("/content/drive/My Drive/Amazon Review Data/Office_Products_5.json.gz",
     'office'),
    ('/content/drive/My Drive/Amazon Review Data/Books_5.json.gz',
     'books'),
    ('/content/drive/My Drive/Amazon Review Data/Cell_Phones_and_Accessories_5.json.gz',
     'cell_phone'),
    ('/content/drive/My Drive/Amazon Review Data/Clothing_Shoes_and_Jewelry_5.json.gz',
     'clothing_shoe_jewellery'),
    ('/content/drive/My Drive/Amazon Review Data/Electronics_5.json.gz',
     'Electronics'),
    ('/content/drive/My Drive/Amazon Review Data/Kindle_Store_5.json.gz',
     'Kindle'),
    ('/content/drive/My Drive/Amazon Review Data/Movies_and_TV_5.json.gz',
     'Movies'),
    ('/content/drive/My Drive/Amazon Review Data/Sports_and_Outdoors_5.json.gz',
     'Sports'),
)

for _archive_path, _archive_prefix in _REVIEW_ARCHIVES:
    categorize(_archive_path, prefix=_archive_prefix)

delivery_data_paths = [
    'drive/My Drive/CSV/office_delivery.csv',
    'drive/My Drive/CSV/books_delivery.csv',
    'drive/My Drive/CSV/cell_phone_delivery.csv',
    'drive/My Drive/CSV/clothing_shoe_jewellery_delivery.csv',
    'drive/My Drive/CSV/Electronics_delivery.csv',
Exemplo n.º 9
0
def process_scotiabank_message(message):
    """Extract amount, category and description from a Scotiabank message.

    The ``p`` elements under ``message.table`` are collected once; entry 3
    (as indexed by ``nth``) supplies the description text and entry 4 the
    amount text.

    Returns:
        ``(amount, {'type': 'EXPENSE', 'item': ...}, desc)`` where ``amount``
        is the amount text stripped of everything except digits and dots and
        the item comes from ``categorize(desc)``.
    """
    print('Scotiabank: processing message')
    paragraphs = message.table.find_all('p')
    desc = nth(paragraphs, 3).string
    raw_amount = nth(paragraphs, 4).string
    amount = re.sub(r'[^\d.]', '', raw_amount)
    category = {'type': 'EXPENSE', 'item': categorize(desc)}
    return amount, category, desc