Exemplo n.º 1
0
 def save(self):
     """Collect ingredient amounts from the cook books, store them and dump them to JSON.

     For every entry in ``book['material']`` of every book in
     ``self.cook_books`` the cleaned name is registered in ``self.material``;
     the normalised amount is recorded only when it contains a digit.
     Names that end up with at least one amount are saved as ``Material``
     documents (each amount segmented by ``pseg.cut`` into
     ``{'word', 'flag'}`` dicts), ``self.material`` is replaced by the
     filtered mapping, and that mapping is written as JSON to
     ``CONFIG['DATA_PATH'] + 'materials.json'``.
     """
     for book in self.cook_books:
         for materials in book['material']:
             name = self.clean(materials['name'].strip())
             if name not in self.material:
                 self.material[name] = []
             amount = self.clean(self.unit_to_en(materials['amount'].strip().lower()))
             amount = self.reverse_num(amount)
             # Amounts that contain no digit are not recorded.
             if re.match(r'.*\d+.*', amount):
                 self.material[name].append(amount)

     _material = {}
     for n in self.material:
         recorded = self.material[n]
         # Names without any recorded amount are dropped from the result.
         if not recorded:
             continue
         _material[n] = recorded
         m = Material()
         m.name = n
         # One list of {'word', 'flag'} dicts per amount string.
         m.amount = [
             [{'word': token.word, 'flag': token.flag} for token in pseg.cut(amount)]
             for amount in recorded
         ]
         m.save()
     self.material = _material
     # Save the data; the context manager guarantees the file is flushed and closed.
     with open(CONFIG['DATA_PATH'] + 'materials.json', 'w') as out:
         out.write(json.dumps(self.material))
Exemplo n.º 2
0
    def print_name(row):
        """Decide from the ratio columns of ``row`` whether to combine its two materials.

        ``row[0]`` and ``row[1]`` are the two material ids and ``row[8]`` is
        forwarded as the third ``Material.combine`` argument. ``cl`` comes
        from the enclosing scope: ``row[cl + 1]`` .. ``row[cl + 4]`` are
        compared against 1 and 0.5, ``row[cl + 5]`` against 3.
        Always returns None.
        """
        ratio_a, ratio_b, ratio_c, ratio_d = [row[cl + k] for k in (1, 2, 3, 4)]

        # Strongest case: all four ratios equal 1, or the fifth column exceeds 3.
        if all(r == 1 for r in (ratio_a, ratio_b, ratio_c, ratio_d)) or row[cl + 5] > 3:
            Material.combine(row[0], row[1], row[8])
            return

        first_pair_hit = ratio_a >= 0.5 or ratio_b >= 0.5
        second_pair_hit = ratio_c >= 0.5 or ratio_d >= 0.5
        if first_pair_hit and second_pair_hit:
            # Both materials must still be found before they are combined.
            m1 = Material.objects(id=row[0]).only('name')
            if m1:
                m1 = m1.get(0)
            m2 = Material.objects(id=row[1]).only('name')
            if m2:
                m2 = m2.get(0)
            if not m1 or not m2:
                return
            Material.combine(row[0], row[1], row[8], check_dict=True)
            return

        # Last case: both of the last two ratios reach 0.5.
        if ratio_c >= 0.5 and ratio_d >= 0.5:
            Material.combine(row[0], row[1], row[8], check_dict=True)
Exemplo n.º 3
0
    def print_name(row):
        """Decide from the ratio columns of ``row`` whether to combine its two materials.

        ``row[0]`` and ``row[1]`` are the two material ids and ``row[8]`` is
        forwarded as the third ``Material.combine`` argument. ``cl`` comes
        from the enclosing scope: ``row[cl + 1]`` .. ``row[cl + 4]`` are
        compared against 1 and 0.5, ``row[cl + 5]`` against 3.
        Always returns None.
        """
        ratio_a, ratio_b, ratio_c, ratio_d = [row[cl + k] for k in (1, 2, 3, 4)]

        # Strongest case: all four ratios equal 1, or the fifth column exceeds 3.
        if all(r == 1 for r in (ratio_a, ratio_b, ratio_c, ratio_d)) or row[cl + 5] > 3:
            Material.combine(row[0], row[1], row[8])
            return

        first_pair_hit = ratio_a >= 0.5 or ratio_b >= 0.5
        second_pair_hit = ratio_c >= 0.5 or ratio_d >= 0.5
        if first_pair_hit and second_pair_hit:
            # Both materials must still be found before they are combined.
            m1 = Material.objects(id=row[0]).only('name')
            if m1:
                m1 = m1.get(0)
            m2 = Material.objects(id=row[1]).only('name')
            if m2:
                m2 = m2.get(0)
            if not m1 or not m2:
                return
            Material.combine(row[0], row[1], row[8], check_dict=True)
            return

        # Last case: both of the last two ratios reach 0.5.
        if ratio_c >= 0.5 and ratio_d >= 0.5:
            Material.combine(row[0], row[1], row[8], check_dict=True)
Exemplo n.º 4
0
    def save(self):
        """Collect ingredient amounts from the cook books, store them and dump them to JSON.

        For every entry in ``book['material']`` of every book in
        ``self.cook_books`` the cleaned name is registered in
        ``self.material``; the normalised amount is recorded only when it
        contains a digit. Names that end up with at least one amount are
        saved as ``Material`` documents (each amount segmented by
        ``pseg.cut`` into ``{'word', 'flag'}`` dicts), ``self.material`` is
        replaced by the filtered mapping, and that mapping is written as
        JSON to ``CONFIG['DATA_PATH'] + 'materials.json'``.
        """
        for book in self.cook_books:
            for materials in book['material']:
                name = self.clean(materials['name'].strip())
                if name not in self.material:
                    self.material[name] = []
                amount = self.clean(
                    self.unit_to_en(materials['amount'].strip().lower()))
                amount = self.reverse_num(amount)
                # Amounts that contain no digit are not recorded.
                if re.match(r'.*\d+.*', amount):
                    self.material[name].append(amount)

        _material = {}
        for n in self.material:
            recorded = self.material[n]
            # Names without any recorded amount are dropped from the result.
            if not recorded:
                continue
            _material[n] = recorded
            m = Material()
            m.name = n
            # One list of {'word', 'flag'} dicts per amount string.
            m.amount = [
                [{'word': token.word, 'flag': token.flag}
                 for token in pseg.cut(amount)]
                for amount in recorded
            ]
            m.save()
        self.material = _material
        # Save the data; the context manager guarantees the file is flushed and closed.
        with open(CONFIG['DATA_PATH'] + 'materials.json', 'w') as out:
            out.write(json.dumps(self.material))
Exemplo n.º 5
0
from config import CONFIG
from orm.Material import Material

import sys

reload(sys)
sys.setdefaultencoding("utf-8")
import json
import pandas as pd
import re
import nltk
import jieba
import jieba.posseg as pseg

# Query over all Material records (no filter is applied).
data = Material.objects()

units = []
_units = {}
# Per-material accumulator, keyed by material name.
all_amount = {}
known_units = set([u"个", u"种", u"個", u"大个"])
for material in data:
    amounts = material.amount

    # Create the accumulator entry the first time a material name is seen.
    if material.name not in all_amount:
        all_amount[material.name] = {"name": material.name, "amount": 0.0, "unit": set(), "count": 0}

    for amount in amounts:
        # Amounts segmented into 3 or more tokens are ignored for now,
        # e.g. "1/4个" is split into "1" "/" "4" "个".
        if len(amount) > 2:
            continue
Exemplo n.º 6
0
 def __init__(self):
     """Drop the whole Material collection and report it on stdout."""
     Material.drop_collection()
     print('Initiating complete ! Material collection droped !!')
Exemplo n.º 7
0
def find_similar():
    """Score every pair of materials by name overlap and combine the similar ones.

    Reads the module-level ``material_names`` mapping (material id -> dict
    holding ``'word'`` and ``'char'`` sets). For each unordered pair of
    distinct ids that shares at least one character, a row of overlap
    counts and the two material names is built. Ratio columns are then
    derived with pandas, the table is sorted by the summed ratio and written
    to ``res.csv``, and ``Material.combine`` is called for the rows whose
    ratios pass the thresholds in ``print_name``.

    Returns None. When no pair shares a character nothing is written and
    nothing is combined.
    """
    global material_names
    import itertools

    name_cache = {}

    def material_name(mid):
        # Query each material's name once instead of once per pair.
        if mid not in name_cache:
            name_cache[mid] = Material.objects(id=mid).only('name')[0].name
        return name_cache[mid]

    similar_grid = []
    # combinations() yields each unordered pair of distinct ids exactly once,
    # in the same order the nested loop with symmetric-pair skipping produced.
    for mid1, mid2 in itertools.combinations(material_names, 2):
        first = material_names[mid1]
        second = material_names[mid2]

        char_inter = first['char'].intersection(second['char'])
        if len(char_inter) == 0:
            continue

        word_inter = first['word'].intersection(second['word'])

        # Do not reorder these elements: the computations below address
        # them by position.
        res = [
            len(word_inter),
            len(char_inter),
            len(first['word']),
            len(second['word']),
            len(first['char']),
            len(second['char']),
            ''.join(char_inter),
        ]
        res += [material_name(mid1), material_name(mid2)]
        similar_grid.append([mid1, mid2] + res)

    # Nothing to score: avoid indexing into an empty grid below.
    if not similar_grid:
        return

    df = pd.DataFrame(similar_grid)

    cl = len(similar_grid[0]) - 1
    df[cl + 1] = df[2] / df[4]  # m1 word ratio
    df[cl + 2] = df[2] / df[5]  # m2 word ratio
    df[cl + 3] = df[3] / df[6]  # m1 char ratio
    df[cl + 4] = df[3] / df[7]  # m2 char ratio
    df[cl + 5] = (df[cl + 1] + df[cl + 2]) + (df[cl + 3] + df[cl + 4])

    # Sort by the summed ratio; newer pandas only offers sort_values, older
    # releases only the legacy sort(columns=...).
    score_col = cl + 5
    if hasattr(df, 'sort_values'):
        df = df.sort_values(by=score_col, ascending=False)
    else:
        df = df.sort(columns=score_col, ascending=False)

    df.to_csv('res.csv')

    def print_name(row):
        """Combine the two materials of ``row`` when its ratios are high enough."""
        if (row[cl + 1] == row[cl + 2] == row[cl + 3] == row[cl + 4] ==
                1) or row[cl + 5] > 3:
            Material.combine(row[0], row[1], row[8])

        elif (row[cl + 1] >= 0.5
              or row[cl + 2] >= 0.5) and (row[cl + 3] >= 0.5
                                          or row[cl + 4] >= 0.5):
            # Both materials must still be found before they are combined.
            m1 = Material.objects(id=row[0]).only('name')
            if m1:
                m1 = m1.get(0)
            m2 = Material.objects(id=row[1]).only('name')
            if m2:
                m2 = m2.get(0)
            if not m1 or not m2:
                return
            Material.combine(row[0], row[1], row[8], check_dict=True)

        elif row[cl + 3] >= 0.5 and row[cl + 4] >= 0.5:
            Material.combine(row[0], row[1], row[8], check_dict=True)

    df.apply(print_name, axis=1)
Exemplo n.º 8
0
    import os, sys
    sys.path.append(os.environ['DOUGUO_BASE'])

from config import CONFIG
from orm.Material import Material

import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import pandas as pd
import jieba
# Register the project's user dictionary with jieba before any segmentation.
jieba.load_userdict(CONFIG['DICT_PATH'] + "jieba_user.dict")
# Turn on jieba's parallel mode, requesting 4 worker processes.
jieba.enable_parallel(4)
import jieba.posseg as pseg

# Query over all Material records, restricted to the 'name' field.
data = Material.objects().only('name')  #[:2]

# Per-material name features; presumably keyed by material id with
# 'word' / 'char' entries per value - TODO confirm against init_grams().
material_names = {}


def init_grams():
    """Start building per-material name features from the module-level ``data``.

    NOTE(review): as written here the per-material dict ``mn`` is prepared
    but never stored in ``material_names`` or returned - the remainder of
    the function appears to be missing from this excerpt; confirm against
    the full source.
    """
    global material_names
    global data
    for m in data:
        word = []
        mn = {}
        mn['word'] = set()
        mn['char'] = set()
        # Keep only the text after the last u'的' (the whole name if absent).
        name = m.name.split(u'的')[-1]
        mn['origin'] = name
Exemplo n.º 9
0
    import os,sys
    sys.path.append(os.environ['DOUGUO_BASE'])

from config import CONFIG
from orm.Material import Material

import json
import pandas as pd
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import matplotlib.pyplot as plt
# Font used for the plot labels (presumably chosen so the Chinese text renders).
plt.rcParams['font.sans-serif'] = ['Droid Sans Fallback']

# Keep only materials with at least 10 amount entries and record that count per name.
_materials = Material.objects()
materials = [{'name':u'%s'%m.name,'count':len(m.amount)} for m in _materials if len(m.amount)>=10]

# Highest counts first.
# NOTE(review): DataFrame.sort and to_dict(outtype=...) are legacy pandas
# spellings; newer releases use sort_values / orient - confirm the pinned version.
df = pd.DataFrame(materials).sort(columns='count',ascending=False)
# Export as JSON; the context manager guarantees the file is flushed and closed.
with open('result/material_count.json','w') as json_out:
    json_out.write(json.dumps(df.to_dict(outtype='records')))

# Horizontal bar chart with one bar per material, labelled by material name.
fig = df.plot(kind='barh',legend=False,fontsize=6).get_figure()
plt.yticks(xrange(df.count()[0]), df['name'])
plt.xlabel(u'使用次数')
plt.ylabel(u'食材')
plt.title(u'全部作品中各食材的使用次数')
fig.savefig('out.png')
Exemplo n.º 10
0
from config import CONFIG
from orm.Material import Material

import json
import pandas as pd
import sys

reload(sys)
sys.setdefaultencoding('utf-8')

import matplotlib.pyplot as plt

# Font used for the plot labels (presumably chosen so the Chinese text renders).
plt.rcParams['font.sans-serif'] = ['Droid Sans Fallback']

# Keep only materials with at least 10 amount entries and record that count per name.
_materials = Material.objects()
materials = [{
    'name': u'%s' % m.name,
    'count': len(m.amount)
} for m in _materials if len(m.amount) >= 10]

# Highest counts first.
# NOTE(review): DataFrame.sort and to_dict(outtype=...) are legacy pandas
# spellings; newer releases use sort_values / orient - confirm the pinned version.
df = pd.DataFrame(materials).sort(columns='count', ascending=False)
# Export as JSON; the context manager guarantees the file is flushed and closed.
with open('result/material_count.json', 'w') as json_out:
    json_out.write(json.dumps(df.to_dict(outtype='records')))

# Horizontal bar chart with one bar per material, labelled by material name.
fig = df.plot(kind='barh', legend=False, fontsize=6).get_figure()
plt.yticks(xrange(df.count()[0]), df['name'])
plt.xlabel(u'使用次数')
plt.ylabel(u'食材')
Exemplo n.º 11
0
    sys.path.append(os.environ['DOUGUO_BASE'])

from config import CONFIG
from orm.Material import Material

import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import json
import pandas as pd
import re
import nltk
import jieba
import jieba.posseg as pseg

# Query over all Material records (no filter is applied).
data = Material.objects()

units = []
_units = {}
# Per-material accumulator, keyed by material name.
all_amount = {}
known_units = set([u'个', u'种', u'個', u'大个'])
for material in data:
    amounts = material.amount

    # Create the accumulator entry the first time a material name is seen.
    if material.name not in all_amount:
        all_amount[material.name] = {
            'name': material.name,
            'amount': 0.0,
            'unit': set(),
            'count': 0
        }
Exemplo n.º 12
0
def find_similar():
    """Score every pair of materials by name overlap and combine the similar ones.

    Reads the module-level ``material_names`` mapping (material id -> dict
    holding ``'word'`` and ``'char'`` sets). For each unordered pair of
    distinct ids that shares at least one character, a row of overlap
    counts and the two material names is built. Ratio columns are then
    derived with pandas, the table is sorted by the summed ratio and written
    to ``res.csv``, and ``Material.combine`` is called for the rows whose
    ratios pass the thresholds in ``print_name``.

    Returns None. When no pair shares a character nothing is written and
    nothing is combined.
    """
    global material_names
    import itertools

    name_cache = {}

    def material_name(mid):
        # Query each material's name once instead of once per pair.
        if mid not in name_cache:
            name_cache[mid] = Material.objects(id=mid).only('name')[0].name
        return name_cache[mid]

    similar_grid = []
    # combinations() yields each unordered pair of distinct ids exactly once,
    # in the same order the nested loop with symmetric-pair skipping produced.
    for mid1, mid2 in itertools.combinations(material_names, 2):
        first = material_names[mid1]
        second = material_names[mid2]

        char_inter = first['char'].intersection(second['char'])
        if len(char_inter) == 0:
            continue

        word_inter = first['word'].intersection(second['word'])

        # Do not reorder these elements: the computations below address
        # them by position.
        res = [
            len(word_inter),
            len(char_inter),
            len(first['word']),
            len(second['word']),
            len(first['char']),
            len(second['char']),
            ''.join(char_inter),
        ]
        res += [material_name(mid1), material_name(mid2)]
        similar_grid.append([mid1, mid2] + res)

    # Nothing to score: avoid indexing into an empty grid below.
    if not similar_grid:
        return

    df = pd.DataFrame(similar_grid)

    cl = len(similar_grid[0]) - 1
    df[cl + 1] = df[2] / df[4]  # m1 word ratio
    df[cl + 2] = df[2] / df[5]  # m2 word ratio
    df[cl + 3] = df[3] / df[6]  # m1 char ratio
    df[cl + 4] = df[3] / df[7]  # m2 char ratio
    df[cl + 5] = (df[cl + 1] + df[cl + 2]) + (df[cl + 3] + df[cl + 4])

    # Sort by the summed ratio; newer pandas only offers sort_values, older
    # releases only the legacy sort(columns=...).
    score_col = cl + 5
    if hasattr(df, 'sort_values'):
        df = df.sort_values(by=score_col, ascending=False)
    else:
        df = df.sort(columns=score_col, ascending=False)

    df.to_csv('res.csv')

    def print_name(row):
        """Combine the two materials of ``row`` when its ratios are high enough."""
        if (row[cl + 1] == row[cl + 2] == row[cl + 3] == row[cl + 4] ==
                1) or row[cl + 5] > 3:
            Material.combine(row[0], row[1], row[8])

        elif (row[cl + 1] >= 0.5
              or row[cl + 2] >= 0.5) and (row[cl + 3] >= 0.5
                                          or row[cl + 4] >= 0.5):
            # Both materials must still be found before they are combined.
            m1 = Material.objects(id=row[0]).only('name')
            if m1:
                m1 = m1.get(0)
            m2 = Material.objects(id=row[1]).only('name')
            if m2:
                m2 = m2.get(0)
            if not m1 or not m2:
                return
            Material.combine(row[0], row[1], row[8], check_dict=True)

        elif row[cl + 3] >= 0.5 and row[cl + 4] >= 0.5:
            Material.combine(row[0], row[1], row[8], check_dict=True)

    df.apply(print_name, axis=1)
Exemplo n.º 13
0
    import os,sys
    sys.path.append(os.environ['DOUGUO_BASE'])

from config import CONFIG
from orm.Material import Material

import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import pandas as pd
import jieba
# Register the project's user dictionary with jieba before any segmentation.
jieba.load_userdict(CONFIG['DICT_PATH']+"jieba_user.dict") 
# Turn on jieba's parallel mode, requesting 4 worker processes.
jieba.enable_parallel(4)
import jieba.posseg as pseg

# Query over all Material records, restricted to the 'name' field.
data = Material.objects().only('name')#[:2]

# Per-material name features; presumably keyed by material id with
# 'word' / 'char' entries per value - TODO confirm against init_grams().
material_names = {}

def init_grams():
    global material_names 
    global data
    for m in data:
        word = []
        mn = {}
        mn['word'] = set()
        mn['char'] = set()
        name = m.name.split(u'的')[-1]
        mn['origin'] = name
        for pair in pseg.cut(name):
Exemplo n.º 14
0
 def __init__(self):
     """Drop the whole Material collection and report it on stdout."""
     Material.drop_collection()
     print('Initiating complete ! Material collection droped !!')