def save(self):
    """Group ingredient amounts by name, store them and dump them to JSON.

    Every ingredient entry of every book in ``self.cook_books`` is read; its
    name goes through ``self.clean`` and its amount through
    ``self.unit_to_en``, ``self.clean`` and ``self.reverse_num``. Amounts
    containing at least one digit are appended to ``self.material[name]``;
    amounts without a digit are not recorded anywhere.

    For each name that ends up with at least one amount a ``Material``
    document is saved whose ``amount`` field holds, per amount string, the
    list of ``{'word', 'flag'}`` pairs produced by ``pseg.cut``.

    Finally ``self.material`` is replaced by the mapping restricted to names
    that have amounts, and that mapping is written as JSON to
    ``CONFIG['DATA_PATH'] + 'materials.json'``.
    """
    has_digit = re.compile(r'.*\d+.*')

    for book in self.cook_books:
        for entry in book['material']:
            name = self.clean(entry['name'].strip())
            # The name is registered even if none of its amounts is usable;
            # names left with an empty list are filtered out further down.
            if name not in self.material:
                self.material[name] = []
            amount = self.clean(
                self.unit_to_en(entry['amount'].strip().lower()))
            amount = self.reverse_num(amount)
            if has_digit.match(amount):
                self.material[name].append(amount)

    kept = {}
    for name in self.material:
        amount_texts = self.material[name]
        if not amount_texts:
            continue
        kept[name] = amount_texts

        document = Material()
        document.name = name
        # One inner list per amount string, one dict per segmented token.
        document.amount = [
            [{'word': token.word, 'flag': token.flag}
             for token in pseg.cut(text)]
            for text in amount_texts
        ]
        document.save()
    self.material = kept

    # Persist the data; the context manager guarantees the handle is closed
    # (and the content flushed) even if serialisation or writing fails.
    with open(CONFIG['DATA_PATH'] + 'materials.json', 'w') as handle:
        handle.write(json.dumps(self.material))
def print_name(row):
    """Decide from one similarity row whether its two materials get merged.

    ``row[0]`` and ``row[1]`` are handed to ``Material.combine`` as the two
    ids and ``row[8]`` as its third argument. ``row[cl + 1]`` and
    ``row[cl + 2]`` are compared as one pair of ratios, ``row[cl + 3]`` and
    ``row[cl + 4]`` as a second pair, and ``row[cl + 5]`` as a combined score;
    ``cl`` is taken from the surrounding scope (presumably the word/char
    ratios computed by the caller -- confirm there).

    Always returns ``None``.
    """
    ratio_a, ratio_b = row[cl + 1], row[cl + 2]
    ratio_c, ratio_d = row[cl + 3], row[cl + 4]

    # All four ratios equal to 1, or a combined score above 3: merge outright.
    if (ratio_a == ratio_b == ratio_c == ratio_d == 1) or row[cl + 5] > 3:
        Material.combine(row[0], row[1], row[8])
        return

    first_pair_ok = ratio_a >= 0.5 or ratio_b >= 0.5
    second_pair_ok = ratio_c >= 0.5 or ratio_d >= 0.5
    if first_pair_ok and second_pair_ok:
        # Both documents must still be retrievable before attempting a
        # dictionary-checked merge.
        first = Material.objects(id=row[0]).only('name')
        if first:
            first = first.get(0)
        second = Material.objects(id=row[1]).only('name')
        if second:
            second = second.get(0)
        if not first or not second:
            return
        Material.combine(row[0], row[1], row[8], check_dict=True)
        return

    # Neither ratio of the first pair reached 0.5, but both of the second did.
    if ratio_c >= 0.5 and ratio_d >= 0.5:
        Material.combine(row[0], row[1], row[8], check_dict=True)
def print_name(row):
    """Decide from one similarity row whether its two materials get merged.

    ``row[0]`` and ``row[1]`` are handed to ``Material.combine`` as the two
    ids and ``row[8]`` as its third argument. ``row[cl + 1]`` and
    ``row[cl + 2]`` are compared as one pair of ratios, ``row[cl + 3]`` and
    ``row[cl + 4]`` as a second pair, and ``row[cl + 5]`` as a combined score;
    ``cl`` is taken from the surrounding scope (presumably the word/char
    ratios computed by the caller -- confirm there).

    Always returns ``None``.
    """
    ratio_a, ratio_b = row[cl + 1], row[cl + 2]
    ratio_c, ratio_d = row[cl + 3], row[cl + 4]

    # All four ratios equal to 1, or a combined score above 3: merge outright.
    if (ratio_a == ratio_b == ratio_c == ratio_d == 1) or row[cl + 5] > 3:
        Material.combine(row[0], row[1], row[8])
        return

    first_pair_ok = ratio_a >= 0.5 or ratio_b >= 0.5
    second_pair_ok = ratio_c >= 0.5 or ratio_d >= 0.5
    if first_pair_ok and second_pair_ok:
        # Both documents must still be retrievable before attempting a
        # dictionary-checked merge.
        first = Material.objects(id=row[0]).only('name')
        if first:
            first = first.get(0)
        second = Material.objects(id=row[1]).only('name')
        if second:
            second = second.get(0)
        if not first or not second:
            return
        Material.combine(row[0], row[1], row[8], check_dict=True)
        return

    # Neither ratio of the first pair reached 0.5, but both of the second did.
    if ratio_c >= 0.5 and ratio_d >= 0.5:
        Material.combine(row[0], row[1], row[8], check_dict=True)
def save(self):
    """Group ingredient amounts by name, store them and dump them to JSON.

    Every ingredient entry of every book in ``self.cook_books`` is read; its
    name goes through ``self.clean`` and its amount through
    ``self.unit_to_en``, ``self.clean`` and ``self.reverse_num``. Amounts
    containing at least one digit are appended to ``self.material[name]``;
    amounts without a digit are not recorded anywhere.

    For each name that ends up with at least one amount a ``Material``
    document is saved whose ``amount`` field holds, per amount string, the
    list of ``{'word', 'flag'}`` pairs produced by ``pseg.cut``.

    Finally ``self.material`` is replaced by the mapping restricted to names
    that have amounts, and that mapping is written as JSON to
    ``CONFIG['DATA_PATH'] + 'materials.json'``.
    """
    has_digit = re.compile(r'.*\d+.*')

    for book in self.cook_books:
        for entry in book['material']:
            name = self.clean(entry['name'].strip())
            # The name is registered even if none of its amounts is usable;
            # names left with an empty list are filtered out further down.
            if name not in self.material:
                self.material[name] = []
            amount = self.clean(
                self.unit_to_en(entry['amount'].strip().lower()))
            amount = self.reverse_num(amount)
            if has_digit.match(amount):
                self.material[name].append(amount)

    kept = {}
    for name in self.material:
        amount_texts = self.material[name]
        if not amount_texts:
            continue
        kept[name] = amount_texts

        document = Material()
        document.name = name
        # One inner list per amount string, one dict per segmented token.
        document.amount = [
            [{'word': token.word, 'flag': token.flag}
             for token in pseg.cut(text)]
            for text in amount_texts
        ]
        document.save()
    self.material = kept

    # Persist the data; the context manager guarantees the handle is closed
    # (and the content flushed) even if serialisation or writing fails.
    with open(CONFIG['DATA_PATH'] + 'materials.json', 'w') as handle:
        handle.write(json.dumps(self.material))
from config import CONFIG from orm.Material import Material import sys reload(sys) sys.setdefaultencoding("utf-8") import json import pandas as pd import re import nltk import jieba import jieba.posseg as pseg data = Material.objects() units = [] _units = {} all_amount = {} known_units = set([u"个", u"种", u"個", u"大个"]) for material in data: amounts = material.amount if material.name not in all_amount: all_amount[material.name] = {"name": material.name, "amount": 0.0, "unit": set(), "count": 0} for amount in amounts: # 对分词成3个以上的,暂时忽略,如"1/4个",会拆成 "1" "/" "4" "个" if len(amount) > 2: continue
def __init__(self):
    """Drop the ``Material`` collection and report it on standard output."""
    Material.drop_collection()
    notice = 'Initiating complete ! Material collection droped !!'
    # Parenthesised single argument: prints the same text whether ``print``
    # is a statement or a function.
    print(notice)
def find_similar():
    """Score material pairs by shared words/characters and merge close ones.

    Reads the module-level ``material_names`` mapping, keyed by material id,
    whose values expose set-like ``'word'`` and ``'char'`` entries. For every
    unordered pair of distinct ids sharing at least one character one row is
    recorded. The rows become a DataFrame, four overlap ratios and their sum
    are appended as extra columns, the frame is sorted by that sum
    (descending) and written to ``res.csv``, and ``Material.combine`` is then
    invoked for each row that passes the thresholds in ``print_name``.

    Returns ``None``. If no pair shares a character, nothing is written and
    nothing is merged.
    """
    name_by_id = {}

    def material_name(mid):
        # One database lookup per id instead of one per pair.
        if mid not in name_by_id:
            name_by_id[mid] = Material.objects(id=mid).only('name')[0].name
        return name_by_id[mid]

    ids = list(material_names)
    similar_grid = []
    for position, mid1 in enumerate(ids):
        first = material_names[mid1]
        # Pairing only with later ids visits each unordered pair exactly once
        # (symmetric pairs skipped) and never pairs an id with itself.
        for mid2 in ids[position + 1:]:
            second = material_names[mid2]
            char_inter = first['char'].intersection(second['char'])
            if len(char_inter) == 0:
                continue
            word_inter = first['word'].intersection(second['word'])
            # Do not reorder: the computations below address these values by
            # their column position.
            similar_grid.append([
                mid1,                   # 0
                mid2,                   # 1
                len(word_inter),        # 2
                len(char_inter),        # 3
                len(first['word']),     # 4
                len(second['word']),    # 5
                len(first['char']),     # 6
                len(second['char']),    # 7
                ''.join(char_inter),    # 8
                material_name(mid1),    # 9
                material_name(mid2),    # 10
            ])

    # Without any candidate pair there is no first row to measure and
    # nothing to score or merge.
    if not similar_grid:
        return

    df = pd.DataFrame(similar_grid)
    cl = len(similar_grid[0]) - 1  # label of the last raw column
    df[cl + 1] = df[2] / df[4]  # m1 word ratio
    df[cl + 2] = df[2] / df[5]  # m2 word ratio
    df[cl + 3] = df[3] / df[6]  # m1 char ratio
    df[cl + 4] = df[3] / df[7]  # m2 char ratio
    df[cl + 5] = (df[cl + 1] + df[cl + 2]) + (df[cl + 3] + df[cl + 4])
    # cl + 5 is column 15 with the eleven raw columns above.
    df = df.sort(columns=cl + 5, ascending=False)
    df.to_csv('res.csv')

    def print_name(row):
        # Apply the merge thresholds to a single scored row.
        word_1, word_2 = row[cl + 1], row[cl + 2]
        char_1, char_2 = row[cl + 3], row[cl + 4]

        # Full overlap on every ratio, or a summed score above 3: merge.
        if (word_1 == word_2 == char_1 == char_2 == 1) or row[cl + 5] > 3:
            Material.combine(row[0], row[1], row[8])
            return

        if (word_1 >= 0.5 or word_2 >= 0.5) and (char_1 >= 0.5 or char_2 >= 0.5):
            # Both documents must still be retrievable before a
            # dictionary-checked merge is attempted.
            m1 = Material.objects(id=row[0]).only('name')
            if m1:
                m1 = m1.get(0)
            m2 = Material.objects(id=row[1]).only('name')
            if m2:
                m2 = m2.get(0)
            if not m1 or not m2:
                return
            Material.combine(row[0], row[1], row[8], check_dict=True)
            return

        if char_1 >= 0.5 and char_2 >= 0.5:
            Material.combine(row[0], row[1], row[8], check_dict=True)

    df.apply(print_name, axis=1)
import os, sys sys.path.append(os.environ['DOUGUO_BASE']) from config import CONFIG from orm.Material import Material import sys reload(sys) sys.setdefaultencoding('utf-8') import pandas as pd import jieba jieba.load_userdict(CONFIG['DICT_PATH'] + "jieba_user.dict") jieba.enable_parallel(4) import jieba.posseg as pseg data = Material.objects().only('name') #[:2] #[{id:,:word:,char:}] material_names = {} def init_grams(): global material_names global data for m in data: word = [] mn = {} mn['word'] = set() mn['char'] = set() name = m.name.split(u'的')[-1] mn['origin'] = name
# Plot how often each material is used: a horizontal bar chart over all
# Material documents with at least 10 recorded amounts, plus a JSON export.
import os,sys
# Make the project packages (config, orm) importable.
sys.path.append(os.environ['DOUGUO_BASE'])
from config import CONFIG
from orm.Material import Material
import json
import pandas as pd
import sys
# Python 2 only: switch the implicit str/unicode codec to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['Droid Sans Fallback']

#_materials = json.loads(open('../data/materials.json','r').read())
_materials = Material.objects()
#materials = [{'name':u'%s'%m,'count':len(_materials[m])} for m in _materials if len(_materials[m])>=10]
materials = [
    {'name': u'%s' % m.name, 'count': len(m.amount)}
    for m in _materials
    if len(m.amount) >= 10
]
# NOTE(review): DataFrame.sort(columns=...) and to_dict(outtype=...) are
# legacy pandas spellings; later releases use sort_values(by=...) and
# to_dict(orient=...) -- confirm the pinned pandas version before changing.
df = pd.DataFrame(materials).sort(columns='count',ascending=False)

# Export as JSON; the context manager closes the handle even on failure.
with open('result/material_count.json','w') as json_file:
    json_file.write(json.dumps(df.to_dict(outtype='records')))

fig = df.plot(kind='barh',legend=False,fontsize=6).get_figure()
# One tick per row, labelled with the material name.
plt.yticks(xrange(df.count()[0]), df['name'])
plt.xlabel(u'使用次数')
plt.ylabel(u'食材')
plt.title(u'全部作品中各食材的使用次数')
fig.savefig('out.png')
# Plot how often each material is used: a horizontal bar chart over all
# Material documents with at least 10 recorded amounts, plus a JSON export.
from config import CONFIG
from orm.Material import Material
import json
import pandas as pd
import sys
# Python 2 only: switch the implicit str/unicode codec to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['Droid Sans Fallback']

#_materials = json.loads(open('../data/materials.json','r').read())
_materials = Material.objects()
#materials = [{'name':u'%s'%m,'count':len(_materials[m])} for m in _materials if len(_materials[m])>=10]
materials = [
    {'name': u'%s' % m.name, 'count': len(m.amount)}
    for m in _materials
    if len(m.amount) >= 10
]
# NOTE(review): DataFrame.sort(columns=...) and to_dict(outtype=...) are
# legacy pandas spellings; later releases use sort_values(by=...) and
# to_dict(orient=...) -- confirm the pinned pandas version before changing.
df = pd.DataFrame(materials).sort(columns='count', ascending=False)

# Export as JSON; the context manager closes the handle even on failure.
with open('result/material_count.json', 'w') as json_file:
    json_file.write(json.dumps(df.to_dict(outtype='records')))

fig = df.plot(kind='barh', legend=False, fontsize=6).get_figure()
# One tick per row, labelled with the material name.
plt.yticks(xrange(df.count()[0]), df['name'])
plt.xlabel(u'使用次数')
plt.ylabel(u'食材')
sys.path.append(os.environ['DOUGUO_BASE']) from config import CONFIG from orm.Material import Material import sys reload(sys) sys.setdefaultencoding('utf-8') import json import pandas as pd import re import nltk import jieba import jieba.posseg as pseg data = Material.objects() units = [] _units = {} all_amount = {} known_units = set([u'个', u'种', u'個', u'大个']) for material in data: amounts = material.amount if material.name not in all_amount: all_amount[material.name] = { 'name': material.name, 'amount': 0.0, 'unit': set(), 'count': 0 }
def find_similar():
    """Score material pairs by shared words/characters and merge close ones.

    Reads the module-level ``material_names`` mapping, keyed by material id,
    whose values expose set-like ``'word'`` and ``'char'`` entries. For every
    unordered pair of distinct ids sharing at least one character one row is
    recorded. The rows become a DataFrame, four overlap ratios and their sum
    are appended as extra columns, the frame is sorted by that sum
    (descending) and written to ``res.csv``, and ``Material.combine`` is then
    invoked for each row that passes the thresholds in ``print_name``.

    Returns ``None``. If no pair shares a character, nothing is written and
    nothing is merged.
    """
    name_by_id = {}

    def material_name(mid):
        # One database lookup per id instead of one per pair.
        if mid not in name_by_id:
            name_by_id[mid] = Material.objects(id=mid).only('name')[0].name
        return name_by_id[mid]

    ids = list(material_names)
    similar_grid = []
    for position, mid1 in enumerate(ids):
        first = material_names[mid1]
        # Pairing only with later ids visits each unordered pair exactly once
        # (symmetric pairs skipped) and never pairs an id with itself.
        for mid2 in ids[position + 1:]:
            second = material_names[mid2]
            char_inter = first['char'].intersection(second['char'])
            if len(char_inter) == 0:
                continue
            word_inter = first['word'].intersection(second['word'])
            # Do not reorder: the computations below address these values by
            # their column position.
            similar_grid.append([
                mid1,                   # 0
                mid2,                   # 1
                len(word_inter),        # 2
                len(char_inter),        # 3
                len(first['word']),     # 4
                len(second['word']),    # 5
                len(first['char']),     # 6
                len(second['char']),    # 7
                ''.join(char_inter),    # 8
                material_name(mid1),    # 9
                material_name(mid2),    # 10
            ])

    # Without any candidate pair there is no first row to measure and
    # nothing to score or merge.
    if not similar_grid:
        return

    df = pd.DataFrame(similar_grid)
    cl = len(similar_grid[0])-1  # label of the last raw column
    df[cl+1] = df[2]/df[4]  # m1 word ratio
    df[cl+2] = df[2]/df[5]  # m2 word ratio
    df[cl+3] = df[3]/df[6]  # m1 char ratio
    df[cl+4] = df[3]/df[7]  # m2 char ratio
    df[cl+5] = (df[cl+1]+df[cl+2]) + (df[cl+3]+df[cl+4])
    # cl+5 is column 15 with the eleven raw columns above.
    df = df.sort(columns=cl+5, ascending=False)
    df.to_csv('res.csv')

    def print_name(row):
        # Apply the merge thresholds to a single scored row.
        word_1, word_2 = row[cl+1], row[cl+2]
        char_1, char_2 = row[cl+3], row[cl+4]

        # Full overlap on every ratio, or a summed score above 3: merge.
        if (word_1 == word_2 == char_1 == char_2 == 1) or row[cl+5] > 3:
            Material.combine(row[0], row[1], row[8])
            return

        if (word_1 >= 0.5 or word_2 >= 0.5) and (char_1 >= 0.5 or char_2 >= 0.5):
            # Both documents must still be retrievable before a
            # dictionary-checked merge is attempted.
            m1 = Material.objects(id=row[0]).only('name')
            if m1:
                m1 = m1.get(0)
            m2 = Material.objects(id=row[1]).only('name')
            if m2:
                m2 = m2.get(0)
            if not m1 or not m2:
                return
            Material.combine(row[0], row[1], row[8], check_dict=True)
            return

        if char_1 >= 0.5 and char_2 >= 0.5:
            Material.combine(row[0], row[1], row[8], check_dict=True)

    df.apply(print_name, axis=1)
import os,sys sys.path.append(os.environ['DOUGUO_BASE']) from config import CONFIG from orm.Material import Material import sys reload(sys) sys.setdefaultencoding('utf-8') import pandas as pd import jieba jieba.load_userdict(CONFIG['DICT_PATH']+"jieba_user.dict") jieba.enable_parallel(4) import jieba.posseg as pseg data = Material.objects().only('name')#[:2] #[{id:,:word:,char:}] material_names = {} def init_grams(): global material_names global data for m in data: word = [] mn = {} mn['word'] = set() mn['char'] = set() name = m.name.split(u'的')[-1] mn['origin'] = name for pair in pseg.cut(name):