示例#1
0
文件: driver.py 项目: gpanda/infzm
 def write_articles(self):
     """Write every article of every section into a per-section directory.

     For each entry of ``self.contents`` whose value has an ``'articles'``
     mapping, a directory named after the entry's key is created with
     ``create_dir`` and made the current working directory.  Each article
     is then written through ``self.write_page`` as ``<title>.htm``, with a
     trailing newline appended to the article text.

     The previous working directory is restored after each section, even
     when writing a page raises.
     """
     for k, v in self.contents.iteritems():
         if 'articles' not in v:
             continue
         create_dir(k)
         cwd = os.getcwdu()
         os.chdir(k)
         try:
             # Only the article payloads are needed; the URL keys are unused.
             for article in v['articles'].itervalues():
                 words = article['article'] + u'\n'
                 self.write_page(words, article['title'] + ".htm")
         finally:
             # Always step back out so a failing page cannot leave the
             # process stranded inside the section directory.
             os.chdir(cwd)
示例#2
0
文件: driver.py 项目: gpanda/infzm
    def write(self):
        """Write the complete output into a directory named ``self.date``.

        The directory is created with ``create_dir`` and made the current
        working directory while the cover, the top-news summary, the
        contents, the top news and the articles are written, in that order.

        The previous working directory is restored afterwards, even when
        one of the writing steps raises.
        """
        create_dir(self.date)
        cwd = os.getcwdu()
        os.chdir(self.date)
        try:
            self.write_cover()
            self.write_topnews_s()
            self.write_contents()
            self.write_topnews()
            self.write_articles()
        finally:
            # Restore the caller's working directory no matter what happened.
            os.chdir(cwd)
示例#3
0
 def setup_epub_root(self):
     """Create the EPUB root directory and fill it with its static assets.

     Template files are taken from ``$PY_ROOT/epub_templates`` (the
     ``PY_ROOT`` environment variable must be set, otherwise ``KeyError``
     is raised).  The masthead image and cover records of ``self.book``
     get their ``path`` set to their bare ``name``.  The ``META-INF`` tree
     is copied only if it does not exist yet, and the cover image is
     downloaded into the EPUB root.
     """
     create_dir(self.epub_root)
     template_root = os.environ['PY_ROOT'] + os.sep + "epub_templates"

     def template(name):
         # Location of a single entry inside the template directory.
         return template_root + os.sep + name

     copy(template("headerLogo.png"), self.epub_root)
     self.book.master_head_image.path = self.book.master_head_image.name

     for asset in ("mimetype", "page_styles.css", "stylesheet.css",
                   "titlepage.xhtml"):
         copy(template(asset), self.epub_root)

     meta_inf = self.epub_root + os.sep + "META-INF"
     if not os.path.exists(meta_inf):
         copytree(template("META-INF"), meta_inf)

     # Resolve the download target before touching the cover record.
     cover_target = self.epub_root + os.sep + self.book.cover.name
     self.book.cover.path = self.book.cover.name
     download_image(self.book.cover.url, cover_target)
示例#4
0
 def edit_ratio_histogram(self):
     """Write, for every column in ``self.revert``, a histogram of edit ratios.

     Rows with ``namespace == 0`` (logged as "articles") are indexed by
     ``page_id`` and rows with ``namespace == 1`` (logged as "talk") by
     ``linked_id``; the ratio is the namespace-0 value divided by the
     index-aligned namespace-1 value, truncated to ``int``.  Only ratios
     ``>= 0`` are counted.  One CSV per column is written to
     ``<f_out>/<lang>_<column>.csv`` with the ratio as index
     (``edit_ratio``) and the number of pages as value (``pages``).
     """
     # NOTE(review): the log text is the same as in edit_histogram although
     # this method builds the ratio histogram.
     basic.log('creating edit histogram %s' % self.lang)
     # presumably create_dir returns the directory path; it is used as the
     # output prefix below -- verify against basic.create_dir
     f_out = basic.create_dir('results/ratio_histograms')
     df = pd.read_csv(self.db_path)
     df.page_id = df.page_id.astype(float)
     # NOTE(review): an element-wise `!= None` comparison probably does not
     # remove NaN/missing values; confirm whether `.notnull()` was intended.
     df = df.loc[df['linked_id'] != None]
     df.linked_id = df.linked_id.astype(float)
     df = self.drop_dups(df)
     # Logs how many page_id values are still duplicated at this point; the
     # rows themselves are removed on the next line.
     # NOTE(review): Index.get_duplicates is a legacy pandas API -- confirm
     # the pandas version this file targets.
     basic.log('dropped %s duplicates' % len(df.set_index('page_id',drop=False).index.get_duplicates()))
     # keep=False discards every row whose page_id occurs more than once.
     df = df.drop_duplicates(subset='page_id',keep=False)
     if self.drop1:
         df = df.loc[(df['len'] > 1)]
     for r in self.revert:
         basic.log('%s %s' % (self.lang,r))
         basic.log('%s pages' % len(df))
         # Index both sides by the id they share so the division below
         # aligns each namespace-0 row with the namespace-1 row linked to it.
         n0 = df.loc[(df['namespace'] == 0)].set_index('page_id',drop=False)
         n1 = df.loc[(df['namespace'] == 1)].set_index('linked_id',drop=False)
         basic.log('%s articles' % len(n0))
         basic.log('%s talk' % len(n1))
         # fill_value=-1 stands in for a side that is missing from the other
         # index; presumably this makes unmatched pages come out negative so
         # the `>= 0` filter below drops them -- TODO confirm.
         # NOTE(review): a zero divisor would give inf/NaN before the int
         # cast; confirm the data cannot contain zeros here.
         ratio = n0[r].divide(n1[r],axis='index',fill_value=-1).to_frame()
         ratio.columns = ['ratio']
         ratio.ratio = ratio.ratio.astype(int)
         # Attach the ratio to the namespace-0 rows (join on the page_id index).
         ratio = n0.join(ratio).set_index('page_id')
         ratio = ratio.loc[ratio['ratio'] >= 0]
         basic.log('%s ratios' % len(ratio))
         # Histogram: number of pages per distinct integer ratio, ascending.
         result = ratio['ratio'].value_counts().to_frame()
         result = result.sort_index(ascending=True)
         result.columns = ['pages']
         result.to_csv('%s/%s_%s.csv' % (f_out,self.lang,r),encoding='utf-8',index_label='edit_ratio')
示例#5
0
 def edit_quantiles(self,q=.01,quantile_range=False,v=False,write=True):
     """Compute quantiles and the mean of every revert column per namespace.

     Parameters:
         q: step between quantile levels; the levels evaluated are
             ``np.arange(q, 1 + q, q)``.
         quantile_range: unused, kept for interface compatibility.
         v: unused, kept for interface compatibility.
         write: when true, each quantile table is also written to
             ``<f_out>/<lang>_<namespace>_<column>.csv`` with two extra
             rows: ``mean_quantile`` (the highest quantile level whose value
             is below ``int(mean + 1)``) and ``mean_value`` (the mean).

     Returns:
         ``results[namespace][column]`` -> ``{'quantiles': DataFrame,
         'mean': value}``.  The stored table does not contain the two
         extra rows added for the CSV.

     The namespace name ``'at'`` selects all rows; any other name selects
     the rows whose ``namespace`` column equals the name's position in
     ``self.namespace``.
     """
     basic.log('creating edit quantiles %s' % self.lang)
     f_out = basic.create_dir('results/quantiles')
     df = pd.read_csv(self.db_path)
     df = self.drop_dups(df)
     df.page_id = df.page_id.astype(int)
     if self.drop1:
         df = df.loc[(df['len'] > 1)]
     levels = np.arange(q,1+q,q)
     results = defaultdict(dict)
     for n in self.namespace:
         results[n] = defaultdict(dict)
         for r in self.revert:
             basic.log('%s %s %s' % (self.lang,n,r))
             # Select the values once; both the quantiles and the mean are
             # taken from the same selection.
             if n == 'at':
                 edits = df[r]
             else:
                 edits = df.loc[(df['namespace'] == self.namespace.index(n)),r]
             result = edits.quantile(q=levels).to_frame()
             mean = edits.mean()
             column = '%s_%s_%s' % (self.lang,n,r)
             result.columns = [column]
             results[n][r] = {'quantiles':result,'mean':mean}
             if write:
                 below_mean = result.loc[(result[column] < int(mean+1))]
                 mean_quantile = DataFrame({column:below_mean.tail(1).index.values},index=['mean_quantile'])
                 mean_value = DataFrame({column:mean},index=['mean_value'])
                 # pd.concat instead of DataFrame.append: append was removed
                 # in pandas 2.0, concat behaves the same on older versions.
                 table = pd.concat([result,mean_quantile,mean_value])
                 # NOTE: the 'qauntiles' spelling is part of the emitted CSV
                 # header and is kept so existing readers keep working.
                 table.to_csv('%s/%s_%s_%s.csv' % (f_out,self.lang,n,r),encoding='utf-8',index_label='qauntiles')
     return results
示例#6
0
 def edit_histogram(self,plot=True,v=False):
     """Write a histogram of every revert column for every namespace.

     For each namespace name in ``self.namespace`` and each column in
     ``self.revert`` the distinct values of the column are counted and
     written, sorted by value, to
     ``<f_out>/<lang>_<namespace>_<column>.csv`` with the value as index
     (``edits``) and the count as ``articles``.

     The namespace name ``'at'`` selects all rows; any other name selects
     the rows whose ``namespace`` column equals the name's position in
     ``self.namespace``.

     Parameters:
         plot: unused, kept for interface compatibility.
         v: unused, kept for interface compatibility.
     """
     basic.log('creating edit histogram %s' % self.lang)
     f_out = basic.create_dir('results/histograms')
     df = pd.read_csv(self.db_path)
     df = self.drop_dups(df)
     if self.drop1:
         df = df.loc[(df['len'] > 1)]
     for n in self.namespace:
         for r in self.revert:
             basic.log('%s %s %s' % (self.lang,n,r))
             if n == 'at':
                 edits = df[r]
             else:
                 edits = df.loc[(df['namespace'] == self.namespace.index(n)),r]
             # value_counts() yields a Series; it has to become a frame
             # before `columns` can name the count column (assigning
             # `columns` on a Series has no effect on the CSV output).
             result = edits.value_counts().to_frame()
             result = result.sort_index(ascending=True)
             result.columns = ['articles']
             result.to_csv('%s/%s_%s_%s.csv' % (f_out,self.lang,n,r),encoding='utf-8',index_label='edits')
示例#7
0
 def edit_statistics(self,statistics,v=False):
     """Compute summary statistics of the edit counts and write one CSV row.

     Parameters:
         statistics: iterable of statistic names.  ``'total'``, ``'var'``,
             ``'std'``, ``'mean'`` and ``'median'`` are computed on the
             selected column.  ``'total_ratio'`` (only for namespace
             ``'t'``) and ``'mean_ratio'`` (only for the last namespace)
             divide the ``'a'`` value by the ``'t'`` value, so ``'total'``
             / ``'mean'`` must already have been computed for both.
             ``'missing_talk'`` (only for the last namespace) counts the
             rows whose ``linked_id`` equals ``'NONE'``.
         v: unused, kept for interface compatibility.

     Returns:
         ``result[lang][namespace][column][statistic]`` -> value.

     The CSV (``edits_<lang>.csv`` or ``edits_drop1_<lang>.csv`` when
     ``self.drop1`` is set) has a header ``"lang","<ns>_<stat>_<col>",...``
     and one data row.  A statistic that does not apply to a namespace, or
     an unknown statistic name, yields an empty dict both in the result and
     in the CSV cell (written as ``{}``), which keeps the row aligned with
     the header.

     The namespace name ``'at'`` selects all rows; any other name selects
     the rows whose ``namespace`` column equals the name's position in
     ``self.namespace``.
     """
     f_out = basic.create_dir('results/basic_stats')
     if self.drop1:
         path = '%s/edits_drop1_%s.csv' % (f_out,self.lang)
     else:
         path = '%s/edits_%s.csv' % (f_out,self.lang)
     # Statistic name -> name of the pandas Series method that computes it.
     aggregations = {'total':'sum','var':'var','std':'std','mean':'mean','median':'median'}
     header = ['"lang"']
     header.extend('"%s_%s_%s"' % (n,s,r)
                   for n in self.namespace
                   for r in self.revert
                   for s in statistics)
     result = defaultdict(dict)
     result[self.lang] = defaultdict(dict)
     stats = result[self.lang]
     # `with` guarantees the file is closed even if reading the database or
     # computing a statistic raises.
     with open(path,'w') as f:
         f.write(','.join(header) + '\n')
         f.write('"%s"' % self.lang)
         df = pd.read_csv(self.db_path)
         df = self.drop_dups(df)
         if self.drop1:
             df = df.loc[(df['len'] > 1)]
         last = len(self.namespace) - 1
         for n in self.namespace:
             stats[n] = defaultdict(dict)
             is_last = self.namespace.index(n) == last
             for r in self.revert:
                 stats[n][r] = defaultdict(dict)
                 cell = stats[n][r]
                 basic.log('%s %s %s' % (self.lang,n,r))
                 edits = None  # selected lazily, then reused for every statistic
                 for s in statistics:
                     if s in aggregations:
                         if edits is None:
                             if n == 'at':
                                 edits = df[r]
                             else:
                                 edits = df.loc[(df['namespace'] == self.namespace.index(n)),r]
                         cell[s] = getattr(edits,aggregations[s])()
                     elif s == 'total_ratio':
                         if n == 't':
                             cell[s] = float(stats['a'][r]['total'])/stats['t'][r]['total']
                     elif s == 'mean_ratio':
                         if is_last:
                             cell[s] = float(stats['a'][r]['mean'])/stats['t'][r]['mean']
                     elif s == 'missing_talk':
                         if is_last:
                             cell[s] = len(df.loc[(df['linked_id'] == 'NONE')])
                     # Looking up a statistic that was not set creates the
                     # empty-dict placeholder described in the docstring.
                     f.write(',%s' % cell[s])
         f.write('\n')
     return result