def wordcount(filename, ent_file, tfidf, text, id):
    """Score the words of a grouped CSV file and write the top 200 entries.

    The first line of ``filename`` is skipped as a header. The remaining CSV
    rows are grouped with ``itertools.groupby`` using ``id`` as the key
    function; each group is turned into one lower-cased, space-joined string
    that is handed to ``TFIDF.process``.

    Args:
        filename: Path of the CSV file to read.
        ent_file: Passed to ``get_entities``; the result is used to build
            the ``TFIDF`` instance.
        tfidf: Path of the output file. One ``word<TAB>score`` line is
            written per entry returned by ``highest(200)``.
        text: Callable taking one CSV row and returning its text.
        id: Callable taking one CSV row and returning its grouping key.

    Note:
        ``groupby`` only merges *consecutive* rows sharing a key, so rows of
        the same id that are not adjacent are processed as separate
        documents.
    """
    # `with` guarantees the input handle is released even if parsing or
    # scoring raises; the original never closed it.
    with open(filename) as resources:
        resources.readline()  # header
        counter = TFIDF(get_entities(ent_file))
        for _key, lines in groupby(csv.reader(resources), id):
            maintext = ' '.join(text(line).lower() for line in lines)
            counter.process(maintext)
    # NOTE(review): the original source was collapsed onto one line; done()
    # is assumed to run once after all groups were processed - confirm.
    counter.done()

    # The output is opened only after scoring succeeded, so a failure above
    # does not truncate an existing file. Closing it flushes the data.
    with open(tfidf, 'w') as out:
        for word, _, _, score in counter.highest(200):
            out.write('%s\t%f\n' % (word, score))
def count(district, type='essays',
          extract_text=lambda line: ' '.join(line[3:10]),
          id=lambda line: line[0]):
    """Write word statistics for the texts of projects posted in 2011.

    Reads ``../data/projects.<district>csv`` to collect the ids (first
    column) of all projects whose ``date_posted`` column starts with
    ``'2011'``. Then reads ``../data/<type>.<district>csv``, groups its rows
    by ``id`` and feeds the lower-cased text of every group whose key is one
    of the collected ids to a ``TFIDF`` instance. Finally writes one
    ``word<TAB>tf<TAB>df<TAB>tfidf`` line per entry returned by
    ``highest(0)`` to ``../data/wc_<type><district>csv``.

    Args:
        district: String interpolated into all three file names.
        type: Base name of the text CSV file and part of the output name.
        extract_text: Callable taking one CSV row and returning its text.
        id: Callable taking one CSV row and returning the project id used
            both as grouping key and for the 2011 membership test.

    Note:
        ``groupby`` only merges *consecutive* rows sharing a key, so rows of
        one project that are not adjacent are processed as separate
        documents.
    """
    # NOTE(review): a second, equivalent `count` definition follows this one
    # in the file and rebinds the name - consider removing one of them.

    # Column layout of the projects CSV; only date_posted is needed here, the
    # remaining names are kept as documentation of the expected schema.
    project_columns = (
        '_projectid', '_teacher_acctid', '_schoolid', 'school_ncesid',
        'school_latitude', 'school_longitude', 'school_city', 'school_state',
        'school_zip', 'school_metro', 'school_district', 'school_county',
        'school_charter', 'school_magnet', 'school_year_round', 'school_nlns',
        'school_kipp', 'school_charter_ready_promise', 'teacher_prefix',
        'teacher_teach_for_america', 'teacher_ny_teaching_fellow',
        'primary_focus_subject', 'primary_focus_area',
        'secondary_focus_subject', 'secondary_focus_area', 'resource_usage',
        'resource_type', 'poverty_level', 'grade_level',
        'vendor_shipping_charges', 'sales_tax', 'payment_processing_charges',
        'fulfillment_labor_materials',
        'total_price_excluding_optional_support',
        'total_price_including_optional_support', 'students_reached',
        'used_by_future_students', 'total_donations', 'num_donors',
        'eligible_double_your_impact_match', 'eligible_almost_home_match',
        'funding_status', 'date_posted', 'date_completed',
        'date_thank_you_packet_mailed', 'date_expiration',
    )
    date_posted = project_columns.index('date_posted')

    # `with` closes each file even when reading or scoring raises.
    with open('../data/projects.%scsv' % district) as projects:
        projects.readline()  # header
        proj_ids = frozenset(
            proj[0] for proj in csv.reader(projects)
            if proj[date_posted].startswith('2011'))

    # NOTE(review): `ent_file` is neither a parameter nor a local of this
    # function; it presumably has to exist at module level - confirm,
    # otherwise this line raises NameError.
    wordcount = TFIDF(get_entities(ent_file))
    with open('../data/%s.%scsv' % (type, district)) as essays:
        essays.readline()  # header
        for proid, lines in groupby(csv.reader(essays), id):
            if proid in proj_ids:
                text = ' '.join(extract_text(line) for line in lines).lower()
                wordcount.process(text)
        wordcount.done()

    # NOTE(review): unlike the input paths, this pattern has no '.' between
    # type and district (wc_<type><district>csv). Left unchanged because
    # downstream readers may depend on the name - confirm it is intended.
    # The original never closed this file; closing flushes the output.
    with open('../data/wc_%s%scsv' % (type, district), 'w') as out:
        # presumably highest(0) means "no limit" - verify against TFIDF.
        for word, tf, df, tfidf in wordcount.highest(0):
            out.write('%s\t%f\t%f\t%f\n' % (word, tf, df, tfidf))
def count(district, type='essays',
          extract_text=lambda line: ' '.join(line[3:10]),
          id=lambda line: line[0]):
    """Write word statistics for the texts of projects posted in 2011.

    Reads ``../data/projects.<district>csv`` to collect the ids (first
    column) of all projects whose ``date_posted`` column starts with
    ``'2011'``. Then reads ``../data/<type>.<district>csv``, groups its rows
    by ``id`` and feeds the lower-cased text of every group whose key is one
    of the collected ids to a ``TFIDF`` instance. Finally writes one
    ``word<TAB>tf<TAB>df<TAB>tfidf`` line per entry returned by
    ``highest(0)`` to ``../data/wc_<type><district>csv``.

    Args:
        district: String interpolated into all three file names.
        type: Base name of the text CSV file and part of the output name.
        extract_text: Callable taking one CSV row and returning its text.
        id: Callable taking one CSV row and returning the project id used
            both as grouping key and for the 2011 membership test.

    Note:
        ``groupby`` only merges *consecutive* rows sharing a key, so rows of
        one project that are not adjacent are processed as separate
        documents.
    """
    # NOTE(review): an equivalent `count` is already defined just before this
    # one in the file; this later definition is the one that stays bound.
    # Consider removing the duplicate.

    # Column layout of the projects CSV; only date_posted is needed here, the
    # remaining names are kept as documentation of the expected schema.
    project_columns = (
        '_projectid', '_teacher_acctid', '_schoolid', 'school_ncesid',
        'school_latitude', 'school_longitude', 'school_city', 'school_state',
        'school_zip', 'school_metro', 'school_district', 'school_county',
        'school_charter', 'school_magnet', 'school_year_round', 'school_nlns',
        'school_kipp', 'school_charter_ready_promise', 'teacher_prefix',
        'teacher_teach_for_america', 'teacher_ny_teaching_fellow',
        'primary_focus_subject', 'primary_focus_area',
        'secondary_focus_subject', 'secondary_focus_area', 'resource_usage',
        'resource_type', 'poverty_level', 'grade_level',
        'vendor_shipping_charges', 'sales_tax', 'payment_processing_charges',
        'fulfillment_labor_materials',
        'total_price_excluding_optional_support',
        'total_price_including_optional_support', 'students_reached',
        'used_by_future_students', 'total_donations', 'num_donors',
        'eligible_double_your_impact_match', 'eligible_almost_home_match',
        'funding_status', 'date_posted', 'date_completed',
        'date_thank_you_packet_mailed', 'date_expiration',
    )
    date_posted = project_columns.index('date_posted')

    # `with` closes each file even when reading or scoring raises.
    with open('../data/projects.%scsv' % district) as projects:
        projects.readline()  # header
        proj_ids = frozenset(
            proj[0] for proj in csv.reader(projects)
            if proj[date_posted].startswith('2011'))

    # NOTE(review): `ent_file` is neither a parameter nor a local of this
    # function; it presumably has to exist at module level - confirm,
    # otherwise this line raises NameError.
    wordcount = TFIDF(get_entities(ent_file))
    with open('../data/%s.%scsv' % (type, district)) as essays:
        essays.readline()  # header
        for proid, lines in groupby(csv.reader(essays), id):
            if proid in proj_ids:
                text = ' '.join(extract_text(line) for line in lines).lower()
                wordcount.process(text)
        wordcount.done()

    # NOTE(review): unlike the input paths, this pattern has no '.' between
    # type and district (wc_<type><district>csv). Left unchanged because
    # downstream readers may depend on the name - confirm it is intended.
    # The original never closed this file; closing flushes the output.
    with open('../data/wc_%s%scsv' % (type, district), 'w') as out:
        # presumably highest(0) means "no limit" - verify against TFIDF.
        for word, tf, df, tfidf in wordcount.highest(0):
            out.write('%s\t%f\t%f\t%f\n' % (word, tf, df, tfidf))