def toDB(file_path, server_name, db_name, table_name, user, password,
         csv_delimiter, port=5432):
    """Load a delimited text file into ``table_name`` with one multi-row INSERT.

    The first record of the file is the header: its fields are handed to
    ``getInsertCommand`` together with ``table_name``, which returns the
    column-list fragment and the per-row placeholder template used to build
    the statement.  Every following record becomes one row; cells whose text
    is exactly ``'None'`` are stored as empty strings.

    Parameters:
        file_path     -- path of the file to load; the working directory is
                         switched to its folder (via ``Util.cd``) while the
                         file is read and the insert runs.
        server_name, db_name, user, password
                      -- forwarded to ``Database``.
        table_name    -- target table.  It is concatenated into the SQL text
                         verbatim, so it must come from a trusted source.
        csv_delimiter -- field separator; an empty string means ``';'``.
        port          -- fourth positional argument of ``Database``
                         (previously hard-coded to 5432; presumably the
                         server port -- confirm against ``Database``).

    Raises whatever the reader, ``getInsertCommand`` or the database layer
    raise; ``StopIteration`` is raised if the file has no header record.
    """
    folder, file_name = os.path.split(file_path)
    # An empty delimiter means "use the default".
    delimiter = csv_delimiter if csv_delimiter != "" else ";"

    with Util.cd(folder):
        with open(file_name, 'rb') as csv_file:
            reader = UnicodeReader(csv_file)
            # Only the first field of each record is used and then split on
            # the delimiter, exactly as the header has always been handled.
            header = next(reader)[0].split(delimiter)
            fields, values = getInsertCommand(header, table_name)
            print('running...')

            rows = []
            for record in reader:
                if not record:
                    # Blank line: nothing to insert.
                    continue
                # Data rows use the same delimiter as the header (they used
                # to be split on a hard-coded ',').
                rows.append(tuple(
                    '' if cell == 'None' else cell
                    for cell in record[0].split(delimiter)))

        db = Database(db_name, user, server_name, port, password)
        try:
            if rows:
                # mogrify() renders each row with proper value escaping; the
                # rendered tuples are joined into a single VALUES list.
                args_str = ','.join(
                    db.cur.mogrify(values, row) for row in rows)
                db.cur.execute("INSERT INTO " + table_name + " " + fields
                               + " VALUES " + args_str)
                db.conn.commit()
        finally:
            # Release the connection even when the insert fails.
            db.conn.close()
    print('done!')
def _parseNameCount(raw):
    """Return ``raw`` as an int (or float), or None if it is not a number."""
    text = raw.strip()
    for convert in (int, float):
        try:
            return convert(text)
        except ValueError:
            pass
    return None


def loadGenderList(gender, country, dataPath, hasHeader):
    """Load the name list ``<country><gender>UTF8.csv`` found in ``dataPath``.

    Parameters:
        gender, country -- used only to build the file name.
        dataPath        -- directory that contains the file.
        hasHeader       -- when true, the first record is discarded.

    Returns a dict mapping each lower-cased name to its count.  The count is
    taken from the second column when it holds a number; counts of records
    that differ only in case are summed.  When the second column is missing
    (or is not a number) the name counts once, no matter how often it
    occurs.  For every name an extra entry keyed by its ``unidecode``
    transliteration is added, unless that key already exists.

    Counts are returned as numbers.  They used to be kept as the raw CSV
    strings, so repeated names concatenated their counts instead of adding
    them.
    """
    path = os.path.join(dataPath, '%s%sUTF8.csv' % (country, gender))
    names = {}

    with open(path, 'rb') as fd:
        reader = UnicodeReader(fd)
        if hasHeader:
            next(reader, None)  # discard the header record

        # Load names as-is, but lower cased.
        for row in reader:
            if not row:
                continue  # blank line
            name = row[0].lower()

            # The second column should be the count (number of babies in
            # some year with this name).
            count = _parseNameCount(row[1]) if len(row) > 1 else None
            if count is None:
                # No frequency information: a name counts once, even if it
                # was already seen modulo case.
                count = 0 if name in names else 1

            names[name] = names.get(name, 0) + count

    # Add versions without diacritics.  Iterate over a snapshot because the
    # dict grows inside the loop.
    for name in list(names):
        dname = unidecode(name)
        if dname not in names:
            names[dname] = names[name]

    return names
def load_acceptance_ratio():
    """Add one SubmissionsCount per (conference, year) found in numSubmissions.csv.

    The file is read from ``DATA_PATH``.  Its header record names one
    conference per column (from the second column on); in every other
    record the first column is the year and the remaining cells are the
    submission counts of the matching header column.  Cells that are not
    integers are ignored.

    For every entry of ``CONFERENCES`` the venue whose acronym equals the
    upper-cased conference acronym is looked up with ``.one()`` (so a
    missing or duplicated venue raises), and a ``SubmissionsCount`` linked
    to it is added to the session for each year that has a count.
    Conferences without any count in the file are printed but add nothing.

    NOTE(review): nothing is committed here; presumably the caller commits
    the session -- confirm.
    """
    session = SessionFactory.get_session()
    print('Loading acceptance ratios:')

    # subm[column header][year] -> number of submissions
    subm = {}
    with open(os.path.join(DATA_PATH, 'numSubmissions.csv'), "rb") as f:
        reader = UnicodeReader(f)
        header = next(reader)
        for row in reader:
            if not row:
                continue  # blank line
            year = int(row[0])
            for conf, val in zip(header[1:], row[1:]):
                try:
                    count = int(val)
                except ValueError:
                    # Empty or non-numeric cell: no data for this year.
                    continue
                subm.setdefault(conf, {})[year] = count

    for acronym, _name, _impact in CONFERENCES:
        key = acronym.upper()
        print(key)
        conference = session.query(Venue).\
            filter_by(acronym=key).\
            one()
        for year, count in subm.get(key, {}).items():
            numSubm = SubmissionsCount(year, count)
            numSubm.venue = conference
            session.add(numSubm)
author = session.query(Person).\ filter_by(name=author_name).\ one() except: # New author; add to database author = Person(author_name) session.add(author) paper.authors.append(author) # --- Update 8/12/2013 --- # Record also the role of PC members (PC Chair or General Chair) f = open(os.path.join(dataPath, 'SE-conf-roles.csv'), 'rb') reader = UnicodeReader(f) header = reader.next() roles = {} def confName(conf): if conf == 'ESEC/FSE': return 'FSE' elif conf == 'CSMR-WCRE': return 'CSMR' else: return conf for row in reader: conf = confName(row[0].strip()).lower() year = int(row[1]) name = '%s %s' % (row[2].strip(), row[3].strip()) try:
# Choose dataset #dataset = 'aliases2' dataset = 'gnome' dataset = 'icsm' # Choose dataset size datasetsize = 'full' #datasetsize = 'sample' nameEmailData = MyDict() if dataset == 'aliases2': f = open(os.path.join(dataPath, "aliases2.csv"), "rb") # f = open(os.path.join(dataPath, "testData2.csv"), "rb") reader = UnicodeReader(f) header = reader.next() for row in reader: try: idx = int(row[0]) name = row[1] email = unspam(row[2]) nameEmailData[idx] = (name, email) except: print row f.close() print 'Using the aliases2.csv data set...' elif dataset == 'gnome': import email.utils import email.header
def _pc_conf_key(conf):
    """Map a conference name from SE-conf-roles.csv to its lower-cased key."""
    aliases = {'ESEC/FSE': 'FSE', 'CSMR-WCRE': 'CSMR'}
    return aliases.get(conf, conf).lower()


def _load_pc_roles():
    """Read SE-conf-roles.csv into ``{(name, conf_key, year): role}``.

    Record layout (header record first):
        Conference;Year;First Name;Last Name;Sex;Role
        CSMR;2013;Anthony ;Cleve;Male;Program Chair
        CSMR;2013;Maura;Cerioli;Female;General Chair

    'Organiser' is stored as 'PC member main track'.  Records whose name
    contains '?' or whose role is 'Challenge Chair' or 'Data Chair' are
    left out.
    """
    roles = {}
    with open(os.path.join(DATA_PATH, 'SE-conf-roles.csv'), 'rb') as f:
        reader = UnicodeReader(f)
        next(reader, None)  # skip the header record
        for row in reader:
            if not row:
                continue  # blank line
            conf = _pc_conf_key(row[0].strip())
            year = int(row[1])
            name = '%s %s' % (row[2].strip(), row[3].strip())
            try:
                # Replace the name by its mapped form when one is known.
                name = nameMap[name]
            except (KeyError, NameError):
                # NOTE(review): nameMap is not defined in the visible part
                # of the file; NameError is tolerated so that behaviour is
                # unchanged if it is missing -- confirm and narrow.
                pass
            role = row[5]
            if role == 'Organiser':
                role = 'PC member main track'
            if '?' in name or role in ('Challenge Chair', 'Data Chair'):
                continue
            roles[(name, conf, year)] = role
    return roles


def load_pc():
    """Load venues, people and PC memberships from the normalised-pc files.

    For every ``(acronym, name, impact)`` in ``CONFERENCES``:

    * the venue with the upper-cased acronym is fetched, or created and
      added to the session when it does not exist yet;
    * ``normalised-pc/<acronym lower-cased>.csv`` under ``DATA_PATH`` is
      read; column 1 is the year and column 3 the member name.  The role
      column of that file is not used: the role comes from
      SE-conf-roles.csv when it lists the (name, conference, year), and is
      'PC member main track' otherwise;
    * the person and the membership are fetched or created likewise;
    * the (name, year) pairs that have a role in SE-conf-roles.csv but did
      not occur in the conference file are printed, sorted.

    The session is committed once, after all conferences are processed.

    Acronyms are normalised before use (upper case for venues, lower case
    for role keys and file names); the raw acronym used to be compared
    against lower-cased role keys and upper-cased venue acronyms, so one of
    the two lookups could never match.
    """
    session = SessionFactory.get_session()

    # --- Update 8/12/2013 ---
    # Record also the role of PC members (PC Chair or General Chair)
    roles = _load_pc_roles()
    # -----------------------

    print('Loading PC members:')
    for acronym, name, impact in CONFERENCES:
        venue_acronym = acronym.upper()
        conf_key = acronym.lower()
        print(venue_acronym)

        # Get the conference object, creating it on first sight.
        conference = session.query(Venue).\
            filter_by(acronym=venue_acronym).\
            first()
        if conference is None:
            conference = Venue(venue_acronym, impact, name, is_conference=True)
            session.add(conference)

        # (name, year) pairs with a recorded role that were not met yet.
        withRole = set((member, year)
                       for (member, conf, year) in roles
                       if conf == conf_key)

        pc_path = os.path.join(DATA_PATH, 'normalised-pc',
                               '%s.csv' % conf_key)
        with open(pc_path, 'rb') as f:
            for row in UnicodeReader(f):
                if not row:
                    continue  # blank line
                year = int(row[0])
                pcMemberName = row[2].strip()

                role_key = (pcMemberName, conf_key, year)
                if role_key in roles:
                    role = roles[role_key]
                    withRole.discard((pcMemberName, year))
                else:
                    role = 'PC member main track'

                if not pcMemberName:
                    continue

                # Get the PC member object, creating it on first sight.
                pcMember = session.query(Person).\
                    filter_by(name=pcMemberName).\
                    first()
                if pcMember is None:
                    pcMember = Person(pcMemberName)
                    session.add(pcMember)

                membership = session.query(PCMembership).\
                    filter_by(year=year, role=role,
                              pcmember=pcMember, venue=conference).\
                    first()
                if membership is None:
                    membership = PCMembership(year, role)
                    membership.pcmember = pcMember
                    membership.venue = conference
                    session.add(membership)

        # Roles listed for this conference that matched no PC file record.
        print(sorted(withRole))

    session.commit()