def loadGenderList(gender, country, dataPath, hasHeader):
    fd = open(os.path.join(dataPath, '%s%sUTF8.csv' % (country, gender)), 'rb')
    reader = UnicodeReader(fd)
    names = {}
    if hasHeader:
        unused_header = reader.next()
    # Load names as-is, but lower-cased
    for row in reader:
        name = row[0].lower()
        try:
            # The second column should be the count (number of babies in
            # some year with this name); it arrives as a string
            count = int(row[1])
        except (IndexError, ValueError):
            # If the second column is missing or non-numeric, default to 1
            count = 1
        if name not in names:
            # Record each name only once, modulo case (there is no
            # reliable frequency information across duplicates anyway)
            names[name] = count
    fd.close()
    # Add versions without diacritics
    for name in names.keys():
        dname = unidecode(name)
        if dname not in names:
            names[dname] = names[name]
    return names
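# Usage sketch (hypothetical; not part of the original module). It assumes the
# data directory holds files named like 'USmaleUTF8.csv' / 'USfemaleUTF8.csv',
# i.e. the '%s%sUTF8.csv' % (country, gender) pattern the loader expects.
def _demoAmbiguousNames(dataPath):
    maleNames = loadGenderList('male', 'US', dataPath, True)
    femaleNames = loadGenderList('female', 'US', dataPath, True)
    # Names present in both lists cannot be resolved by list lookup alone
    return set(maleNames) & set(femaleNames)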
def toDB(file_path, server_name, db_name, table_name, user, password, csv_delimiter):
    folder, file_name = os.path.split(file_path)
    with Util.cd(folder):
        reader = UnicodeReader(open(file_name, 'rb'))
        db = Database(db_name, user, server_name, 5432, password)
        csv_delimiter = csv_delimiter if csv_delimiter != "" else ";"
        # The first row holds the column names; build the INSERT template
        fields, values = getInsertCommand(reader.next()[0].split(csv_delimiter), table_name)
        print 'running...'
        v = []
        while True:
            try:
                # Split each data row on the same delimiter as the header
                row = reader.next()[0].split(csv_delimiter)
            except StopIteration:
                break
            # Replace the 'None' placeholder with an empty string
            for i in range(len(row)):
                if row[i] == 'None':
                    row[i] = ''
            v.append(tuple(row))
        # Build a single multi-row INSERT; mogrify escapes each tuple safely
        args_str = ','.join(db.cur.mogrify(values, x) for x in v)
        db.cur.execute("INSERT INTO " + table_name + " " + fields + " VALUES " + args_str)
        # db.cur.executemany(insert_command, tuple(v))
        db.conn.commit()
        db.conn.close()
        print 'done!'
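# Call sketch (hypothetical; every argument is a placeholder, not a value from
# the original project). toDB assumes the first CSV row names the target
# table's columns and that getInsertCommand turns them into the field list
# and mogrify template used above.
def _demoToDB():
    toDB('/tmp/users.csv', 'localhost', 'sodata', 'users',
         'postgres', 'secret', ';')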
def load_acceptance_ratio():
    session = SessionFactory.get_session()
    print 'Loading acceptance ratios:'
    f = open(os.path.join(DATA_PATH, 'numSubmissions.csv'), 'rb')
    reader = UnicodeReader(f)
    header = reader.next()
    subm = {}
    for row in reader:
        year = int(row[0])
        for idx, val in enumerate(row[1:]):
            conf = header[idx + 1]
            try:
                count = int(val)
                if conf not in subm:
                    subm[conf] = {}
                subm[conf][year] = count
            except ValueError:
                # Empty or non-numeric cell: no submission count this year
                pass
    for acronym, name, impact in CONFERENCES:
        print acronym.upper()
        conference = session.query(Venue).\
            filter_by(acronym=acronym.upper()).\
            one()
        for (year, count) in subm[acronym.upper()].items():
            numSubm = SubmissionsCount(year, count)
            numSubm.venue = conference
            session.add(numSubm)
    # Persist the submission counts
    session.commit()
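# Shape illustration (hypothetical counts, not real data): numSubmissions.csv
# is laid out wide, one column per conference, and the loop above pivots it
# into a per-conference, per-year dictionary:
#
#   year;icse;fse
#   2012;450;210   --->   subm == {'icse': {2012: 450, 2013: 461},
#   2013;461;251                   'fse':  {2012: 210, 2013: 251}}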
def __init__(self, f):
    reader = UnicodeReader(f)
    # Index every row by its position in the file
    self.content = {}
    i = 0
    for row in reader:
        self.content[i] = list(row)
        i += 1
    # Shuffle the keys once so rows can be served in random order
    self.randomKeys = self.content.keys()
    random.shuffle(self.randomKeys)
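# Self-contained sketch (an assumption about how the class above is consumed,
# since only __init__ is shown): storing rows under integer keys and popping
# from a shuffled key list yields the rows in random order without copying
# or re-sorting the underlying data.
import random

class _RandomOrderDemo(object):  # hypothetical demo class
    def __init__(self, rows):
        self.content = dict(enumerate(rows))
        self.randomKeys = self.content.keys()  # a plain list in Python 2
        random.shuffle(self.randomKeys)

    def next(self):
        if not self.randomKeys:
            raise StopIteration
        return self.content[self.randomKeys.pop()]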
idx = 0
for row in f.readlines():
    # Leave the file preamble (first 362 lines) untouched; rewrite the rest
    if idx >= 362:
        nrow = rewrite(row)
    else:
        nrow = row
    idx += 1
    g.write(nrow)
f.close()
g.close()

# Read the data into a Python dictionary
f = open(os.path.join(dataPath, 'nameLists', 'nam_dict2.txt'), 'rb')
reader = UnicodeReader(f)
genderDict = {}
idx = 0
shortNames = []
for row in reader:
    if idx > 361:
        text = row[0]
        mf = text[:2].strip()   # M, 1M, ?M, F, 1F, ?F, ?, =
                                # = <short_name> <long_name>
        name = text[2:29].lower().strip()
        sortingFlag = text[29]  # +, -; ignore +
        frequencies = text[30:-2]
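# Slicing walkthrough (field map inferred from the code above; nam_dict2.txt
# uses a fixed-width layout). For a data line `text`:
#   text[:2].strip()           -> gender flag: M, 1M, ?M, F, 1F, ?F, ?, or =
#   text[2:29].lower().strip() -> the name itself, space-padded to column 29
#   text[29]                   -> sorting flag, '+' or '-'
#   text[30:-2]                -> per-country frequency digits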
def main():
    data = "../resources/SOusers-Mar13.csv"  # File containing SO user dump
    results = "../resources/features3.csv"   # File where features will be stored
    picPath = "../resources/SOpictures/"     # Directory where pictures will be downloaded
    fr = open(os.path.join(data), 'rb')
    fw = open(os.path.join(results), 'ab')
    if _RANDOM:
        reader = RandomReader(fr)
    else:
        reader = UnicodeReader(fr)
    writer = UnicodeWriter(fw)
    queue = Queue()
    if _FACE:
        faceDetector = FaceDetector()
    threads = []
    SOhashes = {}  # Dictionary of user's hashes
    # Use multiple threads to download and get information
    for i in xrange(10):
        threads.append(Downloader(queue))
        threads[-1].start()
    idx = 0
    size = 4500  # Number of subjects
    for row in reader:
        if idx < size:
            so_uid = row[0]
            so_hash = row[2]
            if so_hash not in SOhashes:
                SOhashes[so_hash] = so_uid
                if not isDefaultGravatarPic(so_hash):
                    data = [so_uid]
                    if _VISUAL_FEATURES:
                        # Download picture
                        filepath = os.path.join('%s%d.jpg' % (picPath, int(so_uid)))
                        if not os.path.isfile(filepath):
                            queue.put(('http://www.gravatar.com/avatar/%s' % so_hash, filepath))
                            time.sleep(2)
                        # Load picture
                        pic = picUtils.loadPicture(filepath)
                        if _FACE:
                            if faceDetector.isFrontFace(pic) or faceDetector.isProfileFace(pic):
                                data.append(str(True))
                            else:
                                data.append(str(False))
                        if _MOST_COMMON_COLORS:
                            _, f1, _, f2 = picUtils.mostCommonColor(pic)
                            data.append(str(f1 + f2))
                        if _NBCOLORS:
                            data.append(str(picUtils.getNbOfColors(pic)))
                        if _FARTHEST_NEIGHBOR:
                            F1 = picUtils.farthestNeighborMetric(pic, 10)
                            F2 = picUtils.farthestNeighborMetric(pic, 200)
                            data.append(str(F1))
                            data.append(str(F2))
                            if F1 != 0:
                                data.append(str(F2 / F1))
                            else:
                                data.append('?')
                        if _AVERAGE_SATURATION:
                            data.append(str(picUtils.avgSaturation(pic)))
                        if _THRESHOLD_BRIGHTNESS:
                            data.append(str(picUtils.threBrightness(pic, 0.2)))
                    if _GOOGLE:
                        gi = GoogleImage('http://www.gravatar.com/avatar/%s' % so_hash)
                        bestGuess = gi.getBestGuess()
                        if bestGuess:
                            bestGuess = bestGuess.encode('utf8')
                            data.append(bestGuess)
                            if _WIKIPEDIA:
                                gs = GoogleSearch("%s site:en.wikipedia.org" % bestGuess)
                                wikiTitlePage = gs.getWikipediaTitlePage()
                                if wikiTitlePage:
                                    wiki = Wikipedia(wikiTitlePage)
                                    wiki.categoryGraph(4)
                                    nbCats = 10
                                    i = 0
                                    cats = wiki.sortGraphByDegree()
                                    while i < nbCats and i < len(cats):
                                        data.append(str(cats[i]))
                                        i += 1
                    # Write all information collected in the csv file
                    try:
                        print data
                        writer.writerow(data)
                        idx += 1
                    except Exception:
                        # Keep crawling even if one row fails to serialise
                        print "Error with data"
        else:
            break
    fr.close()
    fw.close()
    # Download finished; signal the worker threads to stop
    for i in xrange(10):
        queue.put((None, None))
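# Entry-point guard (assumed; the original fragment does not show how main()
# is invoked).
if __name__ == '__main__':
    main()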
Session = sessionmaker(bind=engine)
session = Session()
print 'Loading papers:'
for conferenceName in conferences:
    acronym = conferenceName
    print acronym.upper()
    # Create a new conference object
    conference = Conference(acronym.upper(), impact[acronym.upper()])
    session.add(conference)
    # Load the data into a csv reader
    f = open(os.path.join(dataPath, 'normalised-papers', '%s.csv' % conferenceName), 'rb')
    reader = UnicodeReader(f)
    for row in reader:
        # Deconstruct each row
        year = int(row[0])
        author_names = [a.strip() for a in row[1].split(',')]
        title = row[2]
        pages = row[3]
        try:
            num_pages = int(row[4])
        except (IndexError, ValueError):
            num_pages = 0
        session_h2 = unidecode(row[5]).strip()
        session_h3 = unidecode(row[6]).strip()
        try:
            selected = row[7]
# exit()

# Choose dataset
#dataset = 'aliases2'
dataset = 'gnome'
dataset = 'icsm'

# Choose dataset size
datasetsize = 'full'
#datasetsize = 'sample'

nameEmailData = MyDict()
if dataset == 'aliases2':
    f = open(os.path.join(dataPath, "aliases2.csv"), "rb")
    # f = open(os.path.join(dataPath, "testData2.csv"), "rb")
    reader = UnicodeReader(f)
    header = reader.next()
    for row in reader:
        try:
            idx = int(row[0])
            name = row[1]
            email = unspam(row[2])
            nameEmailData[idx] = (name, email)
        except (IndexError, ValueError):
            # Malformed row; print it for inspection and move on
            print row
    f.close()
    print 'Using the aliases2.csv data set...'
elif dataset == 'gnome':
    import email.utils
    import email.header
from sqlalchemy.orm.exc import NoResultFound

def load_pc():
    session = SessionFactory.get_session()

    # --- Update 8/12/2013 ---
    # Record also the role of PC members (PC Chair or General Chair)
    f = open(os.path.join(DATA_PATH, 'SE-conf-roles.csv'), 'rb')
    reader = UnicodeReader(f)
    header = reader.next()
    roles = {}

    def confName(conf):
        # Map merged/renamed conference series onto their canonical acronym
        if conf == 'ESEC/FSE':
            return 'FSE'
        elif conf == 'CSMR-WCRE':
            return 'CSMR'
        else:
            return conf

    for row in reader:
        conf = confName(row[0].strip()).lower()
        year = int(row[1])
        name = '%s %s' % (row[2].strip(), row[3].strip())
        try:
            name = nameMap[name]
        except KeyError:
            pass
        role = row[5]
        if role == 'Organiser':
            role = 'PC member main track'
        if '?' not in name and role != 'Challenge Chair' and role != 'Data Chair':
            roles[(name, conf, year)] = role
    # Input format:
    # Conference;Year;First Name;Last Name;Sex;Role
    # CSMR;2013;Anthony;Cleve;Male;Program Chair
    # CSMR;2013;Filippo;Ricca;Male;Program Chair
    # CSMR;2013;Maura;Cerioli;Female;General Chair
    # -----------------------

    print 'Loading PC members:'
    for acronym, name, impact in CONFERENCES:
        print acronym.upper()
        # Get the conference object
        try:
            # I already have this conference in the database
            conference = session.query(Venue).\
                filter_by(acronym=acronym).\
                one()
        except NoResultFound:
            # New conference; add to database
            conference = Venue(acronym.upper(), impact, name, is_conference=True)
            session.add(conference)
        # Load the data into a csv reader
        f = open(os.path.join(DATA_PATH, 'normalised-pc', '%s.csv' % acronym.lower()), 'rb')
        reader = UnicodeReader(f)
        # --- Update 8/12/2013 ---
        withRole = set([(name, year)
                        for (name, conf, year) in roles.keys()
                        if conf == acronym])
        # -----------------------
        for row in reader:
            # Deconstruct each row
            year = int(row[0])
            role = row[1]
            pcMemberName = row[2].strip()
            # --- Update 8/12/2013 ---
            if (pcMemberName, acronym, year) in roles:
                role = roles[(pcMemberName, acronym, year)]
                try:
                    withRole.remove((pcMemberName, year))
                except KeyError:
                    pass
            else:
                role = 'PC member main track'
            # -----------------------
            if len(pcMemberName):
                # Get the PC member object
                try:
                    # I already have this PC member in the database
                    pcMember = session.query(Person).\
                        filter_by(name=pcMemberName).\
                        one()
                except NoResultFound:
                    # New person; add to database
                    pcMember = Person(pcMemberName)
                    session.add(pcMember)
                try:
                    membership = session.query(PCMembership).\
                        filter_by(year=year).\
                        filter_by(role=role).\
                        filter_by(pcmember=pcMember).\
                        filter_by(venue=conference).\
                        one()
                except NoResultFound:
                    # New membership; add to database
                    membership = PCMembership(year, role)
                    membership.pcmember = pcMember
                    membership.venue = conference
                    session.add(membership)
    # --- Update 8/12/2013 ---
    # Any entries left over were never matched against a normalised PC list
    print sorted(withRole)
    # -----------------------
    session.commit()
dataPath = os.path.abspath("../../../data")

#print normaliseName(u'Liz Burd')
#exit()

conferences = ['icse', 'icsm', 'wcre', 'csmr', 'msr', 'gpce',
               'fase', 'icpc', 'fse', 'scam', 'ase']

for conference in conferences:
    g = open(os.path.join(dataPath, 'normalised-pc', '%s.csv' % conference), 'wb')
    writer = UnicodeWriter(g)
    f1 = open(os.path.join(dataPath, 'pc', '%s.csv' % conference), 'rb')
    reader1 = UnicodeReader(f1)
    for row in reader1:
        year = row[0]
        track = row[1]
        if track == 'main':
            pc = ','.join([normaliseName(name)
                           for name in row[2].split(',')
                           if len(normaliseName(name))])
            writer.writerow([year, track, pc])
    f1.close()
    g.close()
exit()

#conferences = ['ase']
allAuthorsSet = set()
for conference in conferences:
import os
import sys
sys.path.append('..')
from unicodeMagic import UnicodeReader, UnicodeWriter
from dictUtils import MyDict
from nameMap import nameMap
from unidecode import unidecode

#conference = "icse"
conference = sys.argv[1]

# This is the list of DBLP author names (>1.1M people)
# 335078;M. G. J. van den Brand, Mark G. J. van den Brand, Mark van den Brand
f = open(os.path.abspath("../../../data/dblp-author-aliases-stripped.csv"), "rb")
reader1 = UnicodeReader(f)

# Read the list into a map:
# reverseLookup['M. G. J. van den Brand']
#   = reverseLookup['Mark G. J. van den Brand']
#   = reverseLookup['Mark van den Brand']
#   = 335078
reverseLookup = MyDict()
for row in reader1:
    aid = int(row[0])
    aliases = [name.strip() for name in row[1].split(',')]
    for name in aliases:
        reverseLookup[name] = aid

# Read names of conference PC members
# There are two cases: