def tabulate2CSV(outPath, metric, k=None, datatype='float'):
    """Write a CSV with one row per year and one column per conference.

    outPath  -- destination CSV path.
    metric   -- metric name understood by the metrics objects' getMetric().
    k        -- optional cutoff forwarded to getMetric().
    datatype -- 'float' (formatted '%.03f') or 'int' (raw value).

    Relies on the module-level `conferences` list and `metrics` dict.
    A conference with no data for a given year gets an empty cell.
    """
    # Fetch each conference's {year: value} mapping once, instead of
    # calling getMetric() again for every (conference, year) pair.
    perConference = dict((c, metrics[c].getMetric(metric, k))
                         for c in conferences)
    allYears = set()
    for values in perConference.values():
        allYears.update(values.keys())
    f = open(outPath, 'wb')
    try:
        writer = UnicodeWriter(f)
        header = ['year'] + [c.upper() for c in conferences]
        writer.writerow([str(item) for item in header])
        # Most recent year first.
        for year in reversed(sorted(allYears)):
            row = [year]
            for conferenceName in conferences:
                try:
                    value = perConference[conferenceName][year]
                except KeyError:
                    # This conference has no value for this year.
                    row.append('')
                else:
                    if datatype == 'float':
                        row.append('%.03f' % value)
                    elif datatype == 'int':
                        row.append(value)
            writer.writerow([str(item) for item in row])
    finally:
        f.close()
def scatterPlot(outPath, metric1, metric2, k1=None, k2=None):
    """Dump (year, metric1, metric2, conference) rows for scatter plotting.

    Only years present for BOTH metrics of a conference are emitted,
    most recent year first. Relies on the module-level `conferences`
    list and `metrics` dict.
    """
    f = open(outPath, 'wb')
    try:
        writer = UnicodeWriter(f)
        # Bug fix: the old header used '%s%d', which raised TypeError when
        # k1/k2 kept their default of None. '%s' renders ints identically
        # ('%s' % 5 == '5') and we drop the suffix entirely for None.
        label1 = '%s%s' % (metric1, k1 if k1 is not None else '')
        label2 = '%s%s' % (metric2, k2 if k2 is not None else '')
        writer.writerow(['year', label1, label2, 'conference'])
        for conferenceName in conferences:
            # Fetch each metric mapping once per conference.
            values1 = metrics[conferenceName].getMetric(metric1, k1)
            values2 = metrics[conferenceName].getMetric(metric2, k2)
            commonYears = set(values1.keys()).intersection(values2.keys())
            for year in reversed(sorted(commonYears)):
                row = [year, values1[year], values2[year], conferenceName]
                writer.writerow([str(item) for item in row])
    finally:
        f.close()
def main():
    """Collect avatar/profile features for Stack Overflow users into a CSV.

    Reads user rows from a SO dump, downloads each user's Gravatar with a
    pool of 10 Downloader threads, extracts the feature groups enabled by
    the module-level _* flags, and appends one row per user to the results
    file. Stops after `size` users.
    """
    data = "../resources/SOusers-Mar13.csv"  # File containing SO user dump
    results = "../resources/features3.csv"  # File where features will be stored
    picPath = "../resources/SOpictures/"  # Directory where pictures will be downloaded
    fr = open(os.path.join(data), 'rb')
    fw = open(os.path.join(results), 'ab')  # append: earlier runs are kept
    if _RANDOM:
        reader = RandomReader(fr)
    else:
        reader = UnicodeReader(fr)
    writer = UnicodeWriter(fw)
    queue = Queue()
    if _FACE:
        faceDetector = FaceDetector()
    threads = []
    SOhashes = {}  # Dictionary of user's hashes (gravatar hash -> uid), for dedup
    # Use multiple threads to download and get information
    for i in xrange(10):
        threads.append(Downloader(queue))
        threads[-1].start()
    idx = 0
    size = 4500  # Number of subjects
    for row in reader:
        if idx < size:
            # NOTE(review): assumes column 0 is the user id and column 2 is
            # the gravatar hash -- confirm against the dump's schema.
            so_uid = row[0]
            so_hash = row[2]
            if (not (SOhashes.has_key(so_hash))):
                SOhashes[so_hash] = so_uid
                if (not isDefaultGravatarPic(so_hash)):
                    # Rebinds `data` from the input path string to the
                    # per-user feature row being built.
                    data = [so_uid]
                    if _VISUAL_FEATURES:
                        # Download picture
                        filepath = os.path.join('%s%d.jpg' % (picPath, int(so_uid)))
                        if not os.path.isfile(filepath):
                            queue.put(
                                ('http://www.gravatar.com/avatar/%s' % so_hash,
                                 filepath))
                            # Give the downloader thread time to fetch the
                            # file before we try to load it below.
                            time.sleep(2)
                        # Load picture
                        pic = picUtils.loadPicture(filepath)
                        if _FACE:
                            if faceDetector.isFrontFace(
                                    pic) or faceDetector.isProfileFace(pic):
                                data.append(str(True))
                            else:
                                data.append(str(False))
                        if _MOST_COMMON_COLORS:
                            # Frequencies of the two most common colors.
                            _, f1, _, f2 = picUtils.mostCommonColor(pic)
                            data.append(str(f1 + f2))
                        if _NBCOLORS:
                            data.append(str(picUtils.getNbOfColors(pic)))
                        if _FARTHEST_NEIGHBOR:
                            F1 = picUtils.farthestNeighborMetric(pic, 10)
                            F2 = picUtils.farthestNeighborMetric(pic, 200)
                            data.append(str(F1))
                            data.append(str(F2))
                            if F1 != 0:
                                data.append(str(F2 / F1))
                            else:
                                # '?' is the missing-value marker used
                                # throughout the feature files.
                                data.append('?')
                        if _AVERAGE_SATURATION:
                            data.append(str(picUtils.avgSaturation(pic)))
                        if _THRESHOLD_BRIGHTNESS:
                            data.append(str(picUtils.threBrightness(pic, 0.2)))
                    if _GOOGLE:
                        gi = GoogleImage(
                            'http://www.gravatar.com/avatar/%s' % so_hash)
                        bestGuess = gi.getBestGuess()
                        if bestGuess:
                            bestGuess = bestGuess.encode('utf8')
                            data.append(bestGuess)
                            if _WIKIPEDIA:
                                gs = GoogleSearch(
                                    "%s site:en.wikipedia.org" % bestGuess)
                                wikiTitlePage = gs.getWikipediaTitlePage()
                                if wikiTitlePage:
                                    wiki = Wikipedia(wikiTitlePage)
                                    wiki.categoryGraph(4)
                                    # Keep at most the 10 highest-degree
                                    # categories from the category graph.
                                    nbCats = 10
                                    i = 0
                                    cats = wiki.sortGraphByDegree()
                                    while i < nbCats and i < len(cats):
                                        data.append(str(cats[i]))
                                        i += 1
                    # Write all information collected in the csv file
                    # NOTE(review): bare except hides the real error; it
                    # presumably guards against encoding failures in
                    # writerow -- worth narrowing.
                    try:
                        print data
                        writer.writerow(data)
                        idx += 1
                    except:
                        print "Error with data"
        else:
            break
    fr.close()
    fw.close()
    # If here, download finished. Stop threads
    # One (None, None) sentinel per worker thread tells it to exit.
    for i in xrange(10):
        queue.put((None, None))
print c, d pc_c = metrics[c].pcPerYear pc_d = metrics[d].pcPerYear a_c = metrics[c].authorsPerYear a_d = metrics[d].authorsPerYear cm_c = metrics[c].membersPerYear cm_d = metrics[d].membersPerYear allYears = set(cm_c.keys()).intersection(cm_d.keys()) outPath = os.path.join(metricsPath, 'pairwise', '%s_%s.csv' % (c, d)) f = open(outPath, 'wb') writer = UnicodeWriter(f) header = [ 'YEAR', 'PC1', 'PC2', 'PC1_INT_PC2', 'PC1_UNI_PC2', 'PC1_INT_PC2__REL__PC1_UNI_PC2', 'PC1_INT_PC2__REL__PC1', 'PC1_INT_PC2__REL__PC2', 'A1', 'A2', 'A1_INT_A2', 'A1_UNI_A2', 'A1_INT_A2__REL__A1_UNI_A2', 'A1_INT_A2__REL__A1', 'A1_INT_A2__REL__A2', 'CM1', 'CM2', 'CM1_INT_CM2', 'CM1_UNI_CM2', 'CM1_INT_CM2__REL__CM1_UNI_CM2', 'CM1_INT_CM2__REL__CM1', 'CM1_INT_CM2__REL__CM2' ] writer.writerow(header) for year in reversed(sorted(allYears)): pc_c_int_pc_d = pc_c[year].intersection(pc_d[year]) pc_c_uni_pc_d = pc_c[year].union(pc_d[year])
def buildSampleSet(inputFile, sampleFile):
    """Turn the raw feature CSV into a sample file with category columns.

    Pass 1 collects the distinct category labels found in columns 15-24;
    pass 2 appends one indicator column per category ('1' if the row has
    the category, '?' otherwise). Category columns with fewer than 6
    tagged rows are then removed. Writes a 3-line header (names, domain,
    attribute roles) followed by the data rows.

    NOTE(review): `reader` is iterated twice -- this only works if
    RandomReader buffers its rows and supports re-iteration; otherwise
    the second pass sees nothing. Confirm against RandomReader.
    """
    f = open(os.path.join(inputFile), "rb")
    f1 = open(os.path.join(sampleFile), "wb")
    reader = RandomReader(f)
    writer = UnicodeWriter(f1)
    nbRows = 0
    categories = []  # distinct category labels, in first-seen order
    countPages = []  # countPages[i] = number of rows tagged with categories[i]
    # Pass 1: discover every category label used in columns 15-24
    # ('?' is the missing-value marker).
    for row in reader:
        nbRows += 1
        for cat in row[15:25]:
            if cat != "?":
                if not cat in categories:
                    categories.append(cat)
                    countPages.append(0)
    data = []
    # Pass 2: copy the 15 base feature columns, then append one
    # indicator column per discovered category.
    for row in reader:
        line = []
        for d in row[0:15]:
            line.append(d)
        for ind, cat in enumerate(categories):
            if cat in row[15:25]:
                countPages[ind] += 1
                line.append(str(1))
            else:
                line.append(str('?'))
        data.append(line)
    i = 0
    filteredCategories = []
    for cat, cpt in zip(categories, countPages):
        # Drop categories with fewer than 6 subjects (i.e. keep cpt >= 6).
        if cpt < 6:
            ind = 15 + i
            # Remove this category's column from every row. `i` is NOT
            # incremented in this branch: popping shifts the remaining
            # columns left, so the next kept column is still at 15 + i.
            for d in data:
                d.pop(ind)
        else:
            filteredCategories.append(cat)
            i += 1
    # Header
    header = [
        "uid", "cl1", "cl2", "cl3", "cl4", "cl5", "face", "fCols", "nbCols",
        "f1", "f2", "f3", "s", "b", "bestGuess"
    ]
    # Domain row: 'c' = continuous, 'd' = discrete, 'string' = text
    # (Orange-style tab/csv header convention -- TODO confirm consumer).
    domain = [
        "c", "d", "d", "d", "d", "d", "d", "c", "c", "c", "c", "c", "c", "c",
        "string"
    ]
    # Attribute-role row ('m' = meta, 'c' = class). NOTE(review): only 6
    # entries for 15+ columns -- presumably the consumer pads the rest;
    # verify.
    attribut = ["m", "c", "m", "m", "m", "m"]
    for cat in filteredCategories:
        header.append(cat)
        domain.append("d")
    writer.writerow(header)
    writer.writerow(domain)
    writer.writerow(attribut)
    i = 0
    for row in data:
        writer.writerow(row)
        i += 1
    f.close()
    f1.close()