import warnings

import numpy as np

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    from pandas import DataFrame

from feature_extraction.dataLoader import DataLoader

loader = DataLoader()
loader.loadAll()

reviewTable = []
metaReviewTable = []
bidTable = []
paperTable = []
userTable = []

for id, review in loader.reviews.iteritems():
    # Reviewer-to-author graph distances; pairs with no known path are
    # capped at maxDist.
    maxDist = 7
    sumDist = 0
    dists = []
    for author in review.paper.authors:
        if author.id in review.user.distances:
            dist = review.user.distances[author.id]
            sumDist += dist
            dists.append(dist)
        else:
            sumDist += maxDist
    dists.sort()
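    # --- Hedged sketch of a plausible continuation (the original is
    # truncated above): summarize the sorted distances into per-review
    # features. The field names are illustrative, not from the source.
    minDist = dists[0] if dists else maxDist
    avgDist = float(sumDist) / max(len(review.paper.authors), 1)
    reviewTable.append({
        'minDistance': minDist,
        'avgDistance': avgDist,
    })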
from feature_extraction.dataLoader import DataLoader
from feature_extraction.tfIdf import tf_idf
from scipy.sparse import dok_matrix
from scipy import *
import cPickle
from collections import defaultdict
from sets import Set

steps = 11

#LOAD IN DATA
print "(1/%d) Loading Data" % steps
loader = DataLoader()
loader.loadUsers()
loader.loadPapers()
loader.loadPastPapers()
loader.loadReviews()
loader.loadAcceptance()

#COMPUTE IDF
print "(2/%d) Computing IDF" % steps
tfidf = tf_idf()
tfidf.computeIdf(loader, [r.getReviewText() for id, r in loader.reviews.iteritems()])

#SET UP TERM DICTIONARY
def incrementAndReturn():
    # Hands out consecutive ids (completed from the parallel script below).
    currId[0] += 1
    return currId[0]
import warnings

from pylab import *

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    import pandas as pd
    import numpy as np
    from feature_extraction.dataLoader import DataLoader
    from feature_extraction import calcFeatures
    from utilities.plotBucket import plotBucket
    from utilities.plotBucket import plotBar
    from utilities.plotBucket import plotFrequencyHistogram

import math
import random

loader = DataLoader()
loader.loadAll(distance=False)

print "Calculating Features"
calcFeatures.calcAuthorsPastPapers(loader)
calcFeatures.calcTopConfsJoursCount(loader)
calcFeatures.computeAverages(loader)

df = pd.read_pickle("savedFrames/predictionFeatures/paperTable")

exp = 'maxTopPaperCount'
target = 'avgRating'

# Bucket edges at evenly spaced percentiles of the explanatory variable;
# the lowest edge is dropped to -1 so the minimum value lands in bucket 0.
numBuckets = 7
percentiles = (100.0 / numBuckets) * np.arange(numBuckets + 1)
buckets = np.percentile(df[exp].values, percentiles.tolist())
buckets[0] = -1
averages = []
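# A plausible continuation (assumption; the original is truncated above):
# average the target column within each percentile bucket, matching the
# half-open intervals implied by buckets[0] = -1.
for i in range(numBuckets):
    inBucket = df[(df[exp] > buckets[i]) & (df[exp] <= buckets[i + 1])]
    averages.append(inBucket[target].mean())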
import warnings

from feature_extraction import calcFeatures
import numpy as np

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    from pandas import DataFrame

from feature_extraction.dataLoader import DataLoader
from feature_extraction.tfIdf import tf_idf

loader = DataLoader()
loader.loadAll()

tfidf = tf_idf()
tfidf.store_tf_idf(loader)

print "Calculating Features"
calcFeatures.calcAuthorsPastPapers(loader)
calcFeatures.calcTopConfsJoursCount(loader)
calcFeatures.computeAverages(loader)
calcFeatures.computeDistances(loader.reviews)

print "Constructing Paper Table"
paperTable = []
for id, paper in loader.papers.iteritems():
    # Authors with the most past papers, most top-venue papers, and most
    # past KDD papers, respectively.
    maxAuthor = sorted(paper.authors, key=lambda a: len(a.pastPapers))[-1]
    maxTopAuthor = sorted(paper.authors, key=lambda a: a.topPastPapers)[-1]
    maxKDDAuthor = sorted(paper.authors, key=lambda a: a.topKDDPast)[-1]
    numAuthors = len(paper.authors)
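    # --- Hedged sketch of a plausible continuation (the loop body is
    # truncated above). 'maxTopPaperCount' and 'avgRating' mirror the columns
    # the plotting script reads; the other names and the avgRating attribute
    # are illustrative assumptions.
    paperTable.append({
        'numAuthors': numAuthors,
        'maxPaperCount': len(maxAuthor.pastPapers),
        'maxTopPaperCount': maxTopAuthor.topPastPapers,
        'avgRating': paper.avgRating,  # assumed to be set by computeAverages
    })

# Persist to the path that the bucket-plotting script reads back.
DataFrame(paperTable).to_pickle("savedFrames/predictionFeatures/paperTable")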
import datetime
import warnings

from pylab import *
from scipy.interpolate import spline
from feature_extraction import calcFeatures
from utilities.plotBucket import setUpFigure
from utilities.plotBucket import plotBucket
from utilities.dates import *
from scipy.stats import percentileofscore

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    from pandas import DataFrame
    from feature_extraction.dataLoader import DataLoader

loader = DataLoader()
loader.loadUsers()
loader.loadPapers()
loader.loadPastPapers()
loader.loadAcceptance()
loader.loadReviews()
loader.loadClassifierAccuracy()

calcFeatures.computeAverages(loader)

reviewerTable = []
# Unix timestamps for the day before the review deadline and the deadline.
oneDayBefore = int(datetime.datetime(2014, 4, 14, 0, 0, 0).strftime('%s'))
deadline = int(datetime.datetime(2014, 4, 15, 0, 0, 0).strftime('%s'))

for id, reviewer in loader.reviewers.iteritems():
    revs = reviewer.reviews
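# Note: strftime('%s') is a GNU libc extension rather than part of Python,
# so the two timestamps above only work on platforms whose C library
# supports it. A portable sketch of the same conversion:
import time

def toEpoch(dt):
    # Convert a naive local-time datetime to Unix seconds portably.
    return int(time.mktime(dt.timetuple()))

# e.g. deadline = toEpoch(datetime.datetime(2014, 4, 15, 0, 0, 0))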
import cPickle

import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, normalize

from feature_extraction.dataLoader import DataLoader

def readFile(name):
    # Reconstructed from the parallel helper in the abstract-prediction
    # script; the original directory prefix is truncated above.
    f = open(name, 'r')
    obj = cPickle.load(f)
    f.close()
    return obj

#types = ["Text"]
types = ["Comment", "Strength", "Weakness"]

termDict = readFile("termDict.dat")
paperIds = readFile("paperIds.dat")
paperMatrices = [readFile("paper" + t + "Matrix.mat") for t in types]
reviewInfo = readFile("reviewInfo.dat")
reviewMatrices = [readFile("review" + t + "Matrix.mat") for t in types]

print "Load Data"
loader = DataLoader()
loader.loadUsers()
loader.loadPapers()
loader.loadReviews()
loader.loadAcceptance()

def performCV(X, y, model, n=4):
    predictions = np.zeros(X.shape[0])
    X = np.array(X)
    # Drop all-zero columns, then L2-normalize each row.
    X = X.T[~np.all(X == 0, axis=0)].T
    X = normalize(X, axis=1, norm='l2')
    # X = StandardScaler().fit_transform(X)
    pca = PCA(n_components=X.shape[0] / 30)
    X = pca.fit_transform(X)
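    # --- Hedged sketch of the remaining fold loop (the original body is
    # truncated above): split the rows into n contiguous folds, fit the
    # sklearn-style model on the rest, and fill in out-of-fold predictions.
    y = np.array(y)
    allIdx = np.arange(X.shape[0])
    for testIdx in np.array_split(allIdx, n):
        trainIdx = np.setdiff1d(allIdx, testIdx)
        model.fit(X[trainIdx], y[trainIdx])
        predictions[testIdx] = model.predict(X[testIdx])
    return predictions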
from feature_extraction.dataLoader import DataLoader
from feature_extraction.tfIdf import tf_idf
from scipy.sparse import dok_matrix
from scipy import *
import cPickle
from collections import defaultdict
from sets import Set

steps = 7

#LOAD IN DATA
print "(1/%d) Loading Data" % steps
loader = DataLoader()
loader.loadUsers()
loader.loadPastPapers()
loader.loadPapers()
loader.loadAcceptance()

#COMPUTE IDF
print "(2/%d) Computing IDF" % steps
tfidf = tf_idf()
tfidf.computeIdf(loader)

#SET UP TERM DICTIONARY
def incrementAndReturn():
    # Hands out consecutive term ids, starting from 0.
    currId[0] += 1
    return currId[0]

currId = [-1]
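# Likely continuation, implied by the defaultdict import and the counter
# above (an assumption, not the original code): termDict[term] lazily
# assigns the next unused id on first lookup.
termDict = defaultdict(incrementAndReturn)
# termDict["learning"] -> 0, termDict["graph"] -> 1, termDict["learning"] -> 0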
from feature_extraction.dataLoader import DataLoader
from feature_extraction.tfIdf import tf_idf
from scipy.sparse import *
from scipy import io
from scipy import *
import cPickle
from collections import defaultdict

print "Begin Loading Data"
loader = DataLoader()
loader.loadUsers()
loader.loadPapers()
loader.loadPastPapers()
loader.loadAbstracts()
print "End Loading Data"

print "Begin Computing TF-IDF Vectors"
tfidf = tf_idf()
tfidf.store_tf_idf(loader, allPapers=True)
print "End Computing TF-IDF Vectors"

#Rows in Matrix
m = len(loader.papers) + len(loader.pastPapers)
#Initial Columns in Matrix
n = [100000]
currRow = [0]

#Term -> term id dictionary
def incrementAndReturn():
    currId[0] += 1
    return currId[0]
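# Hedged sketch of how the term-document matrix is probably assembled
# (assumption: the original continues roughly like this; getTfIdfVector is
# a hypothetical accessor standing in for whatever tf_idf actually exposes).
currId = [-1]
termDict = defaultdict(incrementAndReturn)
matrix = dok_matrix((m, n[0]))
for paperId, paper in loader.papers.iteritems():
    for term, weight in getTfIdfVector(paper).iteritems():  # hypothetical
        matrix[currRow[0], termDict[term]] = weight
    currRow[0] += 1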
print "Loading Files" def readFile(name): f = open("savedFrames/iteration5/abstractPrediction/"+name, 'r') obj = cPickle.load(f) f.close() return obj termDict = readFile("termDict.dat") paperIds = readFile("paperIds.dat") paperMatrix = readFile("paperMatrix.mat") print "Load Data" loader = DataLoader() loader.loadPapers() loader.loadAcceptance() def randomSample(n, k): return [ int(random.random() * n) for i in range(k) ] def performCV(X, y, model, n=4): predictions = np.zeros(X.shape[0]) X = np.array(X)
import re

import numpy as np
import Stemmer

from feature_extraction.dataLoader import DataLoader
from utilities.corenlp import StanfordCoreNLP
from utilities import pexpect

stemmer = Stemmer.Stemmer("english")

def getFeatures(text):
    # Bag-of-words features: strip punctuation, lowercase, and map every
    # token to True (the dict-of-booleans format NLTK-style classifiers use).
    #words = stemmer.stemWords(
    #    re.sub('[^\w\s]', ' ', text).lower().split())
    words = re.sub('[^\w\s]', ' ', text).lower().split()
    return dict([word, True] for word in words)

loader = DataLoader()
loader.loadUsers()
loader.loadPapers()
loader.loadReviews()

# dataSet = []
# totalPositive = 0.0
# totalNegative = 0.0
# totalNeutral = 0.0
# for id, review in loader.reviews.iteritems():
#     if np.abs(review.overallRating) > 1:
#         ratings = review.ratings
#         reviewText = "%s %s %s" % (
#             ratings["strengths"],
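# Example of the feature format getFeatures produces:
#     >>> getFeatures("Strong idea, but the evaluation is weak.")
#     {'strong': True, 'idea': True, 'but': True, 'the': True,
#      'evaluation': True, 'is': True, 'weak': True}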
import warnings

from feature_extraction import calcFeatures
import numpy as np

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    from pandas import DataFrame

from feature_extraction.dataLoader import DataLoader
from feature_extraction.tfIdf import tf_idf

loader = DataLoader()
loader.loadAll()
loader.loadClassifierAccuracy()

tfidf = tf_idf()
tfidf.store_tf_idf(loader)

print "Calculating Features"
calcFeatures.calcUsersAccepted(loader)
calcFeatures.calcAuthorsPastPapers(loader)
calcFeatures.computeAverages(loader)
calcFeatures.computeDistances(loader.reviews)
#calcFeatures.calcWeightedPaperCount(loader)
calcFeatures.calcTopConfsJoursCount(loader)

print "Constructing Author Table"
authorTable = []
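# Hedged sketch of how the author table is probably filled (assumptions:
# users are iterated from loader.users, and the attribute names follow the
# paper-table script above).
for userId, author in loader.users.iteritems():
    authorTable.append({
        'numPastPapers': len(author.pastPapers),
        'topPaperCount': author.topPastPapers,
    })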