def make_us_states_name_db():
  def split(l):
    splits = l.split(",")
    return splits[1], splits[3].lower()

  root_folder = "data/us_states"
  pkl_file = US_STATES_GENDER_FILE
  db = {}
  for f_name in os.listdir(root_folder):
    f_name = "%s/%s" % (root_folder, f_name)
    with open(f_name) as f:
      print(f_name)
      for line in f.readlines():
        gender, name = split(line)
        node = db.get(name, None)
        if node is None:
          node = O()
          node.name = name
          node.females = 0
          node.males = 0
        if gender == 'F':
          node.females += 1
        elif gender == 'M':
          node.males += 1
        db[name] = node
  with open(pkl_file, "wb") as f:
    pkl.dump(db, f, pkl.HIGHEST_PROTOCOL)
  return db
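# Hedged convenience sketch (not part of the original module): reload a
# previously pickled name db instead of rebuilding it from the raw files.
# Assumes `pkl` is the pickle module imported at the top of this file.
def load_name_db(pkl_file):
  with open(pkl_file, "rb") as f:
    return pkl.load(f)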
def build_graph(index, train_x, train_y, cite_map, use_references=True, from_cache=True):
  if use_references:
    cached = "cache/graphs/%d_ref.pkl" % index
  else:
    cached = "cache/graphs/%d.pkl" % index
  if os.path.isfile(cached) and from_cache:
    # Pickles are written in binary mode below, so read in binary mode too.
    with open(cached, "rb") as f:
      return cPkl.load(f)
  vocab_file = 'cache/vocabulary/%d.pkl' % index
  vocabulary, reverse_vocabulary = construct_vocabulary(train_x, vocab_file)
  vocabulary_words = set(vocabulary.keys())
  analyze = predict.analyzer()
  doc_map = {}
  for x, y in zip(train_x, train_y):
    tokens = set(analyze(x.raw)).intersection(vocabulary_words)
    # add_tokens(tokens, nodes)
    doc = Doc(x.id, tokens, y)
    doc_map[x.id] = doc
  edges = np.zeros((VOCAB_SIZE, VOCAB_SIZE), dtype=np.int16)
  for i, x in enumerate(train_x):
    if i % 1000 == 0:
      print(i)
    tokens = list(doc_map[x.id].tokens)
    make_self_edges(tokens, edges, vocabulary)
    if use_references:
      references = cite_map.get(x.id, [])
      for reference in references:
        if reference not in doc_map:
          # belongs to test set
          continue
        make_edges(tokens, list(doc_map[reference].tokens), edges, vocabulary)
  word_network = O(doc_map=doc_map, edges=edges)
  with open(cached, "wb") as f:
    cPkl.dump(word_network, f, cPkl.HIGHEST_PROTOCOL)
  return word_network
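# Hedged usage sketch for build_graph (commented out; train_x, train_y and
# cite_map follow the signature above, and the word pair is illustrative):
# network = build_graph(0, train_x, train_y, cite_map)
# vocabulary, _ = construct_vocabulary(train_x, 'cache/vocabulary/0.pkl')
# i, j = vocabulary["fault"], vocabulary["test"]  # assumes both words are in the vocabulary
# print(network.edges[i][j])  # co-occurrence count between the two words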
def make_indian_name_db():
  def split(l):
    splits = l.split()
    return int(splits[2]), int(splits[3]), splits[4].lower()

  inp_file = "data/ind_names.txt"
  db = {}
  with open(inp_file) as f:
    index = 0
    for line in f.readlines():
      index += 1
      if index % 1000 == 0:
        print("Line : %d" % index)
      males, females, name = split(line)
      node = db.get(name, None)
      if node is None:
        node = O()
        node.name = name
        node.females = 0
        node.males = 0
      node.females += females
      node.males += males
      db[name] = node
  pkl_file = INDIAN_GENDER_FILE
  with open(pkl_file, "wb") as f:
    pkl.dump(db, f, pkl.HIGHEST_PROTOCOL)
  return db
def make_name_db():
  root_folder = "data/us_names"
  pkl_file = US_GENDER_FILE
  db = {}
  for f_name in os.listdir(root_folder):
    f_name = "%s/%s" % (root_folder, f_name)
    with open(f_name) as f:
      print(f_name)
      for line in f.readlines():
        name, gender, count = line.split(",")
        name = name.lower()
        node = db.get(name, None)
        if node is None:
          node = O()
          node.name = name
          node.females = 0
          node.males = 0
        if gender == 'F':
          node.females += int(count)
        elif gender == 'M':
          node.males += int(count)
        db[name] = node
  with open(pkl_file, "wb") as f:
    pkl.dump(db, f, pkl.HIGHEST_PROTOCOL)
  return db
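# Hedged helper sketch (illustrative, not part of the original module): guess
# a gender from the db built above. The 0.6 majority threshold is an
# assumption; tune it to trade coverage against confidence.
def guess_gender(db, name, threshold=0.6):
  node = db.get(name.lower(), None)
  if node is None:
    return None  # name not present in the source files
  total = node.males + node.females
  if total == 0:
    return None
  if node.males / float(total) >= threshold:
    return 'M'
  if node.females / float(total) >= threshold:
    return 'F'
  return None  # ambiguous name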
def avg_score(metrics_arr):
  accuracies, precisions, recalls, f_scores = [], [], [], []
  for metrics in metrics_arr:
    accuracies.append(metrics.accuracy)
    precisions.append(metrics.precision)
    recalls.append(metrics.recall)
    f_scores.append(metrics.f_score)
  score = Metrics()
  score.accuracy = O(median=Metrics.median(accuracies), iqr=Metrics.iqr(accuracies))
  score.precision = O(median=Metrics.median(precisions), iqr=Metrics.iqr(precisions))
  score.recall = O(median=Metrics.median(recalls), iqr=Metrics.iqr(recalls))
  score.f_score = O(median=Metrics.median(f_scores), iqr=Metrics.iqr(f_scores))
  return score
def avg_score(metrics_arr):
  accuracies, precisions, recalls, f_scores, specificities = [], [], [], [], []
  pre_reject_misseds = []
  for metrics in metrics_arr:
    accuracies.append(metrics.accuracy)
    precisions.append(metrics.precision)
    recalls.append(metrics.recall)
    f_scores.append(metrics.f_score)
    specificities.append(metrics.specificity)
    pre_reject_misseds.append(metrics.pre_reject_missed / (metrics.pre_reject + metrics.EPS))
  score = O()
  score.accuracy = O(median=Metrics.median(accuracies), iqr=Metrics.iqr(accuracies))
  score.precision = O(median=Metrics.median(precisions), iqr=Metrics.iqr(precisions))
  score.recall = O(median=Metrics.median(recalls), iqr=Metrics.iqr(recalls))
  score.f_score = O(median=Metrics.median(f_scores), iqr=Metrics.iqr(f_scores))
  score.specificity = O(median=Metrics.median(specificities), iqr=Metrics.iqr(specificities))
  score.pre_reject_missed = O(median=Metrics.median(pre_reject_misseds), iqr=Metrics.iqr(pre_reject_misseds))
  return score
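# Hedged example of consuming avg_score's output (commented out); the
# attribute names mirror the fields set above.
# score = avg_score(fold_metrics)
# print("Accuracy: %0.2f (IQR %0.2f)" % (score.accuracy.median, score.accuracy.iqr))
# print("F-Score : %0.2f (IQR %0.2f)" % (score.f_score.median, score.f_score.iqr))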
def get_venues():
  db = DB.get()
  cur = db.cursor()
  cur.execute('SELECT * FROM venues')
  venues = OrderedDict()
  for row in cur.fetchall():
    venue = O()
    venue.id = str(row[0])
    venue.acronym = row[1]
    venue.name = row[2]
    venue.impact = int(row[3])
    venue.is_conference = (row[4] == 1)
    venues[venue.id] = venue
  DB.close()
  return venues
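# Hedged usage sketch (commented out): rank conference venues by impact,
# relying only on the fields decoded above.
# venues = get_venues()
# confs = sorted([v for v in venues.values() if v.is_conference],
#                key=lambda v: v.impact, reverse=True)
# print([v.acronym for v in confs])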
def vectorize(papers, iterations=ITERATIONS):
  miner, graph, lda_model, vocab = get_graph_lda_data(iterations=iterations)
  # vectorizer = text.CountVectorizer(stop_words=STOP_WORDS, token_pattern=TOKEN_PATTERN)
  docs = [paper.abstract if paper.abstract is not None and paper.abstract != 'None' else paper.title
          for paper in papers]
  doc_2_vec = miner.vectorizer.transform(docs)
  doc_2_vec_array = doc_2_vec.toarray()
  transformed = lda_model.transform(doc_2_vec_array)
  report(lda_model, vocab)
  for paper, t, d_2_v in zip(papers, transformed, doc_2_vec_array):
    paper.transformed = t
    paper.doc_2_vec = d_2_v
  return O(miner=miner, graph=graph, lda_model=lda_model, vocab=vocab, doc_2_vec=doc_2_vec)
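# Hedged usage sketch (commented out): after vectorize(), each paper carries
# its LDA topic distribution; argmax picks the dominant topic index. Assumes
# numpy is imported as np in this module.
# result = vectorize(papers)
# for paper in papers[:5]:
#   print(paper.title, int(np.argmax(paper.transformed)))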
  plt.savefig(fig_name, bbox_inches='tight')
  plt.clf()


# Settings for 10 rows and 5 columns
settings_10_5 = O(
  fig_size=(8, 8),
  col_axes=[0.3,    # col dendo left
            0.81,   # col dendo bottom
            0.36,   # col dendo width
            0.15],  # col dendo height
  row_axes=[0.0,    # row dendo left
            0.055,  # row dendo bottom
            0.23,   # row dendo width
            0.69],  # row dendo height
  plot_axes=[0.10,  # hm left
             0.05,  # hm bottom
             0.7,   # hm width
             0.7],  # hm height
)

# Settings for 10 rows and 4 columns
settings_10_4 = O(
  fig_size=(8, 8),
  col_axes=[
# logging.getLogger('lda').setLevel(logging.ERROR)
# GRAPHS = {
#   "v5": "data/citemap_v10.csv",
#   "v2": "data/citemap_v4.csv"
# }
GRAPH_CSV = "data/citemap_v10.csv"

# For 11 TOPICS
ALPHA = 0.22359
BETA = 0.53915
ITERATIONS = 100
MIN_DIVERSITY_SCORE = 0.075

THE = O()
THE.permitted = "all"  # conference/journal/all
THE.version = "v5"
THE.use_numeric = False
THE.random_state = 0
THE.IGNORE_VENUES = {
  "v5": set(),
  "v2": {"ICPC", "MDLS", "SOSYM", "SCAM"}
}

STOP_WORDS = text.ENGLISH_STOP_WORDS.union([
  'software', 'engineering', 'paper', 'study', 'based', 'results', 'approach',
  'case', 'workshop', 'international', 'research', 'conference', 'introduction',
  'editors', 'article', 'issue', 'month', 'copyright', 'special', 'used',
  'using', 'use', 'studies', 'review', 'editorial', 'report', 'book', 'ieee',
  'published', 'science', 'column', 'author', 'proposed', 'icse', 'year',
  'articles', 'page', '2000', '2004', 'papers', 'computer', 'held', 'editor'
])
import sys
import os

sys.path.append(os.path.abspath("."))
sys.dont_write_bytecode = True

__author__ = "bigfatnoob"

from utils.lib import O

ROOT_SCOPE = "__ROOT__"
VAR_TYPE = O(GLOBAL="global", LOCAL="local", ARG="arg", VARARG="vararg", KWARG="kwarg")
SCOPE_SEPARATOR = "->"
PRIMITIVES = {'int', 'long', 'float', 'str', 'bool'}
GENERATED_PREFIX = "generated_py_"
TEMPORARY_PREFIX = "tmp_py_"
FUNCTION_PREFIX = "func_"
METHOD_WAIT_TIMEOUT = 1
# For 11 TOPICS
N_TOPICS = 11
ALPHA = 0.22359
BETA = 0.53915
ITERATIONS = 100
# TOPICS = ["TPC %d" % d for d in range(11)]
TOPICS = ["Design", "Testing", "Modelling", "Mobile", "Energy", "Defects",
          "SourceCode", "WebApps", "Configuration", "Developer", "Mining"]
TOPIC_THRESHOLD = 3

dendo_11_settings = O(
  fig_size=(6, 8),
  col_axes=[0.25, 0.65, 0.50, 0.11],
  row_axes=[0.0, 0.215, 0.2, 0.375],
  plot_axes=[0.25, 0.1, 0.63, 0.6],
)
dendo_14_settings = O(
  fig_size=(8, 8),
  col_axes=[0.25, 0.65, 0.50, 0.11],
  row_axes=[0.0, 0.2, 0.21, 0.4],
  plot_axes=[0.25, 0.1, 0.63, 0.6],
)
dendo_16_settings = O(
  fig_size=(8, 8),
  col_axes=[0.25, 0.65, 0.50, 0.11],
  row_axes=[0.0, 0.225, 0.21, 0.35],
  plot_axes=[0.25, 0.1, 0.63, 0.6],
from __future__ import print_function, division
import sys
import os

sys.path.append(os.path.abspath("."))
sys.dont_write_bytecode = True

from utils.lib import O

_dend_7_8 = O(fig_size=(8, 8),
              col_axes=[0.3, 0.6, 0.5, 0.16],
              row_axes=[0.0, 0.13, 0.21, 0.45],
              plot_axes=[0.3, 0.05, 0.63, 0.6])
_dend_7_9 = O(fig_size=(8, 8),
              col_axes=[0.25, 0.6, 0.51, 0.18],
              row_axes=[0.0, 0.15, 0.21, 0.4],
              plot_axes=[0.25, 0.05, 0.63, 0.6])
_dend_7_13 = O(fig_size=(8, 8),
               col_axes=[0.25, 0.52, 0.51, 0.18],
               row_axes=[0.0, 0.21, 0.22, 0.28],
               plot_axes=[0.25, 0.05, 0.63, 0.6])
_dend_7_14 = O(fig_size=(8, 8),
               col_axes=[0.25, 0.52, 0.51, 0.18],
               row_axes=[0.0, 0.22, 0.21, 0.26],
               plot_axes=[0.25, 0.05, 0.63, 0.6])
_dend_11_18 = O(fig_size=(8, 8),
                col_axes=[0.3, 0.63, 0.63, 0.18],
                row_axes=[0.0, 0.23, 0.19, 0.39],
                plot_axes=[0.3, 0.05, 0.63, 0.6])
from algorithms.parallel.de.de import DE as DE_P
from mpi4py import MPI
from time import clock, sleep
from utils.lib import O, report
import utils.sk as sk
from problems.dtlz.dtlz2 import DTLZ2  # used below; module path is an assumption
from problems.pom3.pom3a import POM3A
from problems.pom3.pom3b import POM3B
from problems.pom3.pom3c import POM3C
from problems.pom3.pom3d import POM3D
from problems.xomo.xomo import XOMO

COMM = MPI.COMM_WORLD
RANK = COMM.rank
SIZE = COMM.size
settings = O(runs=20)


def _run_parallel():
  model = DTLZ2(3)
  opt = DE_P(model)
  times, convs, dives = [], [], []
  for i in range(settings.runs):
    print(i)
    start = clock()
    goods = DE_P.run(opt, id=i)
    if RANK == 0:
      times.append(clock() - start)
      convs.append(opt.convergence(goods))
      dives.append(opt.diversity(goods))
  if RANK == 0:
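# Hedged launch sketch: mpi4py programs are started through an MPI launcher.
# The process count below is illustrative; substitute the actual file name
# of this module.
#   mpiexec -n 4 python <this_module>.py
# Every rank executes _run_parallel(); per the guards above, only rank 0
# records times, convergences and diversities.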