def batch_to_file(batch, url, experiment_name, run, n_qu, n_lists, batch_n):
    # header = ['quid', 'question', 'example_pos', 'example_neg']
    header_new = ['quid', 'listNr', 'description', 'exampleTrue', 'exampleFalse',
                  'triple', 'completionUrl', 'name']
    dirpath = f'../prolific_input/run{run}-group_{experiment_name}/'
    batch_name = f'qu{n_qu}-s_qu{n_lists}-batch{batch_n}'
    filepath = f'{dirpath}{batch_name}.csv'
    pl_name = f'Agree or disagree (run{run}-{experiment_name}-batch{batch_n}-{n_qu}-{n_qu})'

    ### write header ###
    if not os.path.isdir(dirpath):
        os.mkdir(dirpath)
    header_path = f'{dirpath}header.txt'
    if not os.path.isfile(header_path):
        with open(header_path, 'w') as outfile:
            outfile.write(','.join(header_new))

    new_dicts = []
    for d in batch:
        triple = f"{d['relation']}-{d['property']}-{d['concept']}"
        new_d = dict()
        new_d['quid'] = d['quid']
        new_d['listNr'] = d['listNr']
        new_d['description'] = d['question']
        new_d['exampleTrue'] = d['example_pos']
        new_d['exampleFalse'] = d['example_neg']
        new_d['run'] = run
        new_d['subList'] = 1
        new_d['completionUrl'] = url
        new_d['triple'] = triple
        new_d['name'] = pl_name
        new_dicts.append(new_d)
    to_csv(filepath, new_dicts, header=True)
    return filepath
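# Sketch of the dict-based to_csv helper assumed by batch_to_file and by the
# question/property utilities further down. The real implementation is not
# shown in this corpus; this is a minimal, hypothetical version that writes a
# list of dicts with csv.DictWriter, keyed on the first dict's fields.
import csv

def to_csv(filepath, dicts, header=False):
    if not dicts:
        return
    fieldnames = list(dicts[0].keys())
    with open(filepath, 'w', newline='') as outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames,
                                extrasaction='ignore')
        if header:
            writer.writeheader()
        writer.writerows(dicts)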
def results_to_csv(results, csv_file_name):
    def algorithm(r):
        if r.configs['Selection'] == 'ACO':
            return (r.configs['Routing'] + '+' + r.configs['Selection']
                    + '(' + str(r.configs['AcoSelectionAlpha']) + ', '
                    + str(r.configs['ReinforcementFactor']) + ')')
        else:
            return r.configs['Routing'] + '+' + r.configs['Selection']

    def benchmark(r):
        if r.configs['DataPacketTraffic'] == 'Trace':
            return r.props['bench']
        return r.configs['DataPacketTraffic']

    to_csv(csv_file_name, results, [
        ('Benchmark', benchmark),
        ('Routing', lambda r: r.configs['Routing']),
        ('Selection', lambda r: r.configs['Selection']),
        ('Data Packet Injection Rate',
         lambda r: r.configs['DataPacketInjectionRate']),
        ('ACO Selection Alpha', lambda r: r.configs['AcoSelectionAlpha']),
        ('Reinforcement Factor', lambda r: r.configs['ReinforcementFactor']),
        ('Algorithm', algorithm),
        ('Max Cycles', lambda r: r.configs['MaxCycles']),
        ('Simulation Time (Seconds)',
         lambda r: r.stats['SimulationTimeInSeconds']),
        ('Throughput', lambda r: r.stats['Throughput']),
        ('Average Packet Delay', lambda r: r.stats['AveragePacketDelay']),
        ('Payload Throughput', lambda r: r.stats['PayloadThroughput']),
        ('Average Payload Packet Delay',
         lambda r: r.stats['AveragePayloadPacketDelay']),
    ])
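# results_to_csv relies on a to_csv variant that takes (column name, extractor
# function) pairs. A hypothetical minimal version consistent with that call
# shape (the project's actual helper is not included here):
import csv

def to_csv(csv_file_name, results, fields):
    with open(csv_file_name, 'w', newline='') as f:
        writer = csv.writer(f)
        # header row from the column names
        writer.writerow([name for name, _ in fields])
        # one row per result, each cell produced by its extractor
        for r in results:
            writer.writerow([extract(r) for _, extract in fields])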
def run(args):
    l = loader.DataLoader(args.dataset, args.k, args.mode, args.dataset_path,
                          args.crowd_annotations_path, args.ground_truths_path)
    data, gt = l.get_data()
    result, accuracy = algorithms.main(args, data, gt)
    ind_to_question_dict = l.get_ind_to_question_dict()
    ind_to_annotation_dict = l.get_ind_to_annotation_dict()

    result_annotations = pd.DataFrame(data=result, columns=['Annotation'])
    result_annotations.reset_index(level=0, inplace=True)
    result_annotations = result_annotations.rename(
        columns={'index': 'Question'})
    result_annotations['Question'] = result_annotations['Question'].map(
        ind_to_question_dict)
    result_annotations['Annotation'] = result_annotations['Annotation'].map(
        ind_to_annotation_dict)

    if args.print_result:
        print("Predictions:")
        print(result_annotations)
        if args.mode == 'test':
            print("Accuracy:")
            print(accuracy)

    if args.output is not None:
        utils.to_csv(result, args.output, ind_to_question_dict,
                     ind_to_annotation_dict)
def main(p):
    train, test = utils.load(p)

    train['qqgeogor_jaccard-{}'.format(p)] = train.apply(
        lambda x: str_jaccard(x['q1'], x['q2']), axis=1)
    train['qqgeogor_levenshtein_1-{}'.format(p)] = train.apply(
        lambda x: str_levenshtein_1(x['q1'], x['q2']), axis=1)
    train['qqgeogor_levenshtein_2-{}'.format(p)] = train.apply(
        lambda x: str_levenshtein_2(x['q1'], x['q2']), axis=1)
    train['qqgeogor_sorensen-{}'.format(p)] = train.apply(
        lambda x: str_sorensen(x['q1'], x['q2']), axis=1)
    train['qqgeogor_set_intersection-{}'.format(p)] = train.apply(
        lambda x: calc_set_intersection(x['q1'], x['q2']), axis=1)

    test['qqgeogor_jaccard-{}'.format(p)] = test.apply(
        lambda x: str_jaccard(x['q1'], x['q2']), axis=1)
    test['qqgeogor_levenshtein_1-{}'.format(p)] = test.apply(
        lambda x: str_levenshtein_1(x['q1'], x['q2']), axis=1)
    test['qqgeogor_levenshtein_2-{}'.format(p)] = test.apply(
        lambda x: str_levenshtein_2(x['q1'], x['q2']), axis=1)
    test['qqgeogor_sorensen-{}'.format(p)] = test.apply(
        lambda x: str_sorensen(x['q1'], x['q2']), axis=1)
    test['qqgeogor_set_intersection-{}'.format(p)] = test.apply(
        lambda x: calc_set_intersection(x['q1'], x['q2']), axis=1)

    utils.to_csv(train, test, 'f103-{}'.format(p))
    return
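# The feature scripts in this corpus (f003/f008/f009/f100/f102/f103) all call
# a two-frame utils.to_csv(train, test, name). A plausible minimal sketch,
# assuming a ../feature/ output directory and _train/_test suffixes (both are
# assumptions; the real utils module is not shown):
def to_csv(train, test, name):
    train.to_csv('../feature/{}_train.csv'.format(name), index=False)
    test.to_csv('../feature/{}_test.csv'.format(name), index=False)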
def add_new_example_props():
    # Get property info
    path = '../data/property_info.csv'
    prop_dicts = read_csv(path)
    props_in_info = [d['property'] for d in prop_dicts]
    header = prop_dicts[0].keys()

    # Get example properties
    ex_files = glob.glob('../examples/*-pairs.csv')
    test_files = glob.glob('../data/test/*/*.csv')
    ex_files.extend(test_files)

    p_targets = ['prop_pos', 'prop_neg', 'property']
    for f in ex_files:
        dl = read_csv(f)
        for d in dl:
            for t in p_targets:
                if t in d:
                    prop = d[t]
                    if prop != '' and prop not in props_in_info:
                        print(f'"{prop}" needs annotation!')
                        new_d = dict()
                        new_d['property'] = prop
                        # fill remaining columns with a placeholder
                        for h in header:
                            if h not in new_d:
                                new_d[h] = 'NEEDS INFO'
                        if new_d not in prop_dicts:
                            prop_dicts.append(new_d)
    print(f'Add info to added properties in: {path}')
    to_csv(path, prop_dicts)
def main(args):
    start = time.time()
    input_train = pnd.read_csv(utils.get_corr_lemm_path(args.label))
    input_test = pnd.read_csv(utils.get_corr_lemm_path(args.label, test=True))
    y = pnd.read_csv(utils.get_labels_path(), sep=';')[params.LABELS_COL].values
    X_train, X_val, y_train, y_val = train_test_split(
        input_train, y, test_size=0.2, random_state=42)

    tokenizer = Tokenizer()
    vectorizer = MyVectorizer(is_sparse=True)
    pca = TruncatedSVD()
    svm = SVC(random_state=42)

    # Cache pipeline steps to avoid recomputing them
    cachedir = mkdtemp()
    pipe = Pipeline(
        [
            ('tokenizer', tokenizer),
            ('vectorizer', vectorizer),
            # ('pca', pca),
            ('svm', svm)
        ],
        memory=cachedir)

    params_grid = dict(
        tokenizer__do_clustering=[True],
        tokenizer__n_clusters=[2, 3, 4, 5],
        tokenizer__max_df=[1.],
        vectorizer__max_df=[1.],
        vectorizer__max_features=[None],
        svm__C=[40.],
        svm__gamma=[0.05],
        svm__kernel=['rbf'],
        # pca__n_components=[1000, 2000, 3000, 4000],
    )

    params_pipe = dict(
        tokenizer__do_clustering=True,
        tokenizer__n_clusters=2,        # alternatives tried: 5, 20
        tokenizer__max_df=1.,           # alternative: 0.2
        vectorizer__max_df=1.,          # alternative: 0.05
        vectorizer__max_features=None,
        svm__C=40.,                     # alternatives: 100., 10., 1.
        svm__gamma=0.05,                # alternatives: 0.05, 0.1
        svm__kernel='rbf',
        # pca__n_components=3000,
    )

    pipe.set_params(**params_pipe)
    pipe.fit(input_train, y)
    # note: X_val was drawn from input_train above, so this score is optimistic
    print(pipe.score(X_val, y_val))

    predictions = pipe.predict(input_test)
    utils.to_csv(
        predictions,
        './results/%s/svm/y_pred.csv'
        % datetime.strftime(datetime.now(), "%Y_%m_%dT%H_%M_%S"))

    # grid_search = GridSearchCV(pipe, n_jobs=3, cv=3, param_grid=params_grid, verbose=3)
    # grid_search.fit(X_train, y_train)
    # utils.write_results(grid_search)

    print("It took %.3f" % (time.time() - start))
def to_file(self, overwrite_existing=True):
    # e.g. filepath = 'questions/run_TEST-all-restrict_True.csv'
    filepath = f"../questions/run{self.run}-all-restricted_{self.restrict}.csv"
    if os.path.isfile(filepath) and not overwrite_existing:
        print('ATTENTION: run already exists. If you want to overwrite, '
              'set overwrite_existing to True.')
    else:
        utils.to_csv(filepath, self.questions)
        print(f'{len(self.questions)} questions written to: {filepath}')
def multi(p):
    train_ = train.copy()
    test_ = test.copy()
    # each worker handles a 300-word slice of the 3000 candidate words
    ix = list(range(0, 3000, 300))[p]
    words_ = words[ix:ix + 300]
    for w in words_:
        train_['BOW_' + w] = (train_['q1'].map(lambda x: w in x.split()) * 1
                              + train_['q2'].map(lambda x: w in x.split()) * 1)
        test_['BOW_' + w] = (test_['q1'].map(lambda x: w in x.split()) * 1
                             + test_['q2'].map(lambda x: w in x.split()) * 1)
    utils.to_csv(train_, test_, 'f009-word-{0}'.format(p))
def correct_check(path, question_dicts):
    # write a backup before modifying anything
    path_backup = path.replace('.csv', '-backup.csv')
    to_csv(path_backup, question_dicts)
    for d in question_dicts:
        if (d['relation'] == 'affording_activity'
                and d['property'].startswith('made_of')):
            print(d['question'])
            d['question'] = d['question'].replace('I know that being (a/an)',
                                                  'I know that being')
            print(d['question'])
    to_csv(path, question_dicts)
def add_quot_marks(question_dicts, path, rel='creative'):
    relevant_keys = ['question', 'example_pos', 'example_neg']
    for d in question_dicts:
        if d['relation'] == rel:
            for k in relevant_keys:
                phrase = d[k]
                print(phrase)
                phrase = phrase.replace('say (a/an)', 'say ``(a/an)')
                phrase = phrase.replace(', but I', '", but I')
                print(phrase)
                d[k] = phrase
    to_csv(path, question_dicts)
def get_tfidf_values(self, page_id_list, term_id_list):
    """
    Returns a list of (PageID, TermID, Tfidf)
    """
    v1 = to_csv(page_id_list)
    v2 = to_csv(term_id_list)
    self._cur.execute("""
        SELECT PageID, TermID, Tfidf
        FROM TfidfValues
        WHERE PageID IN (%s) AND TermID IN (%s);
    """ % (v1, v2))
    return self._cur.fetchall()
def get_term_occurrences(self, page_id_list, term_id_list):
    """
    Returns a list of (PageID, TermID, Counter)
    """
    v1 = to_csv(page_id_list)
    v2 = to_csv(term_id_list)
    self._cur.execute("""
        SELECT PageID, TermID, Counter
        FROM TermOccurrences
        WHERE PageID IN (%s) AND TermID IN (%s);
    """ % (v1, v2))
    return self._cur.fetchall()
def update_examples(question_dicts, label, ex_dict, rel, path):
    for d in question_dicts:
        if (d[f'concept_{label}'] == ex_dict['concept_old']
                and d[f'prop_{label}'] == ex_dict['prop_old']
                and d['relation'] == rel):
            ex = d[f'example_{label}']
            print('old', ex)
            d[f'concept_{label}'] = ex_dict['concept_new']
            d[f'prop_{label}'] = ex_dict['prop_new']
            new_ex = ex.replace(ex_dict['concept_old'], ex_dict['concept_new'])
            new_ex = new_ex.replace(ex_dict['prop_old'], ex_dict['prop_new'])
            if new_ex[-1] != '.':
                new_ex = new_ex + '.'
            d[f'example_{label}'] = new_ex
            print('new', d[f'example_{label}'])
    to_csv(path, question_dicts)
def get_page_data(self, page_id_list):
    """
    Returns a list of (PageID, PageName, Length)
    """
    var_string = to_csv(page_id_list)
    self._cur.execute("""
        SELECT PageID, PageName, Length
        FROM Pages
        WHERE PageID IN (%s);
    """ % var_string)
    return self._cur.fetchall()
def get_document_frequencies(self, term_id_list):
    """
    Returns a list of (TermID, DocumentFrequency)
    """
    var_string = to_csv(term_id_list)
    self._cur.execute("""
        SELECT TermID, DocumentFrequency
        FROM DocumentFrequencies
        WHERE TermID IN (%s);
    """ % var_string)
    return self._cur.fetchall()
def get_tfidf_totals(self, page_id_list):
    """
    Returns a list of (PageID, Total)
    """
    var_string = to_csv(page_id_list)
    self._cur.execute("""
        SELECT PageID, Total
        FROM TfidfTotals
        WHERE PageID IN (%s);
    """ % var_string)
    return self._cur.fetchall()
def get_term_names(self, term_id_list):
    """
    Returns a list of (TermID, TermName)
    """
    var_string = to_csv(term_id_list)
    self._cur.execute("""
        SELECT TermID, TermName
        FROM Terms
        WHERE TermID IN (%s);
    """ % var_string)
    return self._cur.fetchall()
def get_page_ids(self, page_name_list):
    """
    Returns a list of (PageName, PageID)
    """
    var_string = to_csv(page_name_list)
    self._cur.execute("""
        SELECT PageName, PageID
        FROM Pages
        WHERE PageName IN (%s);
    """ % var_string)
    return self._cur.fetchall()
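# The database helpers above use a second, unrelated to_csv: it renders a
# Python list as a comma-separated value string for a SQL IN clause, and with
# separate=True as "(v1),(v2),..." row tuples for batch INSERTs (as used in
# add_page_index below). A minimal sketch, assuming the real helper also
# escapes values before interpolation, which this one does not:
def to_csv(values, separate=False):
    if separate:
        return u','.join(u"('%s')" % v for v in values)
    return u','.join(u"'%s'" % v for v in values)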
def get_relations(run):
    dicts = []
    filepath = f'../templates/relation_overview_run{run}.csv'
    collection_relation_question_dict, level_relation_dict = read_template(run)
    for l, rels in level_relation_dict.items():
        l = int(l)
        if l == 1:
            l_name = 'all'
        elif l == 2:
            l_name = 'some'
        elif l == 3:
            l_name = 'few'
        for r in rels:
            d = dict()
            if r == 'creative':
                d['level'] = 'creative'
            else:
                d['level'] = l_name
            d['relation'] = r
            dicts.append(d)
    to_csv(filepath, dicts, header=True)
def lambda_handler(event, context):
    "Lambda entry point"
    with open('config.json') as data_file:
        CONFIG = json.load(data_file)

    # Read the task completion sheet and filter current quarter data
    spreadsheet_data, headers = utils.read_sheet()
    filtered_data = utils.filter_current_quarter_data(spreadsheet_data)

    # Render the data as CSV
    complete_data = utils.to_csv(spreadsheet_data, headers)
    current_quarter_data = utils.to_csv(filtered_data, headers)

    # Upload the csv files to the s3 bucket
    utils.upload_to_s3(complete_data, CONFIG['complete_sheet_s3_key'])
    utils.upload_to_s3(current_quarter_data, CONFIG['current_quarter_s3_key'])

    # Prepare the data to initiate transfer to dynamodb
    prepared_complete_data = utils.prepare_data(
        CONFIG['complete_sheet_s3_key'])
    prepared_quarter_data = utils.prepare_data(
        CONFIG['current_quarter_s3_key'])

    # Store the complete task completion sheet data if dynamodb is empty
    utils.migrate_to_dynamodb(prepared_complete_data)

    # Update dynamodb with edits to the current quarter data
    utils.update_dynamodb(prepared_quarter_data)

    return "Read task completion sheet and populated dynamodb"
def get_page_links(self, page_id_list):
    """
    Returns a list of (PageID, TargetPageID, LinkCounter)
    """
    # Attempting to speed this up with a TargetID IN will
    # not work because there is no Index available on TargetID
    var_string = to_csv(page_id_list)
    self._cur.execute("""
        SELECT PageID, TargetPageID, Counter
        FROM PageLinks
        WHERE PageID IN (%s);
    """ % (var_string, ))
    return self._cur.fetchall()
def fetch_and_compose(sha_issuekey):
    sha, issue_key = sha_issuekey
    try:
        res = fetch_issue(issue_key)
    except Exception:
        # ignore communication failure
        return
    if not RAW:
        res = get_filtered(res)
    res['commit'] = sha
    if CSV:
        res = utils.to_csv([v for _, v in sorted(res.items())])
    else:
        res = json.dumps(res, sort_keys=True, separators=(',', ':'))
    utils.output(res)
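# fetch_and_compose expects yet another to_csv variant: one that returns a
# single CSV-formatted line for a list of values. A minimal sketch using the
# stdlib csv module (hypothetical; the project's utils module is not shown):
import csv
import io

def to_csv(values):
    buf = io.StringIO()
    csv.writer(buf).writerow(values)
    return buf.getvalue().rstrip('\r\n')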
    di = {c: c + suf for c in df.columns if '_from_the1owl' in c}
    df = df.rename(columns=di)
    col = [c for c in df.columns if '_from_the1owl' in c or 'id' in c]
    return df[col]

#==============================================================================
train, test = utils.load(0)
train = main(train, '').fillna(-1)
test = main(test, '').fillna(-1)
utils.to_csv(train, test, 'f102-0')

train, test = utils.load(1)
train = main(train, '-stem').fillna(-1)
test = main(test, '-stem').fillna(-1)
utils.to_csv(train, test, 'f102-1')

train, test = utils.load(2)
train = main(train, '-stop').fillna(-1)
test = main(test, '-stop').fillna(-1)
utils.to_csv(train, test, 'f102-2')

train, test = utils.load(3)
train = main(train, '-stst').fillna(-1)
test = main(test, '-stst').fillna(-1)
utils.to_csv(train, test, 'f102-3')
    df['q2_large_share_ratio'] = df['q2_large_share'] / df['q2_large_len']
    col = df.dtypes[df.dtypes != 'object'].index.tolist()
    return df[col]

"""
df = train.sample(999)
"""

#==============================================================================
# main
#==============================================================================
train = main(train)
test = main(test)
utils.to_csv(train, test, 'f008')

print("""#==============================================================================
# SUCCESS !!! {}
#==============================================================================
""".format(__file__))
def add_page_index(terms, page, intra_links):
    term_list = Counter(terms)
    doc_length = sum(term_list.values())
    if doc_length >= MIN_PAGE_LENGTH:
        cur.execute("""
            SELECT PageID, Processed
            FROM Pages
            WHERE PageName=%s;
        """, (page, ))
        rows = cur.fetchone()
        cur.fetchall()
        if rows:
            page_id, processed = rows
            if processed:
                # No decent way to resolve this conflict (Issue #76)
                return
            else:
                cur.execute("""
                    UPDATE Pages
                    SET PageName=%s, Length=%s, Processed=TRUE
                    WHERE PageID=%s;
                """, (page, doc_length, page_id))
        else:
            # The IGNORE clause is a hack to prevent a select statement
            cur.execute("""
                INSERT IGNORE INTO Pages (PageName, Length, Processed)
                VALUES (%s, %s, TRUE);
            """, (page, doc_length))
            page_id = cur.lastrowid
            # There was a duplicate entry case
            if page_id <= 0:
                return

        filtered_term_list = [a for (a, b) in term_list.items() if b > 1]
        var_string = to_csv(filtered_term_list, separate=True)
        cur.execute("""
            INSERT IGNORE INTO Terms (TermName)
            VALUES %s;
        """ % var_string)

        var_string = to_csv(filtered_term_list, separate=False)
        cur.execute("""
            SELECT TermID, TermName
            FROM Terms
            WHERE TermName IN (%s);
        """ % var_string)
        term_results = cur.fetchall()

        if term_results:
            termids = [(tid, term_list[name]) for (tid, name) in term_results]
            var_string = u'({},%s,%s),'.format(page_id) * len(term_results)
            var_string = var_string[:-1]
            cur.execute("""
                INSERT INTO TermOccurrencesTemp (PageID, TermID, Counter)
                VALUES %s;
            """ % var_string, itertools.chain.from_iterable(termids))

        var_string = u''
        page_links = {}
        for link, counter in intra_links.items():
            cur.execute("""
                INSERT INTO Pages (PageName, Processed)
                VALUES (%s, FALSE)
                ON DUPLICATE KEY UPDATE PageID=LAST_INSERT_ID(PageID);
            """, (link, ))
            # Handles conflicts gracefully using a dictionary
            target_page_id = cur.lastrowid
            if target_page_id not in page_links:
                page_links[target_page_id] = 0
            page_links[target_page_id] += counter

        # Generate the link pairs from the built dictionary
        for target_page_id, counter in page_links.items():
            var_string += u'(%d,%d,%d),' % (page_id, target_page_id, counter)

        # Perform one large batch insert rather than individual inserts
        if var_string:
            var_string = var_string[:-1]
            cur.execute("""
                INSERT INTO PageLinks (PageID, TargetPageID, Counter)
                VALUES %s;
            """ % var_string)
def opinions_per_month_per_city():
    return (session.query(func.date_part('month', Opinion.date), City.name,
                          func.count('*'))
            .select_from(Opinion).join(Hotel).join(Address).join(City)
            .group_by(City.name, func.date_part('month', Opinion.date))
            .all())


def make_histogram(iterable, low, high, bins, shift):
    step = (high - low + 0.0) / bins
    dist = Counter((float(x) - low + shift) // step for x in iterable)
    return [dist[b] for b in range(bins)]


subq1 = (session.query(City.name, City.id,
                       func.count(Hotel.id).label('count'))
         .select_from(Opinion).join(Hotel).join(Address).join(City)
         .group_by(City.id, Hotel.id).subquery())
subq2 = (session.query(Hotel.stars, func.count(Hotel.id).label('count'))
         .select_from(Opinion).join(Hotel).group_by(Hotel.id).subquery())

result = opinions_per_tag()
utils.to_csv(result, "opinions_per_tag.out")
result = hotel_dist(Hotel.stars)
utils.to_csv(result, "hotel_dist_stars.out")
result = hotel_dist(Hotel.price_level)
utils.to_csv(result, "hotel_dist_price.out")
result = avg_ops(subq1.c.name, subq1)
utils.to_csv(result, "avg_ops_hotel_cities.out")
result = avg_ops(subq2.c.stars, subq2)
utils.to_csv(result, "avg_ops_hotel_stars.out")
result = city_hotel_dist(Hotel.stars)
utils.to_csv(result, "city_hotel_stars.out")
    df['fuzz_partial_token_sort_ratio' + suf] = df.apply(
        lambda x: fuzz.partial_token_sort_ratio(str(x['q1']), str(x['q2'])),
        axis=1)
    df['fuzz_token_set_ratio' + suf] = df.apply(
        lambda x: fuzz.token_set_ratio(str(x['q1']), str(x['q2'])), axis=1)
    df['fuzz_token_sort_ratio' + suf] = df.apply(
        lambda x: fuzz.token_sort_ratio(str(x['q1']), str(x['q2'])), axis=1)
    return df

#==============================================================================
train, test = utils.load(0)
train = main(train, '')
test = main(test, '')
utils.to_csv(train, test, 'f100-0')
del train, test
gc.collect()

train, test = utils.load(1)
train = main(train, '-stem')
test = main(test, '-stem')
utils.to_csv(train, test, 'f100-1')
del train, test
gc.collect()

train, test = utils.load(2)
train = main(train, '-stop')
test = main(test, '-stop')
utils.to_csv(train, test, 'f100-2')
del train, test
def print_top_terms(clf_descr, category, cpairs):
    to_csv(os.path.join(opts.output_dir,
                        "feat_%s_%s.csv" % (clf_descr, category)),
           [(re.sub('^[A-Za-z]+__', '', t).encode("utf-8"), c)
            for t, c in cpairs])
def fit(self, ratings: np.ndarray, validset: np.ndarray = None,
        epochs: int = 20) -> None:
    """
    Arguments
    ---------
    ratings : np.ndarray
        ratings matrix for training (i.e. train dataset)
    validset : np.ndarray
        validation dataset
    epochs : int
        number of iterations
    """
    # average of ratings
    self.mean_rating = np.mean(ratings[:, 2])

    # minimum of ratings
    if self.min_rating > np.min(ratings[:, 2]):
        self.min_rating = np.min(ratings[:, 2])

    best_loss = 0

    # initialize user gradient & momentum
    user_feature_grads, user_feature_mom = (
        np.zeros((self.n_user, self.n_feature)),
        np.zeros((self.n_user, self.n_feature)),
    )
    # initialize item gradient & momentum
    item_feature_grads, item_feature_mom = (
        np.zeros((self.n_item, self.n_feature)),
        np.zeros((self.n_item, self.n_feature)),
    )

    batch_num = int(np.ceil(ratings.shape[0] / self.batch_size))
    start_time = time.time()
    self.train_losses = []
    self.valid_losses = []

    for epoch in range(1, epochs + 1):
        # dataset shuffling
        np.random.shuffle(ratings)

        for batch_idx in range(batch_num):
            start_idx = int(batch_idx * self.batch_size)
            end_idx = int((batch_idx + 1) * self.batch_size)
            batch = ratings[start_idx:end_idx]

            # compute gradient
            user_ids = batch.take(0, axis=1).astype(int)
            item_ids = batch.take(1, axis=1).astype(int)
            u_features = self.user_features.take(user_ids, axis=0)
            i_features = self.item_features.take(item_ids, axis=0)

            outputs = np.sum(u_features * i_features, axis=1)
            errs = outputs - (batch.take(2, axis=1) - self.mean_rating)
            err_mat = np.tile(2 * errs, (self.n_feature, 1)).T

            user_grads = i_features * err_mat + self.reg * u_features
            item_grads = u_features * err_mat + self.reg * i_features

            # clear all gradients
            user_feature_grads.fill(0.0)
            item_feature_grads.fill(0.0)

            # accumulate per-user / per-item gradients over the batch
            for idx in range(batch.shape[0]):
                user_id, item_id, rating = batch[idx]
                user_id, item_id = int(user_id), int(item_id)
                user_feature_grads[user_id, :] += user_grads[idx]
                item_feature_grads[item_id, :] += item_grads[idx]

            # update momentum
            user_feature_mom = (self.momentum * user_feature_mom
                                + self.learning_rate * user_feature_grads)
            item_feature_mom = (self.momentum * item_feature_mom
                                + self.learning_rate * item_feature_grads)

            # update user/item matrix
            self.user_features -= user_feature_mom
            self.item_features -= item_feature_mom

        # rmse train loss
        train_preds = self.predict(ratings[:, :2])
        train_loss = rmse(train_preds, ratings[:, 2])
        self.train_losses.append(train_loss)

        if validset is None:
            print(f"elapsed: {time_since(start_time)} | epoch: {epoch:03d} "
                  f"| train RMSE: {train_loss:.6f}")
        else:
            valid_preds = self.predict(validset[:, :2])
            valid_loss = rmse(valid_preds, validset[:, 2])
            self.valid_losses.append(valid_loss)
            print(f"elapsed: {time_since(start_time)} | epoch: {epoch:03d} "
                  f"| train RMSE: {train_loss:.6f} "
                  f"| valid RMSE: {valid_loss:.6f}")

            # save csv and weights whenever the validation loss improves
            if best_loss == 0 or valid_loss < best_loss:
                best_loss = valid_loss
                result_dir = f"{self.save_dir}/results"
                weight_dir = f"{self.save_dir}/weights"
                to_csv(f"{result_dir}/output_val_pmf.csv", valid_preds,
                       validset, header=True)
                self._save_model(weight_dir)
                print(f"saved result and model weights at valid RMSE "
                      f"{best_loss:.6f}")

    return None
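# For reference, the update in fit() is plain minibatch SGD with momentum on
# the regularized squared error of a matrix factorization model. With
# mean-centred rating r' = r - mean_rating and prediction u.v, the
# per-example loss and gradients the code implements are:
#
#   L(u, v) = (u.v - r')^2 + (reg / 2) * (||u||^2 + ||v||^2)
#   dL/du   = 2 * (u.v - r') * v + reg * u    -> user_grads
#   dL/dv   = 2 * (u.v - r') * u + reg * v    -> item_grads
#
# which is why err_mat tiles 2 * errs across the feature dimension before
# the elementwise multiplication with the feature rows.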
    return df

#==============================================================================
model = gensim.models.KeyedVectors.load_word2vec_format(
    '../nlp_source/GoogleNews-vectors-negative300.bin.gz', binary=True)
norm_model = gensim.models.KeyedVectors.load_word2vec_format(
    '../nlp_source/GoogleNews-vectors-negative300.bin.gz', binary=True)
norm_model.init_sims(replace=True)

train, test = utils.load(0)
train = main(train, '')
test = main(test, '')
utils.to_csv(train, test, 'f003-0')
del train, test
gc.collect()

train, test = utils.load(1)
train = main(train, '-stem')
test = main(test, '-stem')
utils.to_csv(train, test, 'f003-1')
del train, test
gc.collect()

train, test = utils.load(2)
train = main(train, '-stop')
test = main(test, '-stop')
utils.to_csv(train, test, 'f003-2')
del train, test
    results.append(benchmark(
        SGDClassifier(loss='log', alpha=1e-4, n_iter=50, penalty=penalty),
        "SGD_" + penalty.upper() + "_std"))

# print('=' * 80)
# print("SGD with elasticnet penalty")
# results.append(benchmark(
#     SGDClassifier(loss='hinge', alpha=1e-4, n_iter=50, penalty='elasticnet',
#                   l1_ratio=0.10),
#     "SGD (elasticnet penalty)"))

print('=' * 80)
print("SGD L1 feature selection")
clf = with_l1_feature_selection(
    SGDClassifier, loss='log', alpha=0.00021, n_iter=10
)(loss='log', alpha=.0001, n_iter=50)
results.append(benchmark(clf, "SGD_L1_featsel"))

# print('=' * 80)
# print("Radial kernel svc")
# results.append(benchmark(SVC(kernel='rbf')))

if opts.output_roc:
    print("Writing ROC curve data")
    to_csv(opts.output_roc, all_roc_data)

if opts.output:
    print("Writing scores data")
    to_csv(opts.output, results)
    utils.to_csv(train_, test_, 'f009-word-{0}'.format(p))

pool = mp.Pool(total_proc)
callback = pool.map(multi, range(total_proc))

#==============================================================================
# ents
#==============================================================================
train, test = utils.load(3)
files = sorted(glob('../nlp_source/ent*'))
for f in files:
    words = pd.read_csv(f).head(30).word.tolist()
    for w in words:
        train['ent_' + w] = (
            train['q1'].map(lambda x: w.lower() in x.lower().split()) * 1
            + train['q2'].map(lambda x: w.lower() in x.lower().split()) * 1)
        test['ent_' + w] = (
            test['q1'].map(lambda x: w.lower() in x.lower().split()) * 1
            + test['q2'].map(lambda x: w.lower() in x.lower().split()) * 1)
utils.to_csv(train, test, 'f009-ent')

print("""#==============================================================================
# SUCCESS !!! {}
#==============================================================================
""".format(__file__))
def get_basic_csv():
    """Get the basic servant info shown on the front page"""
    servants = get_basic_info_of_servants(as_raw=True)
    to_csv('data/servants.csv', servants)
def update_log(new_log_dict):
    path = '../task_set_up/experiment_log.csv'
    log_dicts = read_csv(path)
    log_dicts.append(new_log_dict)
    to_csv(path, log_dicts)
    print(f'updated log: {path}')