def _papers_citations_number_by_year_sframe(without_self_citation=True):
    """
    Get papers' total number of citations in each year
    :param without_self_citation: if True calculate only non-self citations,
        otherwise calculate with self-citations
    :return: SFrame with a column that contains citations_dict by year
    """
    logger.info("Creating Paper Citations by Year (without_self_citation=%s)" % without_self_citation)
    ref_sf = tc.load_sframe(EXTENDED_PAPER_REFERENCES_SFRAME)
    if without_self_citation:
        ref_sf = ref_sf[ref_sf['self citation'] == 0]

    sf = tc.load_sframe(PAPERS_SFRAME)["Paper ID", "Paper publish year"]
    sf = ref_sf.join(sf, on="Paper ID")
    g = sf.groupby(["Paper reference ID", "Paper publish year"],
                   {"Citation Number": agg.COUNT()})
    g = g.rename({"Paper publish year": "Year", "Paper reference ID": "Paper ID"})
    g['Citation by Year'] = g.apply(lambda r: (r["Year"], r["Citation Number"]))
    h = g.groupby('Paper ID',
                  {'Citation by Years': tc.aggregate.CONCAT('Citation by Year')})
    if without_self_citation:
        h['Total Citations by Year without Self Citations'] = h['Citation by Years'].apply(
            lambda l: _get_total_citation_by_year(l))
    else:
        h['Total Citations by Year'] = h['Citation by Years'].apply(
            lambda l: _get_total_citation_by_year(l))
    h = h.remove_column("Citation by Years")
    return h
def create_aminer_mag_links_by_doi_sframe():
    """
    Create a links SFrame that matches papers from the MAG dataset with papers
    from the AMiner dataset based on the papers' DOI
    :return:
    """
    if os.path.isdir(AMINER_MAG_JOIN_SFRAME):
        return
    sf = tc.load_sframe(EXTENDED_PAPERS_SFRAME)
    g1 = sf.groupby('Paper Document Object Identifier (DOI)', {'Count': agg.COUNT()})
    s1 = set(g1[g1['Count'] > 1]['Paper Document Object Identifier (DOI)'])
    sf = sf[sf['Paper Document Object Identifier (DOI)'].apply(lambda doi: doi not in s1)]
    sf.materialize()

    sf2 = tc.load_sframe(AMINER_PAPERS_SFRAME)
    g2 = sf2.groupby('doi', {'Count': agg.COUNT()})
    s2 = set(g2[g2['Count'] > 1]['doi'])
    sf2 = sf2[sf2['doi'].apply(lambda doi: doi not in s2)]
    sf2.materialize()

    j = sf.join(sf2, {'Paper Document Object Identifier (DOI)': 'doi'})
    j['title_len'] = j['title'].apply(lambda t: len(t))
    j['title_len2'] = j['Original paper title'].apply(lambda t: len(t))
    j = j[j['title_len'] > 0]
    j = j[j['title_len2'] > 0]
    j = j.rename({"Paper ID": "MAG Paper ID", "id": "Aminer Paper ID"})
    j = j.remove_columns(['title_len', 'title_len2'])
    j.save(AMINER_MAG_JOIN_SFRAME)
def coauthors_links_sframe(self):
    if self._co_authors_links is not None:
        return self._co_authors_links
    if os.path.isdir(CO_AUTHORSHIP_LINK_SFRAME):
        self._co_authors_links = tc.load_sframe(CO_AUTHORSHIP_LINK_SFRAME)
    else:
        self._co_authors_links = tc.load_sframe(CO_AUTHORSHIP_LINK_S3_SFRAME)
    return self._co_authors_links
def _get_all_papers_sframe(self):
    """
    Return SFrame with all the papers published in the venue
    :return: Papers SFrame with all the papers details that were published in the venue
    :rtype: tc.SFrame
    @note: The SFrame object was created by academic_parser.create_venue_papers_sframe
    """
    if self.venue_type == VenueType.journal:
        return tc.load_sframe(
            "%s/%s.sframe" % (JOURNALS_PAPERS_SFRAMES_DIR, self._venue_id))
    elif self.venue_type == VenueType.conference:
        return tc.load_sframe(
            "%s/%s.sframe" % (CONFERENCES_PAPERS_SFRAMES_DIR, self._venue_id))
def classify_page():
    # try:
    data_id = request.args.get('data_id')
    my_data = UserData.query.filter_by(id=data_id).first()
    my_model = TrainedModel()
    form = TrainModelForm(request.form, obj=my_model)
    data_frame = tc.load_sframe(my_data.sname)
    target = None
    cols = []
    display_cols = []
    names = data_frame.column_names()
    types = data_frame.column_types()
    for col_name in names:
        cols.append(str(col_name))
    if request.method == 'POST':
        target = request.form['target']
        data_frame = data_frame.dropna(str(target), how="all")
        orig_data = data_frame[str(target)]
        norig_data = orig_data.to_numpy()
        classes = []
        for data in norig_data:
            appended = False
            for x in range(1, int(request.form['num_brackets']) + 1):
                if float(request.form['lrange_' + str(x)]) <= float(data) <= float(request.form['urange_' + str(x)]):
                    print(request.form['class_' + str(x)])
                    classes.append(request.form['class_' + str(x)])
                    appended = True
                    # stop after the first matching bracket so each row gets exactly one class
                    break
            if not appended:
                classes.append("unknown")
        data_frame = safely_add_col(str(request.form['field']), classes, data_frame)
        fwd_id = save_data(my_data, request.form['name'], data_frame)
        flash('Successfully transformed the data set!', 'success')
        return redirect(url_for('data.data_details_page', data_id=fwd_id))
    return render_template('pages/data/transforms/classifier.html', my_data=my_data, form=form,
                           data_frame=data_frame, names=names, types=types, target=target, cols=cols)
def test_real_data(self):
    """
    This test is excluded from regular build process. Only used for manual verification.
    """
    train_path = _os.path.join(_lfs, 'gl-internal', 'internal-testdata', 'traindata3916.sframe')
    test_path = _os.path.join(_lfs, 'gl-internal', 'internal-testdata', 'testdata3916.sframe')
    train_data = tc.load_sframe(train_path)
    test_data = tc.load_sframe(test_path)
    m = tc.boosted_trees_regression.create(train_data, target='is_cv', max_iterations=1,
                                           validation_set=None, max_depth=9)
    self._check_json_model_predict_consistency(m, test_data)
def split_page():
    try:
        data_id = request.args.get('data_id')
        my_data = UserData.query.filter_by(id=data_id).first()
        my_model = TrainedModel()
        form = TrainModelForm(request.form, obj=my_model)
        data_frame = tc.load_sframe(my_data.sname)
        if request.method == 'POST':
            training_set, test_set = data_frame.random_split(float(request.form['percent']), seed=0)
            save_data(my_data, request.form['train'], training_set)
            save_data(my_data, request.form['test'], test_set)
            flash('Successfully created train/test split for ' + my_data.name + '!', 'success')
            return redirect(url_for('main.my_project_page', project_id=my_data.project_id))
        return render_template('pages/data/transforms/split.html', my_data=my_data, form=form,
                               data_frame=data_frame)
    except Exception as e:
        flash('Oops! Something unexpected happened. On the bright side, we logged the error and will '
              'absolutely look at it and work to correct it, ASAP.', 'error')
        error = ErrorLog()
        error.user_id = current_user.id
        error.error = str(e.__class__)
        error.parameters = request.args
        db.session.add(error)
        db.session.commit()
        return redirect(request.referrer)
def authors_features(self):
    """
    Create Authors SFrame in which each row is a unique AuthorId with the author's various features
    :return: SFrame with Authors features
    :rtype: tc.SFrame
    """
    p_sf = self._p_sf[['PaperId']]  # 22082741
    a_sf = self._mag.paper_author_affiliations["AuthorId", "PaperId"]
    a_sf = a_sf.join(p_sf, on="PaperId")
    a_sf = a_sf[["AuthorId"]].unique()

    g = self.get_authors_papers_dict_sframe()
    a_sf = a_sf.join(g, on="AuthorId", how="left")  # 22443094 rows
    g = self.get_co_authors_dict_sframe()
    a_sf = a_sf.join(g, on="AuthorId", how='left')

    author_names = self._mag.author_names
    author_names["First Name"] = author_names["NormalizedName"].apply(lambda x: x.split(" ")[0])
    a_sf = a_sf.join(author_names, on="AuthorId", how="left")
    g_sf = tc.load_sframe(str(FIRST_NAMES_SFRAME))
    a_sf = a_sf.join(g_sf, on={"First Name": "First Name"}, how="left")

    feature_names = [("AffiliationId", "Affilation by Year Dict"),
                     ('AuthorSequenceNumber', 'Sequence Number by Year Dict'),
                     ("ConferenceSeriesId", "Conference ID by Year Dict"),
                     ("JournalId", "Journal ID by Year Dict"),
                     ("OriginalVenue", "Venue by Year Dict")]
    for fname, col_name in tqdm(feature_names):
        f_sf = self._get_author_feature_by_year_sframe(fname, col_name)
        a_sf = a_sf.join(f_sf, on="AuthorId", how='left')
    return a_sf
def remove_columns_page():
    try:
        data_id = request.args.get('data_id')
        my_data = UserData.query.filter_by(id=data_id).first()
        my_model = TrainedModel()
        form = TrainModelForm(request.form, obj=my_model)
        data_frame = tc.load_sframe(my_data.sname)
        if request.method == 'POST':
            features_utf = request.form.getlist('features')
            features_str = []
            for feat in features_utf:
                features_str.append(str(feat))
            sframe = data_frame.remove_columns(features_str)
            fwd_id = save_data(my_data, request.form['name'], sframe)
            flash('Data transform is successful!', 'success')
            return redirect(url_for('data.data_details_page', data_id=fwd_id))
        return render_template('pages/data/transforms/remove_columns.html', my_data=my_data, form=form,
                               data_frame=data_frame, names=data_frame.column_names(),
                               types=data_frame.column_types())
    except Exception as e:
        flash('Oops! Something unexpected happened. On the bright side, we logged the error and will '
              'absolutely look at it and work to correct it, ASAP.', 'error')
        error = ErrorLog()
        error.user_id = current_user.id
        error.error = str(e.__class__)
        error.parameters = request.args
        db.session.add(error)
        db.session.commit()
        return redirect(request.referrer)
def unique_page():
    try:
        data_id = request.args.get('data_id')
        my_data = UserData.query.filter_by(id=data_id).first()
        my_model = TrainedModel()
        form = TrainModelForm(request.form, obj=my_model)
        data_frame = tc.load_sframe(my_data.sname)
        if request.method == 'POST':
            new_id = str(request.form['new_id'])
            name = str(request.form['name'])
            sf = data_frame.add_row_number(new_id)
            fwd_id = save_data(my_data, name, sf)
            flash('Successfully transformed the data!', 'success')
            return redirect(url_for('data.data_details_page', data_id=fwd_id))
        return render_template('pages/data/transforms/unique.html', my_data=my_data,
                               data_frame=data_frame, form=form)
    except Exception as e:
        flash('Oops! Something unexpected happened. On the bright side, we logged the error and will '
              'absolutely look at it and work to correct it, ASAP.', 'error')
        error = ErrorLog()
        error.user_id = current_user.id
        error.error = str(e.__class__)
        error.parameters = request.args
        db.session.add(error)
        db.session.commit()
        return redirect(request.referrer)
def web_api_page():
    try:
        model_id = request.args.get('model_id')
        my_model = TrainedModel.query.filter_by(id=model_id).first()
        my_data = UserData.query.filter_by(id=my_model.data_id).first()
        # compare values, not object identity
        if my_data.user_id != current_user.id:
            flash('Oops! No data found', 'error')
            return redirect(request.referrer)
        data_frame = tc.load_sframe(my_data.sname)
        names = data_frame.column_names()
        types = data_frame.column_types()
        type_map = {}
        for x in range(len(names)):
            type_map[str(names[x])] = types[x]
        example_json = {}
        for feature in my_model.features['features']:
            example_json[feature] = type_map[feature].__name__
        return render_template('pages/models/web_api.html', my_data=my_data, type_map=type_map,
                               example_json=json.dumps(example_json, sort_keys=True, indent=4,
                                                       separators=(',', ': ')),
                               my_model=my_model)
    except Exception as e:
        flash('Oops! Something unexpected happened. On the bright side, we logged the error and will '
              'absolutely look at it and work to correct it, ASAP.', 'error')
        error = ErrorLog()
        error.user_id = current_user.id
        error.error = str(e.__class__)
        error.parameters = request.args
        db.session.add(error)
        db.session.commit()
        return redirect(request.referrer)
def __init__(self):
    self.imgframe = tc.load_sframe('model/final/final.sframe')
    self.model = tc.load_model('model/final/final_model')
    self.sample = tc.Image()
    self.results = SFrame()
    self.rows = SArray()
    self.pathlist = []
    self.distance_list = []
def create_references_count_sframe():
    """Creating SFrame with the number of references in each paper"""
    logger.info("Creating References Count SFrame")
    if os.path.isdir(PAPER_REFERENCES_COUNT_SFRAME):
        return
    r_sf = tc.load_sframe(PAPER_REFERENCES_SFRAME)
    sf = r_sf.groupby("Paper ID", {"Ref Number": agg.COUNT()})
    sf.save(PAPER_REFERENCES_COUNT_SFRAME)
def get_valid_venues_papers_ids_sframe(min_ref_number, min_journal_papers_num):
    # Criterion I: we use only journals that have papers with a valid DOI that appears
    # in both the AMiner and MAG datasets
    sf = tc.load_sframe(str(AMINER_MAG_JOIN_SFRAME))
    sf['Original venue name'] = sf['Original venue name'].apply(lambda n: n.lower())
    g = sf.groupby('Journal ID mapped to venue name',
                   {'venue name': agg.CONCAT('Original venue name'),
                    'issn': agg.CONCAT('issn')})
    g['issn'] = g['issn'].apply(lambda l: list(set(l)))
    g['venue name'] = g['venue name'].apply(lambda l: list(set(l)))

    # Criterion II: the journal has only a single name
    g = g[g['venue name'].apply(lambda l: len(l) == 1)]
    g.materialize()
    g['venue name'] = g['venue name'].apply(lambda l: l[0].strip())

    # Criterion III: the journal's name appears in SJR
    sjr_dict = VenueFetcher.get_sjr_journals_dict()
    g = g[g['venue name'].apply(lambda v: v in sjr_dict)]
    venues_ids = set(g['Journal ID mapped to venue name'])

    # Criterion IV: each venue needs to have at least min_journal_papers_num papers with at
    # least min_ref_number refs in each paper
    dataset_dir = pathlib.Path(STORAGE_PATH)
    mag_path = dataset_dir / "MAG"
    mag = MicrosoftAcademicGraph(mag_path)
    sf = mag.extended_papers['Journal ID mapped to venue name', 'Original venue name',
                             'Paper ID', 'Ref Number']
    sf = sf[sf['Ref Number'] >= min_ref_number]
    sf.materialize()
    sf = sf[sf['Journal ID mapped to venue name'].apply(lambda i: i in venues_ids)]
    sf['Journal name'] = sf['Original venue name'].apply(lambda n: n.lower().strip())
    sf.materialize()

    # Notice that with the full Papers SFrame a journal can have several names
    g = sf.groupby(['Journal ID mapped to venue name'],
                   {'Count': agg.COUNT(),
                    'Paper IDs List': agg.CONCAT("Paper ID"),
                    'Journals names': agg.CONCAT('Journal name')})
    g['Journals names'] = g['Journals names'].apply(lambda l: list(set(l)))
    g = g[g['Count'] >= min_journal_papers_num]
    g = g[g['Journals names'].apply(lambda l: len(l) == 1)]
    g['Journals names'] = g['Journals names'].apply(lambda l: l[0])
    g = g.rename({'Journals names': 'Journal name'})
    g.materialize()
    return g
def predictions_step1_page():
    # try:
    tc.config.set_num_gpus(0)
    model_id = request.args.get('model_id')
    my_model = TrainedModel.query.filter_by(id=model_id).first()
    my_data = UserData.query.filter_by(project_id=my_model.project_id).all()
    # compare values, not object identity
    if my_data[0].user_id != current_user.id:
        flash('Oops! No data found', 'error')
        return redirect(request.referrer)
    form = UserProfileForm(request.form, obj=current_user)
    if request.method == 'POST':
        data_id = request.form['data_set_id']
        my_data = UserData.query.filter_by(id=data_id).first()
        data_frame = tc.load_sframe(my_data.sname)
        if my_model.features['model_type'] == 'deep':
            tfrm = data_frame.to_dataframe()
            tfrm = tfrm.sort_values(by=[my_model.features["session_id"], my_model.features["time_field"]])
            data_frame = tc.SFrame(data=tfrm)
            data_frame[str(my_model.features["session_id"])] = \
                data_frame[str(my_model.features["session_id"])].astype(int)
        model = tc.load_model(my_model.mname)
        predictions = model.predict(data_frame).to_numpy()

        my_dict = Predictions()
        my_dict.model_id = my_model.id
        my_dict.user_id = current_user.id
        my_dict.path = my_model.path
        my_dict.input_file = my_data.name
        my_predictions = []
        for item in predictions:
            my_predictions.append(str(item))
        my_dict.predictions = my_predictions

        origs = []
        for item in data_frame[str(my_model.features['target'])]:
            origs.append(str(item))

        # Make sure the predictions only overwrite blank values
        if request.form['mode'] == "fill":
            size = len(predictions)
            for x in range(0, size):
                if origs[x] is not None:
                    predictions[x] = origs[x]
        my_dict.originals = origs
        data_frame = safely_add_col('Predicted_Value', predictions, data_frame)
        my_dict.oname = os.path.join(my_dict.path, str(uuid.uuid4()) + "_model_predictions.csv")
        data_frame.save(my_dict.oname, format='csv')
        db.session.add(my_dict)
        db.session.commit()
        # Redirect to home page
        return redirect(url_for('model.prediction_page', dict=my_dict.id))
    return render_template('pages/models/predict_step1.html', my_data=my_data, my_model=my_model, form=form)
def create_aminer_mag_sjr_sframe(year):
    """
    Creates a unified SFrame of AMiner, MAG, and the SJR datasets
    :param year: year to use for SJR data
    :return: SFrame with AMiner, MAG, and SJR data
    :rtype: tc.SFrame
    """
    sf = tc.load_sframe(AMINER_MAG_JOIN_SFRAME)
    sf = sf[sf['issn'] != None]
    sf = sf[sf['issn'] != 'null']
    sf.materialize()
    r = re.compile(r"(\d+)-(\d+)")
    sf['issn_str'] = sf['issn'].apply(
        lambda i: "".join(r.findall(i)[0]) if len(r.findall(i)) > 0 else None)
    sf = sf[sf['issn_str'] != None]
    sjr_sf = tc.load_sframe(SJR_SFRAME)
    sjr_sf = sjr_sf[sjr_sf['Year'] == year]
    return sf.join(sjr_sf, on={'issn_str': "ISSN"})
def recode_step2_page():
    try:
        data_id = request.args.get('data_id')
        target = request.args.get('target')
        name = request.args.get('name')
        my_data = UserData.query.filter_by(id=data_id).first()
        my_model = TrainedModel()
        form = TrainModelForm(request.form, obj=my_model)
        data_frame = tc.load_sframe(my_data.sname)
        names = data_frame.column_names()
        types = data_frame.column_types()
        orig_data = data_frame[str(target)]
        norig_data = orig_data.to_numpy()
        target_data = data_frame[str(target)].unique()
        ntarget_data = target_data.to_numpy()
        if request.method == 'POST':
            mapped_values = []
            data_frame = safely_add_col(str(target) + '_uncoded', data_frame[str(target)], data_frame)
            for x in range(len(ntarget_data)):
                mapped_values.append(str(request.form['new_value' + str(x)]))
            cross_ref = []
            for x in range(len(names)):
                if str(types[x].__name__) == "str":
                    cross_ref.append(str(names[x]))
            new_data = []
            for field in norig_data:
                for y in range(len(ntarget_data)):
                    if str(ntarget_data[y]) == str(field):
                        new_data.append(int(mapped_values[y]))
            sa = SArray(new_data)
            data_frame[str(target)] = sa
            fwd_id = save_data(my_data, name, data_frame)
            flash('Successfully re-coded ' + target + '!', 'success')
            return redirect(url_for('data.data_details_page', data_id=fwd_id))
        return render_template('pages/data/transforms/code_field_step2.html', my_data=my_data, form=form,
                               data_frame=data_frame, names=names, name=name, types=types,
                               ntarget_data=ntarget_data, target=target)
    except Exception as e:
        flash('Oops! Something unexpected happened. On the bright side, we logged the error and will '
              'absolutely look at it and work to correct it, ASAP.', 'error')
        error = ErrorLog()
        error.user_id = current_user.id
        error.error = str(e.__class__)
        error.parameters = request.args
        db.session.add(error)
        db.session.commit()
        return redirect(request.referrer)
def create_paper_keywords_list_sframe():
    """
    Creating Paper Keywords List SFrame
    """
    logger.info("Creating Papers' Keywords List SFrame")
    if os.path.isdir(PAPER_KEYWORDS_LIST_SFRAME):
        return
    sf = tc.load_sframe(PAPER_KEYWORDS_SFRAME)
    g = sf.groupby("Paper ID", {"Keywords List": agg.CONCAT("Keyword name")})
    g.save(PAPER_KEYWORDS_LIST_SFRAME)
def fill_na_page():
    try:
        data_id = request.args.get('data_id')
        my_data = UserData.query.filter_by(id=data_id).first()
        my_model = TrainedModel()
        form = TrainModelForm(request.form, obj=my_model)
        data_frame = tc.load_sframe(my_data.sname)
        names = data_frame.column_names()
        types = data_frame.column_types()
        if request.method == 'POST':
            value = str(request.form['value'])
            name = str(request.form['name'])
            for feature in request.form.getlist('features'):
                orig_data = data_frame[str(feature)]
                print(orig_data.dtype.__name__)
                if orig_data.dtype.__name__ == "int":
                    try:
                        data_frame[str(feature)] = orig_data.fillna(int(value))
                    except Exception as e:
                        flash('Oops! Looks like you passed something I could not parse as an integer.', 'error')
                        return redirect(request.referrer)
                if orig_data.dtype.__name__ == "float":
                    try:
                        data_frame[str(feature)] = orig_data.fillna(float(value))
                    except Exception as e:
                        flash('Oops! Looks like you passed something I could not parse as a float.', 'error')
                        return redirect(request.referrer)
                if orig_data.dtype.__name__ == "str":
                    try:
                        data_frame[str(feature)] = orig_data.fillna(str(value))
                    except Exception as e:
                        flash('Oops! Looks like you passed something I could not parse as a string.', 'error')
                        return redirect(request.referrer)
            fwd_id = save_data(my_data, name, data_frame)
            flash('Successfully replaced N/A values!', 'success')
            return redirect(url_for('data.data_details_page', data_id=fwd_id))
        return render_template('pages/data/transforms/fill_na.html', my_data=my_data, data_frame=data_frame,
                               names=names, types=types, form=form)
    except Exception as e:
        flash('Oops! Something unexpected happened. On the bright side, we logged the error and will '
              'absolutely look at it and work to correct it, ASAP.', 'error')
        error = ErrorLog()
        error.user_id = current_user.id
        error.error = str(e.__class__)
        error.parameters = request.args
        db.session.add(error)
        db.session.commit()
        return redirect(request.referrer)
def _create_field_of_study_paper_ids_sframe(level):
    """
    Create SFrame in which each row contains a field of study and its matching list of paper ids
    :param level: field of study level
    :return: SFrame with the fields of study at the input level and their paper ids
    :rtype: tc.SFrame
    """
    logger.info("Creating fields of study paper ids SFrame level - %s " % level)
    col = 'Fields of study parent list (L%s)' % level
    sf = tc.load_sframe(EXTENDED_PAPERS_SFRAME)
    new_col_name = "Field ID"
    sf = sf.stack(col, new_column_name=new_col_name)
    sf = sf[sf[col] != None]
    g = sf.groupby(new_col_name, {'Paper IDs': agg.CONCAT("Paper ID")})
    f_sf = tc.load_sframe(FIELDS_OF_STUDY_SFRAME)
    g = g.join(f_sf, on={new_col_name: "Field of study ID"})
    g['Number of Paper'] = g['Paper IDs'].apply(lambda l: len(l))
    g['Level'] = level
    g = g.rename({new_col_name: "Field of study ID"})
    return g
def create_extended_papers_sframe():
    """
    Create extended papers SFrame which contains various paper features, such as paper citation
    numbers, authors list, urls, etc.
    :return:
    """
    logger.info("Creating Extended Papers SFrame")
    if os.path.isdir(EXTENDED_PAPERS_SFRAME):
        return
    sf = tc.load_sframe(PAPERS_SFRAME)
    sframes_list = [PAPER_REFERENCES_COUNT_SFRAME, PAPERS_CITATIONS_BYYEAR_SFRAME,
                    PAPERS_ORDERED_AUTHORS_LIST_SFRAME, PAPER_KEYWORDS_LIST_SFRAME,
                    PAPERS_FIELDS_OF_STUDY_SFRAME, PAPER_URLS_SFRAME]
    for s in sframes_list:
        t = tc.load_sframe(s)
        sf = sf.join(t, how="left", on="Paper ID")
    # fill missing reference counts before the single save
    sf = sf.fillna("Ref Number", 0)
    sf.save(EXTENDED_PAPERS_SFRAME)
def get_papers_sframe(min_ref_num=None, start_year=None, end_year=None):
    """
    Return SFrame with Papers data according to the input filter variables
    :param min_ref_num: paper's minimal references number
    :param start_year: start year (only include papers that were published after start year)
    :param end_year: end year (only include papers that were published before end year)
    :return: SFrame with paper data
    :rtype: tc.SFrame
    :note: after the SFrame is created it is saved to the TMP_DIR for future use
    """
    # return the cached SFrame if it was already created
    tmp_papers_sf_path = _get_tmp_papers_sframe_path(min_ref_num, start_year, end_year)
    if os.path.isdir(tmp_papers_sf_path):
        return tc.load_sframe(tmp_papers_sf_path)

    sf = tc.load_sframe(PAPER_REFERENCES_SFRAME)
    if min_ref_num is not None:
        logger.info(f"Getting papers ids with at least {min_ref_num} references")
        sf = sf.groupby('Paper ID', {'Ref Count': agg.COUNT()})  # There are 30058322 in the list
        sf = sf[sf['Ref Count'] >= min_ref_num]  # left with 22,083,058
        sf.__materialize__()
    p_sf = tc.load_sframe(PAPERS_SFRAME)
    sf = p_sf.join(sf)
    if start_year is not None:
        logger.info("Getting papers published from %s" % start_year)
        sf = sf[sf['Paper publish year'] >= start_year]
    if end_year is not None:
        logger.info("Getting papers published until %s" % end_year)
        sf = sf[sf['Paper publish year'] <= end_year]
    sf.__materialize__()

    if not os.path.isdir(tmp_papers_sf_path):
        sf.save(tmp_papers_sf_path)
    return sf
def create_extended_references_sframe():
    """
    Create SFrame with references data, with an additional column that states whether the
    reference is a self-citation
    """
    logger.info("Creating Extended References SFrame")
    if os.path.isdir(EXTENDED_PAPER_REFERENCES_SFRAME):
        return
    ref_sf = tc.load_sframe(PAPER_REFERENCES_SFRAME)
    p_sf = tc.load_sframe(PAPERS_ORDERED_AUTHORS_LIST_SFRAME)
    ref_sf = ref_sf.join(p_sf, on='Paper ID', how="left")
    ref_sf = ref_sf.join(p_sf, on={'Paper reference ID': 'Paper ID'}, how="left")
    ref_sf = ref_sf.fillna('Authors List Sorted.1', [])
    ref_sf = ref_sf.fillna('Authors List Sorted', [])
    ref_sf.__materialize__()
    ref_sf['self citation'] = ref_sf.apply(
        lambda r: len(set(r['Authors List Sorted.1']) & set(r['Authors List Sorted'])))
    ref_sf.__materialize__()
    ref_sf = ref_sf.remove_columns(['Authors List Sorted.1', 'Authors List Sorted'])
    ref_sf.save(EXTENDED_PAPER_REFERENCES_SFRAME)
def wrapper_repeat(self, *args, **kwargs):
    sframe_path = pathlib.Path(self._sframe_dir).joinpath(sframe)
    if not sframe_path.exists():
        table_name = sframe.split(".")[0]
        if table_name in MAG_URL_DICT:
            url = MAG_URL_DICT[table_name]
            mag_file = self._dataset_dir / re.search(r".*files/(.*?)\?", url).group(1)
            if not pathlib.Path(mag_file).exists():
                download_file(url, mag_file)
        value = func(self, *args, **kwargs)
        value.save(str(sframe_path))
    else:
        value = load_sframe(str(sframe_path))
    return value
def load_functions_partition(directory, name):
    if name is None:
        name = ''
    logging.info(f"Loading functions from {directory}{name}")
    mw = tc.load_sframe(f"{directory}{name}")
    if 'fcount' in mw.column_names():
        mw.remove_column('fcount', inplace=True)
    if 'hapk' in mw.column_names():
        mw.rename(names={'hapk': 'apk'}, inplace=True)
    if 'hfunc' in mw.column_names():
        mw.rename(names={'hfunc': 'function'}, inplace=True)
    return mw
def create_mod(path):
    if os.path.isdir(path + '/data.sframe'):
        print('reference_data already exists')
        reference_data = tc.load_sframe(path + '/data.sframe')
    else:
        # Load images from the downloaded data and save the SFrame for future use
        reference_data = tc.image_analysis.load_images(path)
        reference_data = reference_data.add_row_number()
        reference_data.save(path + '/data.sframe')
    if os.path.isdir(path + '/savedmodel.model'):
        print('model already exists')
        model = tc.load_model(path + '/savedmodel.model')
    else:
        # Create the image similarity model and save it for future use
        model = tc.image_similarity.create(reference_data)
        model.save(path + '/savedmodel.model')
    return reference_data, model
def outlliers_page():
    try:
        data_id = request.args.get('data_id')
        my_data = UserData.query.filter_by(id=data_id).first()
        my_model = TrainedModel()
        form = TrainModelForm(request.form, obj=my_model)
        data_frame = tc.load_sframe(my_data.sname)
        names = data_frame.column_names()
        types = data_frame.column_types()
        if request.method == 'POST':
            cent = float(request.form['cent'])
            name = str(request.form['name'])
            target = str(request.form['target'])
            mean = data_frame[target].mean()
            rows = []
            for row in data_frame:
                if row[target] is not None:
                    # keep rows whose relative deviation from the mean is below the threshold
                    diff = abs(float(row[target]) - mean)
                    pdiff = diff / mean
                    if pdiff < cent:
                        rows.append(row)
                else:
                    rows.append(row)
            sf = tc.SFrame(rows)
            sf = sf.unpack('X1', column_name_prefix='')
            print(sf)
            fwd_id = save_data(my_data, name, sf)
            flash('Successfully removed outliers!', 'success')
            return redirect(url_for('data.data_details_page', data_id=fwd_id))
        return render_template('pages/data/transforms/outlier.html', my_data=my_data, data_frame=data_frame,
                               names=names, types=types, form=form)
    except Exception as e:
        flash('Oops! Something unexpected happened. On the bright side, we logged the error and will '
              'absolutely look at it and work to correct it, ASAP.', 'error')
        error = ErrorLog()
        error.user_id = current_user.id
        error.error = str(e.__class__)
        error.parameters = request.args
        db.session.add(error)
        db.session.commit()
        return redirect(request.referrer)
def extract_imgs_from_sframe(sframe, target_label='mainPlate', buffer=64, draw_center=False,
                             draw_center_line=False, draw_boundings=False, draw_masks=False,
                             draw_frame_num=True, annotations_col='annotations', image_col='image',
                             masks_col='stateMasks'):
    sf = list(tc.load_sframe(sframe))
    frames = []
    frame_num = 0
    centers = {}
    for x in tqdm(sf, desc='Parsing'):
        img = x[image_col].pixel_data
        append_centers(x[annotations_col], centers, buffer=buffer, target_label=target_label)
        if draw_boundings:
            img = tc.object_detector.util.draw_bounding_boxes(
                get_tc_img(img), x[annotations_col]).pixel_data
        if draw_masks:
            img = draw_mask_data(img, x[masks_col])
        if draw_center:
            img = draw_nearest_centers(img, centers)
        if draw_center_line:
            img = draw_center_lines(img, centers, buffer=buffer)
        if draw_frame_num:
            img = draw_text(img, str(frame_num))
        frames.append(img)
        frame_num += 1
    return frames
def get_accuracy(path='.'):
    reference_data = tc.load_sframe(path + '/data.sframe')
    if os.path.isdir(path + '/savedmodel.model'):
        model = tc.load_model(path + '/savedmodel.model')
    else:
        model = tc.image_similarity.create(reference_data)
        model.save(path + '/savedmodel.model')
    correct = 0
    mistake = 0
    index = 0
    distance = 0
    while index < len(reference_data):
        # query the model in batches of step_length rows
        if index + step_length < len(reference_data):
            query_results = model.query(reference_data[index:index + step_length], k=k, verbose=False)
            index += step_length
        else:
            query_results = model.query(reference_data[index:], k=k, verbose=False)
            index = len(reference_data)
        assert len(query_results) % k == 0, 'length error!'
        for i in range(int(len(query_results) / k)):
            category = [
                reference_data[query_results[i * k + j]['reference_label']]['path'].split('/')[3]
                for j in range(k)
            ]
            if category[0] == category[1] or (('Faces' in category[0]) and ('Faces' in category[1])):
                correct += 1
            else:
                mistake += 1
            for j in range(k):
                distance += query_results[i * k + j]['distance']
        if (index + 1) % 1000 == 0:
            print(str(index + 1) + ' completed!')
    print('Number correct: ' + str(correct))
    print('Number of mistakes: ' + str(mistake))
    print('Accuracy: ' + str(correct / (correct + mistake)))
    print('Average distance: ' + str(distance / len(reference_data)))
def convert_magic_page():
    try:
        data_id = request.args.get('data_id')
        my_data = UserData.query.filter_by(id=data_id).first()
        my_model = TrainedModel()
        form = TrainModelForm(request.form, obj=my_model)
        data_frame = tc.load_sframe(my_data.sname)
        names = data_frame.column_names()
        types = data_frame.column_types()
        if request.method == 'POST':
            magic = str(request.form['magic'])
            name = str(request.form['name'])
            for feature in request.form.getlist('features'):
                orig_data = data_frame[str(feature)]
                norig_data = orig_data.to_numpy()
                new_data = []
                for item in norig_data:
                    if str(item) == magic:
                        new_data.append(None)
                    else:
                        new_data.append(item)
                sa = SArray(new_data)
                data_frame[str(feature)] = sa
            fwd_id = save_data(my_data, name, data_frame)
            flash('Successfully cleared magic values!', 'success')
            return redirect(url_for('data.data_details_page', data_id=fwd_id))
        return render_template('pages/data/transforms/convert_magic.html', my_data=my_data,
                               data_frame=data_frame, names=names, types=types, form=form)
    except Exception as e:
        flash('Oops! Something unexpected happened. On the bright side, we logged the error and will '
              'absolutely look at it and work to correct it, ASAP.', 'error')
        error = ErrorLog()
        error.user_id = current_user.id
        error.error = str(e.__class__)
        error.parameters = request.args
        db.session.add(error)
        db.session.commit()
        return redirect(request.referrer)