示例#1
0
def discover_network(network, api_list):
    """Expand the influencer graph for *network* one profile at a time.

    Repeatedly picks the highest-norm influencer not yet searched, pulls
    the accounts they follow via a randomly chosen API client, records the
    relationships, and adds newly discovered users to the search frontier.
    Stops when the frontier is empty or every remaining candidate would
    exceed ``configs.MAX_EXPANSION_DEGREE``.

    Args:
        network: key into ``dirs.dirs_dict`` selecting which network's CSVs
            to read and extend.
        api_list: list of API client objects; one is chosen at random per
            request to spread load across clients.

    Returns:
        None. Results are persisted by the Profile/flush helpers.
    """
    # Load all previously discovered influencers and relationships.
    influencers, infl_header = helpers.load_csv(
        dirs.dirs_dict["discoveries"][network])
    influencer_user_ids = set(helpers.list_of_keys(influencers, 'user_id'))
    relationships_loaded, rel_header = helpers.load_csv(
        dirs.dirs_dict["relationships"][network])

    # Anyone already recorded as a follower has been searched before.
    user_ids_searched = set(
        int(rel["follower_id"]) for rel in relationships_loaded)

    # Only search influencers we haven't searched yet. Materialize as a
    # list: filter() returns a one-shot lazy iterator in Python 3, and the
    # loop below needs repeated max(), append() and remove() on it.
    influencers_to_search = [
        i for i in influencers
        if i["user_id"] and int(i["user_id"]) not in user_ids_searched
    ]

    # While there are influencers to search, search the best one. Only
    # expand if the candidate's degree is within the maximum expansion
    # degree, so that we don't run into low quality people.
    while influencers_to_search:
        # Compute the best candidate once (the original called max() twice).
        infl = max(influencers_to_search, key=helpers.influencer_norm)
        if infl['degree'] > configs.MAX_EXPANSION_DEGREE:
            break
        print("chose: " + str(infl))

        # Holds as long as we exhaust all degree 0's before degree 1s, etc.
        new_degree = infl['degree'] + 1

        try:
            follows = random.choice(api_list).get_follows(infl["user_id"])
            print("this person follows: " + str(follows))

            profile = social_apis.Profile(network,
                                          infl,
                                          infl['degree'],
                                          follows_list=follows)
            relationships_loaded.extend(profile.get_follows())
            profile.flush_follows()

            for fid in follows:
                flushed_dict = flush_followed_user(random.choice(api_list),
                                                   network, fid,
                                                   influencer_user_ids,
                                                   new_degree)
                if not flushed_dict:
                    continue
                influencers.append(flushed_dict)
                influencer_user_ids.add(flushed_dict['user_id'])

                influencers_to_search.append(flushed_dict)
        except Exception as e:
            # Keep crawling past per-user API failures, but say what broke
            # instead of silently swallowing everything (was a bare except).
            print("An error occurred - onto the next one: " + str(e))

        influencers_to_search.remove(infl)

    return None
示例#2
0
def discover_network(network, api_list):
    """Expand the influencer graph for *network*, one profile at a time.

    Picks the highest-norm unsearched influencer, fetches who they follow
    through a randomly chosen API client, records the relationships, and
    pushes newly found users onto the search frontier. Stops when the
    frontier is empty or only candidates beyond configs.MAX_EXPANSION_DEGREE
    remain.

    Args:
        network: key into dirs.dirs_dict selecting the network's CSV files.
        api_list: list of API clients; one is picked at random per request.

    Returns:
        None. Output is persisted by the Profile/flush helpers.
    """
    # Load all of the previous influencers and relationships.
    influencers, infl_header = helpers.load_csv(dirs.dirs_dict["discoveries"][network])
    influencer_user_ids = set(helpers.list_of_keys(influencers, "user_id"))
    relationships_loaded, rel_header = helpers.load_csv(dirs.dirs_dict["relationships"][network])
    # Users already recorded as a follower have been searched before.
    user_ids_searched = set(int(rel["follower_id"]) for rel in relationships_loaded)
    # We'll only search influencers who we haven't searched. Materialize a
    # list: in Python 3, filter() yields a one-shot iterator that does not
    # support the repeated max()/append()/remove() used below.
    influencers_to_search = [
        i for i in influencers if i["user_id"] and int(i["user_id"]) not in user_ids_searched
    ]

    # While there are influencers to search, search the best one — but only
    # expand while the new degree stays within MAX_EXPANSION_DEGREE, so that
    # we don't run into low quality people.
    while influencers_to_search:
        # One max() call instead of two (the loop condition recomputed it).
        infl = max(influencers_to_search, key=helpers.influencer_norm)
        if infl["degree"] > configs.MAX_EXPANSION_DEGREE:
            break
        print("chose: " + str(infl))

        # Holds as long as we exhaust all degree 0's before degree 1s, etc.
        new_degree = infl["degree"] + 1

        try:
            follows = random.choice(api_list).get_follows(infl["user_id"])
            print("this person follows: " + str(follows))

            profile = social_apis.Profile(network, infl, infl["degree"], follows_list=follows)
            relationships_loaded.extend(profile.get_follows())
            profile.flush_follows()

            for fid in follows:
                flushed_dict = flush_followed_user(
                    random.choice(api_list), network, fid, influencer_user_ids, new_degree
                )
                if not flushed_dict:
                    continue
                influencers.append(flushed_dict)
                influencer_user_ids.add(flushed_dict["user_id"])

                influencers_to_search.append(flushed_dict)
        except Exception as e:
            # Report and continue; the previous bare except hid the cause.
            print("An error occurred - onto the next one: " + str(e))

        influencers_to_search.remove(infl)

    return None
示例#3
0
    def test_load_data(self):
        """load_csv should apply column converters and name dtype fields."""
        def to_label(raw):
            # Converter for the Prediction column: background ('b') -> 0,
            # signal -> 1.
            return 0 if b'b' in raw else 1

        data = load_csv(self.path, converters={"Prediction": to_label})

        # Named dtype access and positional access must agree.
        self.assertEqual(data[0]['Id'], data[0][0])
        # The converter should have mapped the Prediction string to 1.
        self.assertEqual(data[0]["Prediction"], 1)
        self.assertEqual(data.shape, (10, ))
示例#4
0
def drop_nones():
    """Rewrite the instagram discoveries CSV in place, dropping any row
    that is missing a username or a user_id (reported as falsy/None)."""
    path = dirs.dirs_dict["discoveries"]["instagram"]
    rows, header = helpers.load_csv(path)

    # Keep only rows where both identifying fields are present.
    new_rows = [row for row in rows if row["username"] and row["user_id"]]
    helpers.write_csv(path, new_rows, header)
示例#5
0
def write_ranks(rks_dict):
    """Attach pageranks to every discovered instagram influencer and write
    them, sorted by rank descending, to a "-pageranked" CSV.

    Args:
        rks_dict: mapping of user_id -> pagerank value.

    Returns:
        None.
    """
    discovery_list, discovery_header = helpers.load_csv(dirs.dirs_dict["discoveries"]["instagram"])
    # Fixes bug where some influencers are reported as None. Use a list
    # comprehension instead of filter(): in Python 3, filter() returns a
    # lazy iterator, and the indexing and .sort() below would raise.
    discovery_list = [infl for infl in discovery_list if infl['username']]

    for i, el in enumerate(discovery_list):
        discovery_list[i]["pagerank"] = rks_dict[el["user_id"]]
    discovery_header.append("pagerank")
    discovery_list.sort(key=lambda k: k["pagerank"], reverse=True)
    helpers.write_csv(dirs.dirs_dict["discoveries"]["instagram"]+"-pageranked", discovery_list, discovery_header)
    return None
示例#6
0
def drop_nones():
    """Filter the instagram discoveries CSV down to rows that have both a
    username and a user_id, writing the result back to the same file."""
    rows, header = helpers.load_csv(dirs.dirs_dict["discoveries"]["instagram"])

    # A row survives only when both identifying fields are truthy.
    new_rows = [row for row in rows if row['username'] and row['user_id']]
    helpers.write_csv(dirs.dirs_dict["discoveries"]["instagram"], new_rows,
                      header)
示例#7
0
def dedup(folder, network, on_keys):
    """Remove duplicate rows from the CSV at dirs_dict[folder][network].

    Two rows are duplicates when they agree on every column named in
    on_keys. The first occurrence of each key wins; row order is
    otherwise preserved. No-op when the file has no rows.
    """
    path = dirs.dirs_dict[folder][network]
    rows, header = helpers.load_csv(path)
    if not rows:
        return

    seen = set()
    deduped = []
    for row in rows:
        key = tuple(row[k] for k in on_keys)
        if key in seen:
            continue
        seen.add(key)
        deduped.append(row)
    helpers.write_csv(path, deduped, header)
示例#8
0
def dedup(folder, network, on_keys):
    """Drop duplicate rows (keyed by the on_keys columns) from the CSV for
    this folder/network, keeping the first occurrence of each key and the
    original ordering. Does nothing when the file is empty.
    """
    rows, header = helpers.load_csv(dirs.dirs_dict[folder][network])
    if not rows:
        return

    kept, observed = [], set()
    for current in rows:
        fingerprint = tuple(current[column] for column in on_keys)
        if fingerprint not in observed:
            observed.add(fingerprint)
            kept.append(current)
    helpers.write_csv(dirs.dirs_dict[folder][network], kept, header)
示例#9
0
def main():
    """Train a MultinomialNB classifier on the training set, predict the
    test set, and print accuracy plus a classification report."""
    t0 = time.time()
    print("Fetching training and testing datasets..")
    tr = load_txt("data/train_set.txt")
    tr_x = tr[0]
    tr_y = tr[1]

    ts_x_raw = load_xlsx("data/test_set.xlsx")
    ts_x = [row["A"] for row in ts_x_raw]

    ts_y_raw = load_txt("data/test_set_y.txt")
    ts_y = ts_y_raw[1]
    ts_y = ts_y[0:len(ts_y) - 1]  # because there's a new line at the end

    if compare_datasets:
        # Check our test labels against Eysteinn's.
        ts_y_alternate = load_csv("data/test_dataset.csv")
        # Use != (value inequality) rather than `is not` (object identity):
        # two equal label strings are not guaranteed to be the same object,
        # so the old identity check reported spurious differences.
        different = [
            i for i in range(len(ts_y_alternate)) if ts_y[i] != ts_y_alternate[i]
        ]
        print("Number of different entries:")
        print(len(different))
        print(different)

    print("Creating features from training set..")
    vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), lowercase=True)
    tr_vectors = vectorizer.fit_transform(tr_x)

    print("Creating MultinomialNB classifier..")
    clf = MultinomialNB()
    clf.fit(tr_vectors, tr_y)
    ts_x_featurized = vectorizer.transform(ts_x)

    print("Making predictions..")
    predictions = clf.predict(ts_x_featurized)
    t1 = time.time()
    dt = t1 - t0
    # Count predictions that match the reference labels.
    correct_predictions = sum(
        1 for pred, truth in zip(predictions, ts_y) if pred == truth)

    print("Result: %d/%d correct predictions (%.2f%%), in %.2fs.\n" %
          (correct_predictions, len(predictions),
           100. * correct_predictions / len(predictions), dt))
    print(classification_report(ts_y, predictions))
示例#10
0
def write_ranks(rks_dict):
    """Write pagerank-annotated influencers to a "-pageranked" CSV.

    Args:
        rks_dict: mapping of user_id -> pagerank value.

    Returns:
        None.
    """
    discovery_list, discovery_header = helpers.load_csv(
        dirs.dirs_dict["discoveries"]["instagram"])
    # Fixes bug where some influencers are reported as None. A list
    # comprehension replaces filter(): Python 3's filter() returns a lazy
    # iterator, which breaks the index assignment and .sort() below.
    discovery_list = [
        infl for infl in discovery_list if infl['username']
    ]

    for i, el in enumerate(discovery_list):
        discovery_list[i]["pagerank"] = rks_dict[el["user_id"]]
    discovery_header.append("pagerank")
    discovery_list.sort(key=lambda k: k["pagerank"], reverse=True)
    helpers.write_csv(
        dirs.dirs_dict["discoveries"]["instagram"] + "-pageranked",
        discovery_list, discovery_header)
    return None
示例#11
0
def load_data(file_name):
    """Load a CSV from csv_data_dir into the module-level dataframe and
    refresh every dependent widget (selects, glyphs, tooltips).

    Args:
        file_name: CSV file name relative to csv_data_dir.
    """
    global data_df
    global features_list
    global source
    try:
        read_data_df = load_csv(os.path.join(csv_data_dir, file_name))
    except Exception as e:
        # Bail out early: without this return the code below raised a
        # NameError on the never-assigned read_data_df and left the
        # widgets half-updated.
        print(e)
        return
    data_df = read_data_df
    # Feature columns are everything except the embedding coordinates
    # and the image path.
    features_list = [
        i for i in list(data_df.columns)
        if i not in ['tsne_x', 'tsne_y', 'image_path']
    ]
    # source = ColumnDataSource(data_df)
    cr.data_source.data = data_df
    toggle_class_select.options = features_list
    toggle_class_select.value = features_list[0]
    update_toggle_class('value', None, features_list[0])
    color_class_select.options = features_list
    color_class_select.value = features_list[0]
    update_color_class('value', None, features_list[0])
    update_class_selection('value', None, [])
    hover_tip_tool.tooltips = generate_tooltip_html()
示例#12
0
        i for i in list(data_df.columns)
        if i not in ['tsne_x', 'tsne_y', 'image_path']
    ]
    # source = ColumnDataSource(data_df)
    cr.data_source.data = data_df
    toggle_class_select.options = features_list
    toggle_class_select.value = features_list[0]
    update_toggle_class('value', None, features_list[0])
    color_class_select.options = features_list
    color_class_select.value = features_list[0]
    update_color_class('value', None, features_list[0])
    update_class_selection('value', None, [])
    hover_tip_tool.tooltips = generate_tooltip_html()


# Load the embedding data and derive the feature columns (everything
# except the t-SNE coordinates themselves).
data_df = load_csv('./embedding_visualization/data/show.csv')
features_list = [
    i for i in list(data_df.columns) if i not in ['tsne_x', 'tsne_y']
]
source = ColumnDataSource(data_df)

# Color the scatter by the first feature column.
# NOTE(review): get_color_mapper's third return value is discarded here —
# confirm it is safe to ignore.
cls_color_mapper, color_cls_list, _ = get_color_mapper(features_list[0])
p = figure(plot_width=800,
           plot_height=800,
           match_aspect=True,
           tools=['pan', 'box_zoom', 'reset'],
           title='',
           sizing_mode='scale_height',
           output_backend="webgl")
# Scatter glyph: one circle per row, positioned by the t-SNE embedding.
cr = p.circle(x='tsne_x', y='tsne_y', color=cls_color_mapper, source=source)
cr.selection_glyph = Circle(fill_color=cls_color_mapper,
示例#13
0
 def test_load_predict_data(self):
     """Smoke test: loading the unlabeled dummy CSV should not raise."""
     data = load_csv('test/unlabeled_dummy.csv')
示例#14
0
    def test_split_data(self):
        """split_dataset with test_ratio=0.4 should split 10 rows 6/4."""
        dataset = load_csv(self.path)

        train_part, test_part = split_dataset(dataset, test_ratio=0.4)
        # 60% of the 10 fixture rows go to training, 40% to testing.
        self.assertEqual(train_part.shape, (6, ))
        self.assertEqual(test_part.shape, (4, ))
示例#15
0
 def test_write_csv(self):
     """Round-trip check: converted data can be written back out as CSV."""
     def prediction_to_int(raw):
         # Converter for the Prediction column: background ('b') -> 0,
         # signal -> 1.
         return 0 if b'b' in raw else 1

     data = load_csv(self.path, converters={"Prediction": prediction_to_int})
     write_csv(data, "test/test_write.csv")
示例#16
0
# Bind the engine to the metadata of the Base class so that the
# declaratives can be accessed through a DBSession instance
Base.metadata.bind = engine
 
DBSession = sessionmaker(bind=engine)
# A DBSession() instance establishes all conversations with the database
# and represents a "staging zone" for all the objects loaded into the
# database session object. Any change made against the objects in the
# session won't be persisted into the database until you call
# session.commit(). If you're not happy about the changes, you can
# revert all of them back to the last commit by calling
# session.rollback()
session = DBSession()
 

# Load every previously discovered instagram influencer row from CSV.
influencers, infl_header = helpers.load_csv(dirs.dirs_dict["discoveries"]["instagram"])

# Insert a Profile in the profile table
for influencer in influencers:
    # Drop columns not wanted in the database record.
    for key in ('time_pulled', ):
        influencer.pop(key)
    # NOTE(review): this Profile instance is never used afterwards —
    # get_or_create below builds its own record from `defaults`; confirm
    # whether this line can be removed.
    profile = Profile(**influencer)
    profile_record, created = get_or_create(session, Profile, defaults=influencer, user_id=influencer['user_id'])
    print('created' if created else '--existed')






示例#17
0
def main():
    """Grid-search SVM hyperparameters on the training set, evaluate each
    combination on the test set, and report the best configuration."""
    t0 = time.time()
    print("Fetching training and testing datasets..")
    tr = load_txt("data/train_set.txt")
    tr_x = tr[0]
    tr_y = tr[1]

    ts_x_raw = load_xlsx("data/test_set.xlsx")
    ts_x = [row["A"] for row in ts_x_raw]

    ts_y_raw = load_txt("data/test_set_y.txt")
    ts_y = ts_y_raw[1]
    ts_y = ts_y[0:len(ts_y) - 1]  # because there's a new line at the end

    if compare_datasets:
        # Check our test labels against Eysteinn's.
        ts_y_alternate = load_csv("data/test_dataset.csv")
        # Use != (value inequality) rather than `is not` (object identity):
        # equal label strings need not be the same object, so the identity
        # check reported spurious differences.
        different = [
            i for i in range(len(ts_y_alternate)) if ts_y[i] != ts_y_alternate[i]
        ]
        print("Number of different entries:")
        print(len(different))
        print(different)

    print("Creating features from training set..")
    vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), lowercase=True)
    tr_vectors = vectorizer.fit_transform(tr_x)
    # Featurize the test set once: it does not depend on the SVM params,
    # so re-transforming inside the innermost loop was pure waste.
    ts_x_featurized = vectorizer.transform(ts_x)

    print("Grid searching params for SVM classifier..")
    params = {
        'kernel': ('linear', 'poly', 'rbf'),
        'C': [1, 10],
        'degree': [2, 3, 4, 5],
        'coef0': [5, 7, 10, 15, 17, 20]
    }
    bestclf = None
    bestRes = 0
    bestPredictions = None
    for kernel in params['kernel']:
        for c in params['C']:
            for d in params['degree']:
                for coef in params['coef0']:
                    clf = svm.SVC(kernel=kernel, C=c, degree=d, coef0=coef)
                    clf.fit(tr_vectors, tr_y)
                    predictions = clf.predict(ts_x_featurized)
                    correct_predictions = sum(
                        1 for pred, truth in zip(predictions, ts_y)
                        if pred == truth)
                    if correct_predictions > bestRes:
                        bestRes = correct_predictions
                        bestclf = clf
                        bestPredictions = predictions
                        print('kernel:', kernel, 'C:', c, 'degree:', d,
                              'coef0:', coef)
                        print('Numcorrect', bestRes)

    # Time the whole search, not just up to the last-evaluated model.
    dt = time.time() - t0
    if bestPredictions is None:
        # Guard: without it, len(bestPredictions) crashed when no
        # configuration ever produced a correct prediction.
        print("No configuration produced any correct predictions.")
        return
    print("Result: %d/%d correct predictions (%.2f%%), in %.2fs.\n" %
          (bestRes, len(bestPredictions),
           100. * bestRes / len(bestPredictions), dt))
    print(classification_report(ts_y, bestPredictions))
示例#18
0
        session.add(instance)
        session.commit()
        return instance, True


# Create the database engine from the configured connection string.
engine = create_engine(configs.DB_NAME)
# Bind the engine to the metadata of the Base class so that the
# declaratives can be accessed through a DBSession instance
Base.metadata.bind = engine

DBSession = sessionmaker(bind=engine)
# A DBSession() instance establishes all conversations with the database
# and represents a "staging zone" for all the objects loaded into the
# database session object. Any change made against the objects in the
# session won't be persisted into the database until you call
# session.commit(). If you're not happy about the changes, you can
# revert all of them back to the last commit by calling
# session.rollback()
session = DBSession()


# Load every previously discovered instagram influencer row from CSV.
influencers, infl_header = helpers.load_csv(dirs.dirs_dict["discoveries"]["instagram"])

# Insert a Profile in the profile table
for influencer in influencers:
    # Drop columns not wanted in the database record.
    for key in ("time_pulled",):
        influencer.pop(key)
    # NOTE(review): this Profile instance is never used afterwards —
    # get_or_create below builds its own record from `defaults`; confirm
    # whether this line can be removed.
    profile = Profile(**influencer)
    profile_record, created = get_or_create(session, Profile, defaults=influencer, user_id=influencer["user_id"])
    print("created" if created else "--existed")
示例#19
0
def vaccines():
    """Displays vaccine tracker.

    GET renders the overview page for all vaccines. POST handles three
    form flows: "search" (look up a vaccine by abbreviation),
    "datasearch" (filter one vaccine's data by country and year range),
    and a per-vaccine button (render that vaccine's full data).
    """
    # Human-readable description for each supported vaccine abbreviation.
    vaccines = {
        "MCV1":
        "The percentage of children under 1 year of age who have received at least one dose of measles-containing vaccine in a given year. For countries recommending the first dose of measles vaccine in children over 12 months of age, the indicator is calculated as the proportion of children less than 12-23 months of age receiving one dose of measles-containing vaccine.",
        "MCV2":
        "The percentage of children who have received two doses of measles containing vaccine (MCV2) in a given year, according to the nationally recommended schedule.",
        "BCG":
        "The percentage of 1-year-olds who have received one dose of bacille Calmette-Guérin (BCG) vaccine in a given year.",
        "DTP3":
        "The percentage of 1-year-olds who have received three doses of the combined diphtheria, tetanus toxoid and pertussis vaccine in a given year.",
        "PAB":
        "The proportion of neonates in a given year that can be considered as having been protected against tetanus as a result of maternal immunization.",
        "PCV3":
        "The percentage of 1-year-olds who have received three doses of pneumococcal conjugate vaccine (PCV3) in a given year.",
        "HepB3":
        "The percentage of 1-year-olds who have received three doses of hepatitis B vaccine in a given year.",
        "Pol3":
        "The percentage of 1-year-olds who have received three doses of polio vaccine in a given year.",
        "Hib3":
        "The percentage of 1-year-olds who have received three doses of Haemophilus influenzae type B vaccine in a given year.",
        "ROTAC":
        "The percentage of surviving infants who received the final recommended dose of rotavirus vaccine, which can be either the 2nd or the 3rd dose depending on the vaccine in a given year."
    }

    # Dictionary of dictionaries for vaccine data
    # (loaded per request; presumably keyed country -> yearly values —
    # TODO confirm against load_csv).
    all_vaccines = {}
    for vaccine in vaccines:
        all_vaccines[vaccine] = load_csv(vaccine)
    if request.method == "GET":
        return render_template("vaccines.html", all_vaccines=all_vaccines)
    if request.method == "POST":
        # Flow 1: lookup by typed abbreviation.
        if "search" in request.form:
            # Check that user entered vaccine abbreviation
            if not request.form.get("abbr"):
                flash("Must enter vaccine abbr.")
                return redirect("/vaccines")

            # Check if user input is an actual vaccine
            abbr = request.form.get("abbr")
            if abbr not in vaccines:
                flash("Data unavailable")
                return redirect("/vaccines")
            else:
                return render_template("vaccinedata.html",
                                       vaccine=abbr,
                                       vaccine_data=all_vaccines[abbr],
                                       vaccine_info=vaccines[abbr],
                                       startyear=1980,
                                       endyear=2018)

        # Flow 2: filter one vaccine's data by country and year range.
        if "datasearch" in request.form:
            vaccine = request.form["datasearch"]
            # If nothing is inputted, refresh original page
            if not request.form.get("country") and not request.form.get(
                    "startyear") and not request.form.get("endyear"):
                return render_template("vaccinedata.html",
                                       vaccine=vaccine,
                                       vaccine_data=all_vaccines[vaccine],
                                       vaccine_info=vaccines[vaccine],
                                       startyear=1980,
                                       endyear=2018)

            # Check that country exists
            country = request.form["country"]
            if country != "" and country not in all_vaccines[vaccine]:
                flash("Country data not available")
                return render_template("vaccinedata.html",
                                       vaccine=vaccine,
                                       vaccine_data=all_vaccines[vaccine],
                                       vaccine_info=vaccines[vaccine],
                                       startyear=1980,
                                       endyear=2018)

            # Check that years are valid (the dataset covers 1980-2018).
            startyear = request.form["startyear"]
            endyear = request.form["endyear"]
            if startyear != "" and (int(startyear) > 2018
                                    or int(startyear) < 1980):
                flash("Invalid start year")
                return render_template("vaccinedata.html",
                                       vaccine=vaccine,
                                       vaccine_data=all_vaccines[vaccine],
                                       vaccine_info=vaccines[vaccine],
                                       startyear=1980,
                                       endyear=2018)
            if endyear != "" and (int(endyear) > 2018 or int(endyear) < 1980):
                flash("Invalid end year")
                return render_template("vaccinedata.html",
                                       vaccine=vaccine,
                                       vaccine_data=all_vaccines[vaccine],
                                       vaccine_info=vaccines[vaccine],
                                       startyear=1980,
                                       endyear=2018)
            if startyear != "" and endyear != "" and int(startyear) > int(
                    endyear):
                flash("End year must be later than start year")
                return render_template("vaccinedata.html",
                                       vaccine=vaccine,
                                       vaccine_data=all_vaccines[vaccine],
                                       vaccine_info=vaccines[vaccine],
                                       startyear=1980,
                                       endyear=2018)

            # Missing years default to the full 1980-2018 range.
            start = 1980
            if startyear != "":
                start = int(startyear)
            end = 2018
            if endyear != "":
                end = int(endyear)

            # Restrict to one country when given, else keep all countries.
            spliced_dict = {}
            if country != "":
                spliced_dict[country] = all_vaccines[vaccine][country]
            else:
                spliced_dict = all_vaccines[vaccine]
            # Slice each country's series to [start, end]. The index math
            # assumes values are ordered newest-first, 2018 down to 1980 —
            # TODO confirm against the CSV layout.
            for country in spliced_dict:
                spliced_dict[country] = spliced_dict[country][2018 - end:2019 -
                                                              start]

            return render_template("vaccinedata.html",
                                   vaccine=vaccine,
                                   vaccine_data=spliced_dict,
                                   vaccine_info=vaccines[vaccine],
                                   startyear=start,
                                   endyear=end)

        # Flow 3: a per-vaccine button named after the abbreviation.
        # NOTE(review): a POST matching none of these flows falls through
        # and implicitly returns None — verify whether a fallback response
        # exists beyond this view of the file.
        for vaccine in vaccines:
            if vaccine in request.form:
                return render_template("vaccinedata.html",
                                       vaccine=vaccine,
                                       vaccine_data=all_vaccines[vaccine],
                                       vaccine_info=vaccines[vaccine],
                                       startyear=1980,
                                       endyear=2018)