def main():
    trail_df, weather_df = dg.get_data()
    merged = dg.merge_dataframes(trail_df, weather_df)
    # utils.print_heads(merged=merged)
    # print('merged data')
    pseudo_clock(merged)
    pseudo_clock_NN(merged)
Example #2
def get_answer(topic_id, topic_name):
    # Ignoring changes to the remote data: when step is smaller than
    # limit, sampled pages are guaranteed to overlap.
    start_time = time.perf_counter()  # start timing
    zhi = ZhiHu()
    func = zhi.questions.answers

    question_list = load_question(topic_name)
    total_num = len(question_list)
    print("[获取回答] ===== 正在准备请求 {} 共有问题数 {} =====".format(topic_name, total_num))
    i = 0
    answer_all = []
    fetch_body = []
    for question in question_list:
        i += 1
        question_id = question[0]
        question_ansnum = int(question[1])

        fetch_body.append({"identifier": question_id,
                           "query_args": ["content"],
                           "range": [0, question_ansnum]})
        # break

    print("[获取回答] ===== 正在请求数据 {} 共有问题数 {} =====".format(topic_name, total_num))
    res = get_data(fetch_body, func, process_num=PROCESS_NUM,
                   max_process_num=MAX_PROCESS_NUM,
                   flood_discharge_ratio=FLOOD_DISCHARGE_RATIO,
                   floodplain_ratio=FLOODPLAIN_RATIO,
                   headers_pool=HEADERS_POOL)

    # print(res)

    print("[获取回答] ===== 正在处理数据 {} 共有问题数 {} =====".format(topic_name, total_num))
    i = 0
    for question_id, question_result in res.items():
        i += 1
        answer_list = question_result["data"]
        if i % 1000 == 0:
            print("[处理问题 {} / {}]".format(i, total_num), question_id)
        for item in answer_list:
            answer_id = item["id"]
            raw_ans = item["content"]
            question_content = item["question"]["title"]
            answer_content = remove_tags(raw_ans)
            answer_all.append((question_id, answer_id, question_content, answer_content))

    print("[获取回答] ===== 正在保存数据 {} 共有问题数 {} =====".format(topic_name, total_num))
    file_name = str(topic_name) + "_answers.csv"
    file_path = os.path.join("./data", file_name)
    with open(file_path, "a", encoding="utf-8-sig", newline='') as file:
        writer = csv.writer(file)
        writer.writerows(answer_all)
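
get_answer leans on two helpers that don't appear in the example. A minimal sketch of both, under the assumption that remove_tags only needs to strip HTML markup from an answer body and that load_question reads back the CSV written by get_topic in the next example:

import csv
import re


def remove_tags(raw_html):
    # Strip HTML tags from an answer body. A regex is enough for plain
    # display text; an HTML parser would be more robust.
    return re.sub(r"<[^>]+>", "", raw_html)


def load_question(topic_name):
    # Read back the (question_id, answer_count, title) rows that
    # get_topic appends to ./data/<topic_name>_topic.csv.
    with open("./data/{}_topic.csv".format(topic_name),
              encoding="utf-8-sig", newline='') as file:
        return [row for row in csv.reader(file) if row]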
Example #3
def get_topic(topic_id, topic_name, topic_ansnum):
    # Ignoring changes to the remote data: when step is smaller than
    # limit, sampled pages are guaranteed to overlap.
    start_time = time.perf_counter()  # start timing
    zhi = ZhiHu()
    func = zhi.topic.timeline_question

    print("[获取问题] ===== 正在获取数据 {} 共有回答数 {} =====".format(
        topic_name, topic_ansnum))

    fetch_dict = [{
        "identifier": topic_id,
        "query_args": ["answer_count"],
        "range": [0, topic_ansnum]
    }]

    res = get_data(fetch_dict,
                   func,
                   process_num=PROCESS_NUM,
                   max_process_num=MAX_PROCESS_NUM,
                   flood_discharge_ratio=FLOOD_DISCHARGE_RATIO,
                   floodplain_ratio=FLOODPLAIN_RATIO,
                   headers_pool=HEADERS_POOL)

    print("[获取问题] ===== 正在处理回答 {} 共有回答数 {} =====".format(
        topic_name, topic_ansnum))

    data = res[topic_id]["data"]
    num = len(data)
    question_set = set()
    for item in data:
        question_id = item["target"]["id"]
        question_title = item["target"]["title"]
        ans_count = item["target"]["answer_count"]
        if ans_count == 0:
            continue
        question_set.add((question_id, ans_count, question_title))

    print("[获取问题] ===== 正在保存回答 {} 共有回答数 {} =====".format(
        topic_name, topic_ansnum))

    file_name = str(topic_name) + "_topic.csv"
    file_path = os.path.join("./data", file_name)
    with open(file_path, "a", encoding="utf-8-sig", newline='') as file:
        writer = csv.writer(file)
        writer.writerows(question_set)
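
Neither example shows how the two steps are chained. A hypothetical driver, with placeholder topic values (the real id, name, and answer count would come from the Zhihu topic page itself):

if __name__ == '__main__':
    # Placeholder values, not taken from the original source.
    TOPIC_ID = "19550994"
    TOPIC_NAME = "example_topic"
    TOPIC_ANSNUM = 10000
    get_topic(TOPIC_ID, TOPIC_NAME, TOPIC_ANSNUM)   # writes <name>_topic.csv
    get_answer(TOPIC_ID, TOPIC_NAME)                # writes <name>_answers.csv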
Example #4
# installed libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.optimize import curve_fit
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, RobustScaler

# local files
import data_getter
import processing

plt.style.use('ggplot')

x_data, _, data = data_getter.get_data('as7263 mango verbose')
print(data.columns)
print('(((((((')  # crude visual separator in the console output
# x_data = x_data[data['integration time'] == 200]
# x_data = x_data[data['position'] == 'pos 2']
# data = data[data['integration time'] == 200]
# data = data[data['position'] == 'pos 2']
# chloro_data = data.groupby('Leaf number', as_index=True).mean()
data = data.groupby('Leaf number', as_index=True).mean()
data_columns = []
for column in data.columns:
    if 'nm' in column:
        data_columns.append(column)

x_data = data[data_columns]
print(data)
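
curve_fit is imported above but the excerpt ends before the fit itself. A sketch of the kind of single-channel fit this sets up, with an assumed exponential model and an assumed 'Total Chlorophyll (ug/ml)' column (both hypothetical; the column name is borrowed from the other examples):

def model(x, a, b):
    # Assumed model form; the original fit function is not shown.
    return a * np.exp(b * x)


y = data['Total Chlorophyll (ug/ml)'].to_numpy()
popt, pcov = curve_fit(model, x_data[data_columns[0]], y,
                       p0=(1.0, 0.001), maxfev=10000)
print("fitted parameters:", popt)
print("R2:", r2_score(y, model(x_data[data_columns[0]], *popt)))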
Example #5
# installed libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import GroupShuffleSplit
from sklearn import linear_model
# local files
import data_getter

# plt.xkcd()
plt.style.use('seaborn')

# data = pd.read_csv('mango_chloro_refl3.csv')
x_data, _, full_data = data_getter.get_data('as7262 betal')

# full_data = pd.read_csv('mango_flouro_rows.csv')
# full_data = pd.read_csv('as7262_mango.csv')
# full_data = full_data.groupby('Leaf number', as_index=True).mean()

# data = data.loc[(data['integration time'] == 250)]
print(full_data.columns)


LEDs = ['White LED', 'IR LED', 'UV (405 nm) LED', '390 nm LED', '395 nm LED',
        '400 nm LED', '405 nm LED', '410 nm LED', '425 nm LED', '455 nm LED',
        '465 nm LED', '470 nm LED', '475 nm LED', '480 nm LED', '505 nm LED',
        '525 nm LED', '630 nm LED', '890 nm LED', '940 nm LED']
LED = LEDs[0]
data_columns = []
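
The excerpt stops right after choosing a LED. Presumably the next step collects the columns measured under that LED, mirroring the 'nm' filter used in the other examples (a guess, since the real column layout isn't shown):

for column in full_data.columns:
    if LED in column:  # assumed naming: columns tagged with the LED label
        data_columns.append(column)
x_data = full_data[data_columns]
print(data_columns)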
Example #6
import pandas as pd
from sklearn.cross_decomposition import PLSRegression
from sklearn.preprocessing import StandardScaler

from sklearn import svm
from sklearn.datasets import make_moons, make_blobs
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

import data_getter
import processing

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
x_data, y, data = data_getter.get_data('as7262 betal')
# x_data = processing.snv(x_data)
y = y['Total Chlorophyll (ug/ml)']
print(data.index)
results = pd.DataFrame([], index=data.index)
print(results)
outliers_fraction = 0.1
algorithms = [("Elliptic Envelope",
               EllipticEnvelope(contamination=outliers_fraction)),
              ("One-Class SVM",
               svm.OneClassSVM(nu=outliers_fraction, kernel="rbf", gamma=0.1)),
              ("Isolation Forest",
               IsolationForest(contamination=outliers_fraction,
                               random_state=42)),
              ("Local Outlier Factor",
               LocalOutlierFactor(n_neighbors=35,
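
The example breaks off right after the detector list. A sketch of how such a list is typically consumed with scikit-learn (fit_predict for LocalOutlierFactor, fit then predict for the rest), storing the inlier/outlier labels per algorithm:

x_scaled = StandardScaler().fit_transform(x_data)
for name, algorithm in algorithms:
    if name == "Local Outlier Factor":
        # LOF has no separate predict() in its default (non-novelty) mode.
        labels = algorithm.fit_predict(x_scaled)
    else:
        labels = algorithm.fit(x_scaled).predict(x_scaled)
    results[name] = labels  # 1 = inlier, -1 = outlier
print(results)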
Example #7

import matplotlib.pyplot as plt  # restored: the head of this example was cut off and plt is used below

from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import QuantileTransformer, PowerTransformer
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.svm import LinearSVR, NuSVR, SVR

# local files
import data_getter
import processing

plt.style.use('seaborn')

# data = pd.read_csv('as7262_mango.csv')
# data = data.groupby('Leaf number', as_index=True).mean()

x_data, data = data_getter.get_data("as7262 mango",
                                    remove_outlier=True,
                                    only_pos2=False)

data_columns = []
for column in data.columns:
    if 'nm' in column:
        data_columns.append(column)
print(data_columns)
# x_data = data[data_columns]

y_columns = [
    'Total Chlorophyll (ug/ml)', 'Chlorophyll a (ug/ml)',
    'Chlorophyll b (ug/ml)'
]

invert_y = False
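
The excerpt ends at invert_y. A hedged continuation showing how one target could be selected, optionally inverted, and cross-validated with one of the imported regressors (the y_columns names are assumed to exist in data):

y = data[y_columns[0]]
if invert_y:
    y = 1 / y
scores = cross_validate(SVR(kernel='rbf', C=10),
                        StandardScaler().fit_transform(x_data),
                        y, cv=5, scoring='r2', return_train_score=True)
print("test R2: {:.3f}".format(scores['test_score'].mean()))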
Example #8
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import StandardScaler

# local files
import data_getter

plt.style.use('ggplot')

# Prepare data
spectrum_data, chloro_data, full_data = data_getter.get_data("as7262 mango")

print(spectrum_data.columns)
print(chloro_data.columns)
print(full_data.columns)

class ModelFit(object):
    def __init__(self):
        self.test_score = []
        self.test_stdev = []
        self.train_score = []
        self.train_stdev = []

    def add_data(self, data):
        # The excerpt breaks off mid-method; presumably the dict returned
        # by cross_validate is unpacked into the four lists.
        self.test_score.append(data['test_score'].mean())
        self.test_stdev.append(data['test_score'].std())
        self.train_score.append(data['train_score'].mean())
        self.train_stdev.append(data['train_score'].std())
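
A hedged usage sketch for ModelFit: sweep PLS component counts with the imported ShuffleSplit/cross_validate and collect the mean scores per setting (the chlorophyll column name is assumed from the other examples):

x_scaled = StandardScaler().fit_transform(spectrum_data)
y = chloro_data['Total Chlorophyll (ug/ml)']  # assumed column name
fit = ModelFit()
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
for n_components in range(1, 7):
    pls = PLSRegression(n_components=n_components)
    fit.add_data(cross_validate(pls, x_scaled, y, cv=cv,
                                scoring='r2', return_train_score=True))
print(fit.test_score)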
Example #9
def main():
    trail_df, weather_df = dg.get_data()
    merged = dg.merge_dataframes(trail_df, weather_df)
    # utils.print_heads(trail_df, weather_df, merged)
    weather_predictor(merged, True)
Example #10

# installed libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.optimize import curve_fit
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, RobustScaler

# local files
import data_getter
import processing

plt.style.use('ggplot')

x_data, chloro_data, data = data_getter.get_data('as7262 betal')
print(data.columns)
print('(((((((')  # crude visual separator in the console output
# # print(data['integration time'].unique(), data['position'].unique())
# x_data = x_data[data['integration time'] == 200]
# x_data = x_data[data['position'] == 'pos 2']
# data = data[data['integration time'] == 200]
# data = data[data['position'] == 'pos 2']
# chloro_data = data.groupby('Leaf number', as_index=True).mean()

data_columns = []
for column in data.columns:
    if 'nm' in column:
        data_columns.append(column)
print(data_columns)
print(x_data)
Example #11

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# local files
import data_getter
import processing

plt.style.use('seaborn')

# data = pd.read_csv("as7262_mango.csv")

# data = data.loc[(data['position'] == 'pos 2')]
# data = data.loc[(data['integration time'] == 3)]
# data = data.groupby('Leaf number', as_index=True).mean()

x_data, _, data = data_getter.get_data('as7263 roseapple')
chloro_data = data.groupby('Leaf number', as_index=True).mean()

accent_column = chloro_data['Total Chlorophyll (ug/ml)'].to_numpy()
accent_column = accent_column / max(accent_column)
print(accent_column / max(accent_column))  # max is already 1.0 here, so this reprints the normalized values

alphas = np.linspace(0.1, 1, 10)
colors = np.zeros((chloro_data.shape[0], 4))
colors[:, 0] = 0.2
colors[:, 1] = 0.6
colors[:, 2] = 0.2
colors[:, 3] = accent_column

spectrum_data_columns = []
wavelengths = []
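
The example stops after initializing the two empty lists. A sketch of the likely intent, plotting each leaf's spectrum with the chlorophyll-scaled RGBA colors built above (the '### nm' column-name format is assumed from the other examples):

for column in chloro_data.columns:
    if 'nm' in column:
        spectrum_data_columns.append(column)
        wavelengths.append(int(column.split(' nm')[0]))  # assumed "650 nm"-style names

spectrum = chloro_data[spectrum_data_columns].to_numpy()
for i in range(spectrum.shape[0]):
    plt.plot(wavelengths, spectrum[i, :], color=colors[i])
plt.xlabel("wavelength (nm)")
plt.ylabel("sensor reading")
plt.title("as7263 roseapple, shaded by total chlorophyll")
plt.show()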
Example #12
from data_getter import get_data

import matplotlib.pyplot as plt

import networkx as nx

LOGIN = '******'
PASSWORD = '******'
graph_data = get_data(LOGIN, PASSWORD)
nodes = []
totals = []
edges = []

for info in graph_data:
    nodes.append(info['login'])
    totals.append(info['total'])

nodes.append(1)  # add a synthetic hub node; every user gets an edge to it below

G = nx.Graph()

G.add_nodes_from(nodes)

ebunches = []
for node, total in zip(nodes, totals):
    # connect each user to the hub, weighted by their total;
    # zip stops before the hub node itself, so no self-edge is created
    ebunches.append((node, 1, {'weight': total}))

G.add_edges_from(ebunches)
nx.draw_shell(G, with_labels=True, font_weight='bold')
plt.show()
Example #13

import matplotlib.pyplot as plt  # restored: the head of this example was cut off and plt is used below

from sklearn.linear_model import PassiveAggressiveRegressor, ridge_regression
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import learning_curve, ShuffleSplit, KFold
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import QuantileTransformer, PowerTransformer
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.svm import LinearSVR, NuSVR, SVR

# local files
import data_getter
import processing

plt.style.use('ggplot')

data, chloro_data = data_getter.get_data('as7263 roseapple')
# chloro_data = data.groupby('Leaf number', as_index=True).mean()
print(data)
print(chloro_data)
data_columns = []
print('=====')
print(data.columns)
for column in data.columns:
    if 'nm' in column:
        data_columns.append(column)
print(data_columns)
x_data = data[data_columns]
print(data.columns)
y_columns = [
    'Total Chlorophyll (ug/ml)', 'Chlorophyll a (ug/ml)',
    'Chlorophyll b (ug/ml)'
]  # the original excerpt breaks off here
Example #14

def get_data_by_tag():
    # Reconstructed scaffolding: the opening of this example was cut off.
    # Judging by the __main__ menu below, it presumably builds tag,
    # lang_lst, tag_projects and all_projects from the scraped data
    # before prompting for a language by index.
    ...  # missing setup
    while True:
        for j, lang in enumerate(lang_lst):
            print(j, lang)
        inp = input("\nyour choice:")
        if (is_int(inp)) and int(inp) in range(len(lang_lst)):
            inp = int(inp)
            lang = dict(enumerate(lang_lst))[inp]
            projects_lst = get_tag_projects_by_lang(lang, tag_projects, all_projects)
            title = ("Relevant Languages by tag: {tag} and projects "
                     "regarding the Language: {lang}").format(tag=tag, lang=lang)
            make_projects_page(title, "Relevant Languages by Tag",
                               lang_lst, projects_lst, all_projects)
            webbrowser.open('out.htm')
            break
        break


if __name__ == '__main__':
    # main loop
    options = ['Update data', 'Get Popular tags by order', 'Get Popular Programming languages by order']
    while True:
        for j, lang in enumerate(options):
            print(j, lang)
        inp = input("\nyour choice:")
        if (is_int(inp)) and int(inp) in range(len(options)):
            inp = int(inp)
            if inp == 0:
                data_getter.get_data()
                break
            elif inp == 1:
                get_data_by_tag()
                break
            elif inp == 2:
                get_data_by_lang()
                break
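
is_int is used by both menu loops but never defined in the excerpt; a minimal sketch:

def is_int(value):
    # True when the raw input string parses as a base-10 integer.
    try:
        int(value)
        return True
    except ValueError:
        return False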