Example #1
def merge_json():
    final_dict = {}
    dept_dict = read_json("year_dept.json")
    category_dict = read_json("year_category.json")
    for each in dept_dict:
        each["CREATION YEAR"] = str(each["CREATION YEAR"])
        each["ZIP CODE"] = str(int(each["ZIP CODE"]))

        if each["CREATION YEAR"] not in final_dict.keys():
            final_dict[each["CREATION YEAR"]] = {}
        if each["ZIP CODE"] not in final_dict[each["CREATION YEAR"]].keys():
            final_dict[each["CREATION YEAR"]][each["ZIP CODE"]] = {}
        if "DEPARTMENT" not in final_dict[each["CREATION YEAR"]][each["ZIP CODE"]].keys():
            final_dict[each["CREATION YEAR"]][each["ZIP CODE"]]["DEPARTMENT"] = []
        dept = {"name": each["DEPARTMENT"], "count": each["COUNT"]}
        final_dict[each["CREATION YEAR"]][each["ZIP CODE"]]["DEPARTMENT"].append(dept)

    for each in category_dict:
        each["CREATION YEAR"] = str(each["CREATION YEAR"])
        each["ZIP CODE"] = str(int(each["ZIP CODE"]))
        if each["CREATION YEAR"] not in final_dict.keys():
            final_dict[each["CREATION YEAR"]] = {}
        if each["ZIP CODE"] not in final_dict[each["CREATION YEAR"]].keys():
            final_dict[each["CREATION YEAR"]][each["ZIP CODE"]] = {}
        if "CATEGORY" not in final_dict[each["CREATION YEAR"]][each["ZIP CODE"]].keys():
            final_dict[each["CREATION YEAR"]][each["ZIP CODE"]]["CATEGORY"] = []
        dept = {"name": each["CATEGORY"], "count": each["COUNT"]}
        final_dict[each["CREATION YEAR"]][each["ZIP CODE"]]["CATEGORY"].append(dept)
    write_json(final_dict, "final.json")
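Example #1 depends on read_json and write_json helpers that are not shown in the listing. A minimal sketch of what they are assumed to do (the data-first, filename-second order of write_json is inferred only from the call sites above):

import json

# Hypothetical helpers assumed by Examples #1-#3.
def read_json(filename):
    # Load a JSON file into a Python object (here: a list of record dicts).
    with open(filename, "r", encoding="utf-8") as fp:
        return json.load(fp)


def write_json(data, filename):
    # Serialize `data` to `filename` as UTF-8 JSON.
    with open(filename, "w", encoding="utf-8") as fp:
        json.dump(data, fp, ensure_ascii=False, indent=2)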
Example #2
def process_time_series_data():
    print("values.")
    data_311 = pd.read_csv(final_data)
    data_311 = data_311[data_311['CREATION YEAR'] >= 2015]
    data_311['DAYS TO CLOSE'] = data_311['DAYS TO CLOSE'].apply(lambda x: str(x).replace(",", ""))
    data_311['DAYS TO CLOSE'] = data_311['DAYS TO CLOSE'].astype("float64")
    time_series = data_311.groupby(by=['CREATION YEAR', 'CREATION MONTH', 'ZIP CODE', 'DEPARTMENT'])[
        'DAYS TO CLOSE'].mean().reset_index().to_dict('records')
    write_json(time_series, "time_series.json")
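The comma stripping above handles thousands separators in the DAYS TO CLOSE column; as an aside, pandas can also strip them while reading the CSV, which would likely make the two apply/astype lines unnecessary (shown only as an alternative sketch, not the original author's code):

# Alternative sketch: let pandas parse "1,234"-style numbers at load time.
data_311 = pd.read_csv(final_data, thousands=",")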
Example #3
def process_final_data():
    data_311 = pd.read_csv(final_data)
    data_311 = data_311[data_311['CREATION YEAR'] >= 2015]
    list_records_dept = data_311.groupby(by=['CREATION YEAR', 'ZIP CODE', 'DEPARTMENT'])['CASE ID'].count().reset_index(
        name='COUNT').to_dict('records')
    list_records_category = data_311.groupby(by=['CREATION YEAR', 'ZIP CODE', 'CATEGORY'])[
        'CASE ID'].count().reset_index(
        name='COUNT').to_dict('records')

    write_json(list_records_dept, "year_dept.json")
    write_json(list_records_category, "year_category.json")
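Examples #3 and #1 form a small pipeline: process_final_data() writes the year_dept.json and year_category.json files that merge_json() then combines. Chained together (assuming both functions live in the same module):

process_final_data()   # writes year_dept.json and year_category.json
merge_json()           # reads both files and writes final.json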
Example #4
def save_result_json(output_path, imgname, res):
    """
        Save the generated coordinates and images.
    """
    img_op = os.path.join(output_path, "images")
    file_utils.check_path(img_op)
    msg_op = os.path.join(output_path, "message")
    file_utils.check_path(msg_op)
    txt_op = os.path.join(output_path, "labels")
    file_utils.check_path(txt_op)

    ninstance = len(res)
    for i in range(ninstance):
        print(colorize(Color.GREEN, 'added into the db %s ' % res[i]['txt']))
        temp = res[i]
        img = temp['img']
        word_bb = temp['wordBB']
        char_bb = temp['charBB']
        txt = temp['txt']
        # TODO: a string containing a newline corresponds to two boxes
        new_text = []
        for line in txt:
            arr = line.split("\n")
            new_text.extend(arr)
        # (n,4,2): transpose word_bb into the usual box ordering
        # (note: the uint8 cast assumes coordinates fit in the 0-255 range)
        word_boxes = np.transpose(word_bb).astype('uint8')
        #
        dname = "%s_%d" % (imgname, i) + ".jpg"
        print(dname, new_text)
        img_file = os.path.join(img_op, dname)
        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
        cv2.imwrite(img_file, img)
        result_json = {
            'text': new_text,
            'word_pos': word_bb.tolist(),
            'char_pos': char_bb.tolist()
        }
        json_utils.write_json(msg_op, dname, result_json)
        # TODO: unify the output format
        # write labels: the four corner coordinates plus the text, comma-separated
        txt_name = os.path.splitext(dname)[0] + ".txt"
        f2 = open(os.path.join(txt_op, txt_name), 'w', encoding='utf-8')
        writer = csv.writer(f2)
        for j in range(len(txt)):
            box = word_boxes[j]
            word = new_text[j]
            line = np.append(box.reshape(-1), word)
            writer.writerow(line)
        f2.close()
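file_utils.check_path and json_utils.write_json above are project-local utilities that are not part of the listing; a rough sketch of the behaviour they are assumed to have (names and signatures inferred only from the call sites):

import os
import json

def check_path(path):
    # Create the directory (and parents) if it does not exist yet.
    os.makedirs(path, exist_ok=True)


def write_json(out_dir, name, obj):
    # Assumed signature from json_utils.write_json(msg_op, dname, result_json):
    # write `obj` as <out_dir>/<name>.json.
    target = os.path.join(out_dir, os.path.splitext(name)[0] + ".json")
    with open(target, "w", encoding="utf-8") as fp:
        json.dump(obj, fp, ensure_ascii=False)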
Example #5
def process_dept_df():
    counter = 0
    dept_df = final_data[['CASE ID', 'DESCRIPTION', 'DEPARTMENT']]
    for each in list(dept_df.DEPARTMENT.unique()):
        dept_category[str(counter)] = each
        counter += 1
    write_json(dept_category, dept_json)
    print('finished')
    dept_df['label'] = dept_df.DEPARTMENT.apply(apply_dept_label)
    dept_df.rename(columns={
        'CASE ID': 'u_id',
        'DESCRIPTION': 'desc'
    },
                   inplace=True)
    write_to_csv(dept_df, dept_file)
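apply_dept_label and write_to_csv are not shown either; a hypothetical sketch consistent with how they are called above (apply_dept_label is assumed to invert the dept_category mapping built in this function):

def apply_dept_label(department):
    # Map a department name back to the numeric label assigned to it
    # in dept_category ("0" -> name, "1" -> name, ...).
    for label, name in dept_category.items():
        if name == department:
            return int(label)
    return -1  # department not seen when the mapping was built


def write_to_csv(df, filename):
    # Assumed thin wrapper around DataFrame.to_csv.
    df.to_csv(filename, index=False)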
Example #6
def process_prob_df():
    counter = 0
    print(list(final_data['REQUEST TYPE'].unique()))
    final_data['label'] = final_data['REQUEST TYPE'].apply(apply_prob)
    print(list(final_data.label.unique()))
    for each in list(final_data.label.unique()):
        prob_category[str(counter)] = each
        counter += 1
    write_json(prob_category, prob_json)
    final_data['label'] = final_data['label'].apply(apply_prob_label)
    prob_df = final_data[['CASE ID', 'DESCRIPTION', 'label']]
    prob_df.rename(columns={
        'CASE ID': 'u_id',
        'DESCRIPTION': 'desc'
    },
                   inplace=True)
    write_to_csv(prob_df, prob_file)
    print('finished')
Example #7
    def save_log(self, filename, env_to_file=None):
        assert filename is not None, "filename not supplied. save_log failed"

        lg = deepcopy(self.log)
        for _, drone_log in lg["drones"].items():
            drone_log["info"] = drone_log["info"].to_JSONable()
            for step, step_log in drone_log["trajectory"].items():
                drone_log["trajectory"][step] = step_log.to_JSONable()

        if isinstance(self.log["environment"], str):
            lg["environment"] = {"path": self.log["environment"]}
        else:
            if env_to_file is None:
                lg["environment"] = self.log["environment"].to_JSONable()
            else:
                lg["environment"] = {"path": env_to_file}
                write_json(env_to_file, self.log["environment"].to_JSONable())

        write_json(filename, lg)
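Note that save_log calls write_json with the filename first and the payload second, the reverse of the order used in Examples #1-#3, so it is presumably a different project-local helper. A minimal sketch under that assumption:

import json

def write_json(filename, obj):
    # Assumed helper for this example: path first, JSON-serializable payload second.
    with open(filename, "w", encoding="utf-8") as fp:
        json.dump(obj, fp, indent=2)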
Example #8
def get_movie_info():
    headers = {'User-Agent': 'Mozilla/5.0 xxxxxx'}
    basel = 'https://movie.douban.com/subject/1292213/'
    html = requests.get(basel,
                        headers=headers).content.decode('utf-8', 'ignore')
    url_content = re.search(
        r'"@context": "http://schema.org",(.*?)"ratingValue": "9.2"', html,
        re.S)
    texts = url_content.group()  # get the whole string matched by the regular expression
    texts = str("{" + texts + "}}")
    # important
    data = json.loads(texts, strict=False)
    movie_info = {
        'name': data['name'],
        'author': data['author'],
        'actor': data['actor'],
        'director': data['director']
    }
    print(movie_info)
    ju.write_json(data, r'data/data.json')
Example #9
def process_json():
    time_series_dict = read_json("time_series.json")
    final_dict = {}
    for each in time_series_dict:
        each["CREATION YEAR"] = str(each["CREATION YEAR"])
        each["ZIP CODE"] = str(int(each["ZIP CODE"]))
        each["CREATION MONTH"] = str(each["CREATION MONTH"])

        if each["CREATION YEAR"] not in final_dict.keys():
            final_dict[each["CREATION YEAR"]] = {}
        if each["ZIP CODE"] not in final_dict[each["CREATION YEAR"]].keys():
            final_dict[each["CREATION YEAR"]][each["ZIP CODE"]] = {}
        if "DEPARTMENT" not in final_dict[each["CREATION YEAR"]][each["ZIP CODE"]].keys():
            final_dict[each["CREATION YEAR"]][each["ZIP CODE"]]["DEPARTMENT"] = []

        dept = {
            "CREATION MONTH": each["CREATION MONTH"],
            "NAME": each["DEPARTMENT"],
            "COUNT": each["DAYS TO CLOSE"]
        }
        final_dict[each["CREATION YEAR"]][each["ZIP CODE"]]["DEPARTMENT"].append(dept)
    write_json(final_dict, "time_series_final.json")
Example #10
                    if DBG:
                        print "Item satisfies auto_contact_sms for filter but already contacted", f
            if f.satisfies_auto_contact_email(item):
                if DBG:
                    print "Item satisfies auto_contact_email for filter", f
                if not already_auto_contacted_email(item):
                    # Auto contact
                    if not auto_contact_email(item, f):
                        printe("ERROR, something went wrong while trying to auto_contact_email")
                else:
                    if DBG:
                        print "Item satisfies auto_contact_email for filter but already contacted", f


f_existing_items = open(FILTERED_ITEMS_FILEPATH, 'w+')
total_items = already_existing_items.values() + new_items

# While debugging, use pretty-printed JSON, when not debugging anymore, use compact notation for space efficiency
if DBG:
    json_repr = json.dumps(total_items, indent=4, separators=(',', ': '))
else:
    json_repr = json.dumps(total_items)
f_existing_items.write(json_repr)
f_existing_items.close()

write_json(passed_alerts, PASSED_ALERTS_FILEPATH)

write_json(passed_mail_contacts, PASSED_MAILS_FILEPATH)

write_json(passed_sms_contacts, PASSED_SMS_FILEPATH)
Example #11
 def dump_data(self):
     print "Dumping data to %s" % self.fname_log_items
     write_json(self.log_items, self.fname_log_items)
     print "Dumping data to %s" % self.fname_serp_items
     write_json(self.serp_items, self.fname_serp_items)
Example #12
for word in segment:
    if word.strip() not in stopwords:
        if len(word) > 1:
            if word != '\t':
                if word != '\r\n':
                    # count word frequency
                    if word in word_:
                        word_[word] += 1

                    else:
                        word_[word] = 1

print(word_)
len(word_)
del word_['nbsp']
ju.write_json(word_, os.getcwd() + r"/test/text_data.json")
word_ = sorted(word_.items(), key=lambda x: x[1], reverse=True)
dic_temp = {}
for word in word_:
    dic_temp[word[0]] = word[1]
ju.write_json(dic_temp, os.getcwd() + r"/test/text_data.json")
dic_temp["姜明"]
for word in word_:
    name.append(word[0])
    value.append(word[1])

name.index("雪艳姐")
value[184]
generatepath = os.getcwd() + r"/test/test_cloud.html"
name[:200]
value[:200]
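The frequency loop at the top of this example can also be written with collections.Counter; a sketch that keeps the same filtering rules (shown only as an alternative, not as the original author's code):

from collections import Counter

# Apply the same filters as the nested ifs above, then count in one pass.
filtered = [w for w in segment
            if w.strip() not in stopwords and len(w) > 1 and w not in ('\t', '\r\n')]
word_ = dict(Counter(filtered))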
Example #13
 def close_spider(self, spider):
     print "Dumping data to %s" % self.fname
     write_json(self.items, self.fname)