continue if id_ in ids: print("duplicate ") continue date_nep = data["date_nepali"].split()[:3] yy = "".join([str(digit(c)) for c in date_nep[0]]) if int(yy) < 2076: print("filtered out article from year ", yy) continue dd = "".join([str(digit(c)) for c in date_nep[2]]) mm = month_map[date_nep[1]] nd = NepaliDate(yy, mm, dd, lang='nep') # print(date_nep, nd) # sys.exit(-1) data["date_english"] = nd.to_english_date().strftime( "%Y/%-m/%d") data["title"] = data["title"].replace(u'\xa0', u' ') data["subtitle"] = data["subtitle"].replace(u'\xa0', u' ') data["description"] = data["description"].replace( u'\xa0', u' ') data["source"] = "onlinekhabar" data["category"] = data["category"].split("/")[-1] # data['category'] = cat_map[data["category"].split("/")[-1]] cats.add(data["category"]) outfile.write(json.dumps(data) + "\n") ids.add(id_) cnt += 1 if cnt % 100 == 0: print("processed ", cnt, " articles") except Exception as ex: print(ex, date_nep, nd)
with open(f, 'r') as fp: for line in fp: data = json.loads(line) try: date_nep = data["date_nepali"].split(",")[1:3] # print(nd) yy = "".join([str(digit(c)) for c in date_nep[1][1:]]) if int(yy) < 2076: print("filtered out article from year ", yy) continue dd = "".join( [str(digit(c)) for c in date_nep[0].strip().split(' ')[1]]) mm = month_map[date_nep[0].strip().split(' ')[0]] nd = NepaliDate(yy, mm, dd, lang='nep') # print(yy, mm, dd, nd, , ) d_eng = nd.to_english_date() month = d_eng.month id_ = data["id"] if month not in {8, 9, 10}: print("filtered out article ", data["url"]) continue if id_ in ids: print("duplicate ") continue data["date_english"] = d_eng.strftime("%Y/%-m/%d") data["title"] = data["title"].replace(u'\xa0', u' ') data["subtitle"] = data["subtitle"].replace(u'\xa0', u' ') data["description"] = data["description"].replace( u'\xa0', u' ') data["source"] = "setopati" data["category"] = data["category"]
def convert_to_english(date):
    """Convert a hyphen-separated Nepali date string to an English date.

    The string is split on "-". Each piece is stripped of surrounding
    whitespace, and pieces that are not purely digits are ignored. Exactly
    three numeric pieces must remain; they are used, in order, as year,
    month and day to build a ``NepaliDate``.

    Args:
        date: Date string in year-month-day order, e.g. "2076-05-12".

    Returns:
        The result of ``NepaliDate(year, month, day).to_english_date()``
        (presumably a ``datetime.date`` -- confirm against the library).

    Raises:
        ValueError: If the string does not contain exactly three numeric
            components.
    """
    # Strip before the digit test so stray whitespace (for example a
    # trailing newline from a line read out of a file) does not cause an
    # otherwise valid component to be silently discarded.
    pieces = (piece.strip() for piece in date.split("-"))
    numbers = [int(piece) for piece in pieces if piece.isdigit()]

    # Fail with a message naming the offending input rather than the opaque
    # "not enough values to unpack" raised by a bare tuple unpacking.
    if len(numbers) != 3:
        raise ValueError(
            "expected a 'year-month-day' date with three numeric parts, "
            "got {!r}".format(date)
        )

    year, month, day = numbers
    return NepaliDate(year, month, day).to_english_date()