def __init__(self, d):
    # Parse the date string into a datetime.date via the class-level date regex.
    yyyymmdd = map(int, self.__date_re.match(d["date"]).groups())
    self.date = datetime.date(*yyyymmdd)
    self.url = util.unicode_to_ascii(d["url"])
    self.title = util.unicode_to_ascii(d.get("title", ""))
    self.body = util.unicode_to_ascii(d.get("body", ""))
    # When only one of title/body is present, fill the missing field from the other.
    if self.title == "" and self.body != "":
        self.title = self.body.splitlines()[0]
    elif self.body == "" and self.title != "":
        self.body = self.title
    elif self.body != "" and self.title != "":
        pass
    else:
        util.Log("%s has neither title nor body" % (self.url))
    return
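# The constructor above relies on a class-level __date_re pattern that is not shown
# here. A minimal sketch of what it might look like, assuming dates arrive as
# "YYYY-MM-DD" strings (the class name, pattern, and date format are assumptions,
# not confirmed by this code):
import re
import datetime

class _Entry:
    # Hypothetical pattern: three capture groups feeding datetime.date(year, month, day).
    __date_re = re.compile(r"(\d{4})-(\d{2})-(\d{2})")

    def _parse_date(self, raw):
        # Mirrors the constructor above: regex groups -> ints -> datetime.date.
        yyyymmdd = map(int, self.__date_re.match(raw).groups())
        return datetime.date(*yyyymmdd)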
def __init__(self, title, lead, text):
    title, lead, text = [util.unicode_to_ascii(x) for x in [title, lead, text]]
    # lead, text = [x.lower() for x in [lead, text]]
    self.title = title
    self.lead = lead
    self.text = text
    return
def _prepare_cache_key(self, key):
    """
    Ensures a key is valid for memcached by converting to ascii if necessary
    and replacing spaces.

    Returns the modified key, or raises a ValueError if the key is empty or
    too long.
    """
    try:
        ascii_key = key if isinstance(key, str) else key.encode("ascii")
    except UnicodeEncodeError:
        ascii_key = unicode_to_ascii(key)
    ascii_key = ascii_key.replace(" ", "_")
    if not ascii_key:
        raise ValueError("Cache key is empty")
    if len(ascii_key) > SERVER_MAX_KEY_LENGTH:
        raise ValueError("Cache key is too long: " + ascii_key)
    return ascii_key
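# Several functions in this file lean on a unicode_to_ascii() helper whose definition
# is not shown. A minimal sketch of one common way to implement it, assuming the goal
# is to strip accents and drop characters with no ASCII equivalent (this is an
# assumption about the real helper, not its actual definition):
import unicodedata

def unicode_to_ascii_sketch(value):
    # Decompose accented characters (NFKD), then keep only the ASCII bytes.
    if isinstance(value, bytes):
        value = value.decode("utf-8", "ignore")
    normalized = unicodedata.normalize("NFKD", value)
    return normalized.encode("ascii", "ignore").decode("ascii")

# Example: unicode_to_ascii_sketch(u"café") -> "cafe"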
def readData():
    trcount = dict()
    tecount = dict()
    trdata = list()
    trlabel = list()
    tedata = list()
    telabel = list()
    for f in tr_list_files:
        if not f.endswith(".txt"):
            continue
        with open(TR_DOC_DIR + '/' + f) as fin:
            text = json.load(fin)
        target = text['section'].encode("utf-8").replace("\xe2\x80\x99", "'")
        # if target in art:
        #     target = 'art'
        if target in sport:
            target = 'sport'
        elif target in business:
            target = 'business'
        elif target in tech:
            target = 'tech'
        elif target in entertain:
            target = 'entertain'
        elif target in opinion:
            target = 'opinion'
        elif target in world:
            target = 'world'
        elif target in us:
            target = 'us'
        else:
            continue
        if target not in trcount:
            trcount[target] = 0
        trcount[target] = trcount[target] + 1
        label = category_to_label(target)
        content = tokenize(unicode_to_ascii(text['text'].encode('utf-8')))
        if target not in CORPUS:
            CORPUS[target] = []
        CORPUS[target].append(" ".join(content))
        # for word in content:
        #     WORD_DICT.add(word)
        trdata.append(content)
        trlabel.append(label)
    for f in te_list_files:
        if not f.endswith(".txt"):
            continue
        with open(TE_DOC_DIR + '/' + f) as fin:
            text = json.load(fin)
        target = text['section'].encode("utf-8").replace("\xe2\x80\x99", "'")
        # if target in art:
        #     target = 'art'
        if target in sport:
            target = 'sport'
        elif target in business:
            target = 'business'
        elif target in tech:
            target = 'tech'
        elif target in entertain:
            target = 'entertain'
        elif target in opinion:
            target = 'opinion'
        elif target in world:
            target = 'world'
        elif target in us:
            target = 'us'
        else:
            continue
        if target not in tecount:
            tecount[target] = 0
        tecount[target] = tecount[target] + 1
        label = category_to_label(target)
        content = tokenize(unicode_to_ascii(text['text'].encode('utf-8')))
        # for word in content:
        #     WORD_DICT.add(word)
        tedata.append(content)
        telabel.append(label)
    print("------Training data------")
    print(trcount)
    print("------Testing data------")
    print(tecount)
    # print(CORPUS['sport'])
    return trdata, trlabel, tedata, telabel
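# readData() assumes module-level section groupings (sport, business, tech, ...) and a
# category_to_label() mapping defined elsewhere. A minimal sketch of how they could be
# wired up (the specific section names and label ordering are illustrative assumptions,
# not taken from the real project):
sport = {"Sports", "Pro Football"}
business = {"Business Day", "Economy"}

CATEGORIES = ["sport", "business", "tech", "entertain", "opinion", "world", "us"]

def category_to_label(category):
    # Map a normalized category name to an integer class label.
    return CATEGORIES.index(category)

def label_to_category(label):
    # Inverse mapping, used by classify() below.
    return CATEGORIES[label]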
def test_Given_unicodeName_When_unicodeToAscii_Then_returnAsciiName(input, output):
    assert unicode_to_ascii(input) == output
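# The (input, output) parameters of the test above suggest it is driven by pytest
# parametrization. A minimal sketch of how the cases might be supplied, with
# illustrative (assumed) input/output pairs:
import pytest

@pytest.mark.parametrize("input, output", [
    (u"café", "cafe"),               # accented characters reduced to ASCII
    (u"naïve", "naive"),
    ("plain ascii", "plain ascii"),  # already-ASCII input passes through unchanged
])
def test_unicode_to_ascii_sketch(input, output):
    assert unicode_to_ascii(input) == output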
def __str__(self):
    filename = unicode_to_ascii(self.filename(with_path=False))
    if self._db_id > 0:
        return filename + ' ' + self.get_cache_key()
    else:
        return filename
def classify(f_dir):
    model = pkl.load(open(model_file, 'rb'))
    features = pkl.load(open(features_file, 'rb'))
    outputs = list()
    files = os.listdir(f_dir)
    for f in files:
        output = dict()
        traindata = list()
        if not f.endswith(".txt"):
            continue
        with open(f_dir + "/" + f) as fin:
            text = json.load(fin)
        # if source == "NYT":
        output["title"] = unicode_to_ascii(text["title"].encode("utf-8"))
        output["date"] = get_time(unicode_to_ascii(text["time"].encode("utf-8")))
        output["url"] = unicode_to_ascii(text["url"].encode("utf-8"))
        output["source"] = unicode_to_ascii(text["source"].encode("utf-8"))
        # elif source == "FOX":
        #     output["title"] = unicode_to_ascii(text["title"].encode("utf-8"))
        #     output["date"] = get_time(unicode_to_ascii(text["time"].encode("utf-8")))
        #     output["url"] = unicode_to_ascii(text["url"].encode("utf-8"))
        #     output["source"] = source
        # else:
        #     output["title"] = unicode_to_ascii(text["title"].encode("utf-8"))
        #     output["date"] = get_time(unicode_to_ascii(text["tile"].encode("utf-8")))
        #     output["url"] = unicode_to_ascii(text["url"].encode("utf-8"))
        #     output["source"] = source
        content = tokenize(unicode_to_ascii(text["text"].encode("utf-8")))
        if content is None or len(content) == 0:
            print("File %s has no text" % f)
            continue
        raw_text = unicode_to_ascii(text["text"].encode("utf-8"))
        # Build a short snippet from the article start, cut at the fourth period
        # (or the last period if the text has fewer than four).
        indexes = [i.start() for i in re.finditer(r"\.", raw_text)]
        if len(indexes) == 0:
            continue
        if len(indexes) <= 3:
            stop = len(indexes) - 1
        else:
            stop = 3
        output["snippet"] = raw_text[0:indexes[stop] + 1]
        traindata.append(content)
        X_arr = encoding(traindata, features)
        probs = model.predict_proba(X_arr).tolist()
        Y = np.argmax(probs[0])
        output["category"] = label_to_category(Y)
        output["score"] = probs[0][Y]
        outputs.append(output)
    return outputs
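# A minimal usage sketch for classify(), assuming it is called from a small driver
# script, that get_time() returns a JSON-serializable string, and that the directory
# and output path below are placeholders rather than real project paths:
import json

if __name__ == "__main__":
    results = classify("data/articles")  # directory of *.txt files containing JSON documents
    with open("classified.json", "w") as fout:
        json.dump(results, fout, indent=2)
    print("classified %d articles" % len(results))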