Example No. 1
    def __init__(self, d):
        # Parse the YYYY-MM-DD date field with the class's date regex.
        yyyymmdd = map(int, self.__date_re.match(d["date"]).groups())
        self.date = datetime.date(*yyyymmdd)
        self.url = util.unicode_to_ascii(d["url"])
        self.title = util.unicode_to_ascii(d.get("title", ""))
        self.body = util.unicode_to_ascii(d.get("body", ""))

        # Backfill a missing title from the body's first line (or a missing
        # body from the title); log records that have neither.
        if self.title == "" and self.body != "":
            self.title = self.body.splitlines()[0]
        elif self.body == "" and self.title != "":
            self.body = self.title
        elif self.title == "" and self.body == "":
            util.Log("%s epic-fail" % self.url)
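The constructor assumes a compiled class attribute __date_re that the excerpt does not show. A pattern like the following would satisfy the groups() call (hypothetical, assuming ISO-style dates and an import re at module level):

    # Hypothetical class attribute; e.g. "2014-07-01" -> (2014, 7, 1) after int().
    __date_re = re.compile(r"(\d{4})-(\d{2})-(\d{2})")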
Example No. 2
    def __init__(self, title, lead, text):
        # Normalize all three fields to ASCII up front.
        title, lead, text = [util.unicode_to_ascii(x) for x in [title, lead, text]]
        # lead, text = [x.lower() for x in [lead, text]]

        self.title = title
        self.lead = lead
        self.text = text
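Both constructors above delegate to util.unicode_to_ascii, which the excerpts do not show. A minimal sketch of such a helper in Python 3, assuming NFKD-based accent stripping (the real util module may behave differently):

    import unicodedata

    def unicode_to_ascii(s):
        # Decompose accented characters, then drop anything outside ASCII,
        # e.g. "café" -> "cafe".
        return unicodedata.normalize("NFKD", s).encode("ascii", "ignore").decode("ascii")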
Example No. 3
    def _prepare_cache_key(self, key):
        """
        Ensures a key is valid for memcached by converting to ASCII if
        necessary and replacing spaces. Returns the modified key,
        or raises a ValueError if the key is empty or too long.
        """
        try:
            ascii_key = key if isinstance(key, str) else key.encode("ascii")
        except UnicodeEncodeError:
            # Key contains non-ASCII characters; transliterate instead.
            ascii_key = unicode_to_ascii(key)
        ascii_key = ascii_key.replace(" ", "_")  # memcached keys may not contain spaces
        if not ascii_key:
            raise ValueError("Cache key is empty")
        if len(ascii_key) > SERVER_MAX_KEY_LENGTH:
            raise ValueError("Cache key is too long: " + ascii_key)
        return ascii_key
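For context, python-memcached defines SERVER_MAX_KEY_LENGTH = 250, and the try/except above targets Python 2, where a unicode key fails a plain .encode("ascii"). A usage sketch under those semantics (method receiver hypothetical):

    self._prepare_cache_key(u"na\xefve key")  # -> "naive_key": transliterated, space replaced
    self._prepare_cache_key(u"")              # raises ValueError: Cache key is empty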
Example No. 4
def readData():
    # Load the train/test documents, map each section name onto a coarse
    # category, and return tokenized texts with integer labels.

    trcount = dict()
    tecount = dict()

    trdata = list()
    trlabel = list()
    tedata = list()
    telabel = list()

    for f in tr_list_files:
        if not f.endswith(".txt"):
            continue
        with open(TR_DOC_DIR + '/' + f) as fin:
            text = json.load(fin)
            # Normalize the UTF-8 right single quote to an ASCII apostrophe.
            target = text['section'].encode("utf-8").replace("\xe2\x80\x99", "'")
            # Map the raw section name onto a coarse category; skip files
            # whose section matches none of the known groups.
            # if target in art:
            #     target = 'art'
            if target in sport:
                target = 'sport'
            elif target in business:
                target = 'business'
            elif target in tech:
                target = 'tech'
            elif target in entertain:
                target = 'entertain'
            elif target in opinion:
                target = 'opinion'
            elif target in world:
                target = 'world'
            elif target in us:
                target = 'us'
            else:
                continue
            trcount[target] = trcount.get(target, 0) + 1

            label = category_to_label(target)
            content = tokenize(unicode_to_ascii(text['text'].encode('utf-8')))

            # Accumulate each document's text per category.
            CORPUS.setdefault(target, []).append(" ".join(content))
            # for word in content:
            #     WORD_DICT.add(word)
            trdata.append(content)
            trlabel.append(label)

    # Same pipeline for the held-out test files.
    for f in te_list_files:
        if not f.endswith(".txt"):
            continue
        with open(TE_DOC_DIR + '/' + f) as fin:
            text = json.load(fin)
            target = text['section'].encode("utf-8").replace("\xe2\x80\x99", "'")
            # if target in art:
            #     target = 'art'
            if target in sport:
                target = 'sport'
            elif target in business:
                target = 'business'
            elif target in tech:
                target = 'tech'
            elif target in entertain:
                target = 'entertain'
            elif target in opinion:
                target = 'opinion'
            elif target in world:
                target = 'world'
            elif target in us:
                target = 'us'
            else:
                continue
            tecount[target] = tecount.get(target, 0) + 1

            label = category_to_label(target)
            content = tokenize(unicode_to_ascii(text['text'].encode('utf-8')))
            # for word in content:
            #     WORD_DICT.add(word)

            tedata.append(content)
            telabel.append(label)

    print("------Training data------")
    print(trcount)
    print("------Testing data------")
    print(tecount)
    # print(CORPUS['sport'])
    return trdata, trlabel, tedata, telabel
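A sketch of how the returned token lists might be fed to a vectorizer downstream; the repo's actual encoding step is not shown, and sklearn here is an assumed stand-in:

    from sklearn.feature_extraction.text import CountVectorizer

    trdata, trlabel, tedata, telabel = readData()
    vectorizer = CountVectorizer()
    # Re-join each token list so the vectorizer sees one string per document.
    X_train = vectorizer.fit_transform(" ".join(doc) for doc in trdata)
    X_test = vectorizer.transform(" ".join(doc) for doc in tedata)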
Example No. 5
def test_Given_unicodeName_When_unicodeToAscii_Then_returnAsciiName(
        input, output):
    assert unicode_to_ascii(input) == output
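The excerpt omits the fixture data this test takes; a sketch of the pytest decorator it presumably relies on, with one hypothetical pair (assuming accent-stripping behavior):

    import pytest

    @pytest.mark.parametrize("input, output", [
        ("Renée", "Renee"),  # hypothetical pair; the project's real cases are not shown
    ])
    def test_Given_unicodeName_When_unicodeToAscii_Then_returnAsciiName(
            input, output):
        assert unicode_to_ascii(input) == output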
Example No. 6
    def __str__(self):
        filename = unicode_to_ascii(self.filename(with_path=False))
        if self._db_id > 0:
            # Include the cache key once the object has a database id.
            return filename + ' ' + self.get_cache_key()
        else:
            return filename
Example No. 7
def classify(f_dir):
    """Classify every .txt JSON document under f_dir; return one metadata dict per file."""

    # Load the trained classifier and its feature vocabulary.
    with open(model_file, 'rb') as fm:
        model = pkl.load(fm)
    with open(features_file, 'rb') as ff:
        features = pkl.load(ff)

    outputs = list()

    files = os.listdir(f_dir)

    for f in files:
        output = dict()
        traindata = list()

        if not f.endswith(".txt"):
            continue

        with open(f_dir + "/" + f) as fin:
            text = json.load(fin)
            # if source == "NYT":
            output["title"] = unicode_to_ascii(text["title"].encode("utf-8"))
            output["date"] = get_time(
                unicode_to_ascii(text["time"].encode("utf-8")))
            output["url"] = unicode_to_ascii(text["url"].encode("utf-8"))
            output["source"] = unicode_to_ascii(text["source"].encode("utf-8"))
            # elif source == "FOX":
            #     output["title"] = unicode_to_ascii(text["title"].encode("utf-8"))
            #     output["date"] = get_time(unicode_to_ascii(text["time"].encode("utf-8")))
            #     output["url"] = unicode_to_ascii(text["url"].encode("utf-8"))
            #     output["source"] = source
            # else:
            #     output["title"] = unicode_to_ascii(text["title"].encode("utf-8"))
            #     output["date"] = get_time(unicode_to_ascii(text["tile"].encode("utf-8")))
            #     output["url"] = unicode_to_ascii(text["url"].encode("utf-8"))
            #     output["source"] = source
            content = tokenize(unicode_to_ascii(text["text"].encode("utf-8")))
            if not content:
                print("File %s has no text" % f)
                continue

            # Build a snippet from the first few sentences: cut at the
            # fourth period, or at the last one if there are fewer.
            raw_text = unicode_to_ascii(text["text"].encode("utf-8"))
            indexes = [i.start() for i in re.finditer(r"\.", raw_text)]
            if len(indexes) == 0:
                continue

            stop = min(len(indexes) - 1, 3)
            output["snippet"] = raw_text[0:indexes[stop] + 1]

            traindata.append(content)

        # Vectorize the single-document batch and take the most probable class.
        X_arr = encoding(traindata, features)
        probs = model.predict_proba(X_arr).tolist()
        Y = np.argmax(probs[0])

        output["category"] = label_to_category(Y)
        output["score"] = probs[0][Y]
        outputs.append(output)

    return outputs
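A typical invocation (directory name hypothetical):

    for result in classify("data/unclassified"):
        print("%s  %s (%.2f)" % (result["category"], result["title"], result["score"]))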