Example #1
def parse_line(line):
    vid, title, comment = line[:-1].split('~')
    title = util.clean_text(title)
    comment = util.clean_text(comment)
    title_ix = split_to_words(title)
    if len(title_ix) == 0:
        return [], []
    comment_ix = split_to_words(comment)
    return title_ix, comment_ix
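For context, a minimal self-contained sketch of how a parser like this might be exercised; clean_text and split_to_words below are hypothetical stand-ins for the project's util helpers, not the original implementations, and the sample line is invented.

def clean_text(s):
    # Stand-in: the real util.clean_text does more aggressive normalization.
    return s.lower().strip()

def split_to_words(s):
    return s.split()

def parse_line(line):
    vid, title, comment = line[:-1].split('~')
    title_ix = split_to_words(clean_text(title))
    if len(title_ix) == 0:
        return [], []
    return title_ix, split_to_words(clean_text(comment))

print(parse_line("v123~Some Title~Nice video!\n"))
# -> (['some', 'title'], ['nice', 'video!'])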
Example #2
File: editor.py Project: shaurz/devo
    def LoadFile(self, path):
        self.SetReadOnly(True)
        self.Disable()

        old_path = self.path
        self.path = path
        self.sig_title_changed.signal(self)

        try:
            text = (yield async_call(read_file, path, "r"))
            text, self.file_encoding = decode_text(text)
            text = clean_text(text)

            self.modified_externally = False
            self.SetReadOnly(False)
            self.SetSyntaxFromFilename(path)
            self.SetText(text)
            self.SetSavePoint()

            if old_path:
                self.env.remove_monitor_path(old_path)
            self.env.add_monitor_path(path)
        except:
            self.path = old_path
            self.sig_title_changed.signal(self)
            self.sig_status_changed.signal(self)
            raise
        finally:
            self.Enable()
            self.SetReadOnly(False)
Example #3
def classify(input_file, delimiter=",", classifier_param="LinearSVC"):
    data = []
    with open(input_file, 'rU') as f:
        reader = csv.reader(f, delimiter=delimiter)
        next(reader, None)  # skip the headers
        for row in reader:
            person = row[2]
            text = row[3]
            likes = row[5]
            data.append((text, person, likes))

    try:
        classifier = joblib.load("model/%s_classifier.pkl" % classifier_param)
    except IOError:
        print "unable to load classifier: %s. Exiting program." % classifier_param
        sys.exit(1)

    with open("results/%s_%s" %(classifier_param, input_file.replace("/","-")), 'wb') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t', quotechar='\"', quoting=csv.QUOTE_ALL)
        writer.writerow(["label", "message", "author", "likes"])

        for message, author, likes in data:
            cleaned_message = util.clean_text(message, True)
            if len(cleaned_message.split(" ")) > 3:
                writer.writerow([util.classify_unknown(classifier, cleaned_message), message, author, likes])
Example #4
    def get_action(hit):
        address = clean_text(hit['_source']['address'])

        try:
            results = geocoding.get(address)
        except Exception as ex:
            print address, 'Error:', ex
            return batch.get_update_action(
                hit, {
                    'address': address,
                    'geocoding_data': None,
                    'location_error': 'NotFound'
                })

        n = len(results)
        doc = {}

        if n > 0:
            geo = results[0]
            doc = {
                'formatted_address': geo['formatted_address'],
                'location': {
                    'lat': geo['geometry']['location']['lat'],
                    'lon': geo['geometry']['location']['lng']
                }
            }

        if n != 1:
            doc['location_error'] = 'NotFound' if n == 0 else 'Ambiguous'

        doc['geocoding_data'] = results
        doc['address'] = address

        return batch.get_update_action(hit, doc)
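For reference, a hypothetical shape of the update document built above when exactly one geocoding result comes back (all values invented for illustration):

doc = {
    'formatted_address': '1600 Amphitheatre Pkwy, Mountain View, CA 94043, USA',
    'location': {'lat': 37.422, 'lon': -122.084},
    'geocoding_data': [{'formatted_address': '...', 'geometry': {'location': {'lat': 37.422, 'lng': -122.084}}}],
    'address': '1600 amphitheatre pkwy mountain view',
}
# With zero results the doc carries only 'location_error': 'NotFound' plus the raw
# 'geocoding_data' and 'address'; with several results the first one is used and
# 'location_error' is set to 'Ambiguous'.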
Example #5
    def parse_file(self, filename, with_topic=False):
        """
        Reads the text file and returns a dictionary in the form:
        tweet_id = (sentiment, text)
        :param with_topic:
        :param filename: the complete file name
        :return:
        """
        # print(filename)
        data = {}
        filename_print_friendly = filename.split("/")[-1].split("\\")[-1]

        if self.verbose:
            print("Parsing file:", filename_print_friendly, end=" ")
        for line_id, line in enumerate(
                open(filename, "r", encoding="utf-8").readlines()):

            try:
                columns = line.rstrip().split(self.SEPARATOR)
                tweet_id = columns[0]

                if with_topic:
                    topic = clean_text(columns[1])
                    if not isinstance(topic, str) or "None" in topic:
                        print(tweet_id, topic)
                    sentiment = columns[2]
                    text = clean_text(" ".join(columns[3:]))
                    if self.ekphrasis:
                        text = ' '.join(text_processor.pre_process_doc(text))
                    if text != "Not Available":
                        data[tweet_id] = (sentiment, (topic, text))
                else:
                    sentiment = columns[1]
                    text = clean_text(" ".join(columns[2:]))
                    if self.ekphrasis:
                        text = ' '.join(text_processor.pre_process_doc(text))
                    if text != "Not Available":
                        data[tweet_id] = (sentiment, text)

            except Exception:
                print("\nWrong format in line:{} in file:{}".format(
                    line_id, filename_print_friendly))
                raise

        if self.verbose:
            print("done!")
        return data
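A hypothetical illustration of the dictionary described in the docstring (IDs, sentiments and texts invented):

# with_topic=False
data = {
    "628949369883000832": ("negative", "the new update broke everything again"),
}
# with_topic=True: the value's second element becomes a (topic, text) pair
data = {
    "628949369883000832": ("negative", ("microsoft", "the new update broke everything again")),
}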
Example #6
def parse_line(line):
    iid, pid, tags, comment = line[:-1].split('~')
    comment = util.clean_text(comment)
    tags = tags.lower()
    tags_ix = tags.split("#*#")
    if len(tags_ix) == 0:
        return [], []
    comment_ix = split_to_words(comment)
    return tags_ix, comment_ix
Example #7
    def Paste(self):
        wx.TheClipboard.Open()
        try:
            text_data = wx.TextDataObject()
            if wx.TheClipboard.GetData(text_data):
                text = clean_text(text_data.GetText())
                self.ReplaceSelection(text)
        finally:
            wx.TheClipboard.Close()
Example #8
def generate_titles(my_title):
	my_title = util.clean_text(my_title)
	my_words = my_title.split(' ')
	print(' '.join((w.upper() if w in title_word_to_ix else w) for w in my_words) + '\n')
	my_title_ixs = [title_word_to_ix[w] for w in my_words if w in title_word_to_ix]
	my_title_sample = util.bag_of_words(my_title_ixs, title_dict_size)
	for i in range(10):
		print('  ' + word_ixs_to_str(pred_text(model, my_title_sample), False))
	print('')
Example #9
def create_model(pos_tweets, neg_tweets, neu_tweets, classifier_param='LinearSVC'):

    # filter away words that are less than 3 letters to form the training data
    tweets = []
    for (words, sentiment) in pos_tweets + neg_tweets + neu_tweets:
        words = util.clean_text(words, True)
        words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
        #words_filtered = [' '.join(w) for w in [ x for x in nltk.bigrams(words.split())]]
        tweets.append((words_filtered, sentiment))

    # make sure tweets are shuffled randomly
    shuffle(tweets)

    # get the training set and train the Classifier
    training_set = nltk.classify.util.apply_features(extract_features, tweets)

    max_specificity = -1
    best_classifier = None
    average_accuracy = 0.0

    # perform 10-fold cross validation
    cv = cross_validation.KFold(len(training_set), n_folds=10, shuffle=False, random_state=None)
    for traincv, testcv in cv:

        if classifier_param == "LinearSVC":
            classifier = SklearnClassifier(LinearSVC()).train(training_set[traincv[0]:traincv[len(traincv)-1]])
        elif classifier_param == "Tfid":
            # does TF-IDF weighting,
            # chooses the 1000 best features based on a chi2 statistic,
            # and then passes that into a multinomial naive Bayes classifier.
            pipeline = Pipeline([('tfidf', TfidfTransformer()), \
                                   ('chi2', SelectKBest(chi2, k=1000)), \
                                   ('nb', MultinomialNB())])
            classifier = SklearnClassifier(pipeline).train(training_set[traincv[0]:traincv[len(traincv)-1]])
        elif classifier_param == "Bernoulli":
            classifier = SklearnClassifier(BernoulliNB()).train(training_set[traincv[0]:traincv[len(traincv)-1]])
        elif classifier_param == "NaiveBayes":
            classifier = NaiveBayesClassifier.train(training_set[traincv[0]:traincv[len(traincv)-1]])
        else:
            print "Classifier option not available: ", classifier_param
            sys.exit(1)

        accuracy_of_classifier, specificity = \
            util.accuracy(classifier, tweets[testcv[0]:testcv[len(testcv)-1]])

        average_accuracy += accuracy_of_classifier
        if specificity > max_specificity:
            max_specificity = specificity
            best_classifier = classifier

    print "\naverage accuracy: ", average_accuracy/cv.n_folds

    # save the classifier
    joblib.dump(best_classifier, "model/%s_classifier.pkl" % classifier_param)

    print "saved classifier"
Example #10
def get_data(data_path, min_count, use_number_norm, embed_path):
    data_ans = json.load(open(data_path, encoding='utf-8'))
    data = replace_text(clean_text(data_ans), use_number_norm)
    new_data = rep_text(data)
    new_data = [(line[0], line[2]) for line in new_data]  # index 2 is the BIO tags, index 1 is BIEOS
    char2idx = build_char_vocab(new_data)
    vocab, word2idx, idx2word, label2index, index2label = build_vocab(
        new_data, min_count)
    pretrain_word_embedding, unk_words = build_pretrain_embedding(
        embedding_path=embed_path, word_index=word2idx)
    return new_data, pretrain_word_embedding, vocab, word2idx, idx2word, label2index, index2label, char2idx
Example #11
File: editor.py Project: shaurz/devo
    def WriteFile(self, path):
        def do_write_file(path, text):
            mkpath(os.path.dirname(path))
            atomic_write_file(path, text)

        text, self.file_encoding = encode_text(self.GetText(), self.file_encoding)
        text = clean_text(text)
        with self.env.updating_path(path):
            yield async_call(do_write_file, path, text)
        self.modified_externally = False
        self.SetSavePoint()
Example #12
File: render.py Project: tbug/csigerstop
    def on_get(self, req, resp):
        time_response_begin = time.time()

        is_thumbnail = bool(req.get_param("thumbnail"))

        if is_thumbnail:
            cache = self.thumbnail_cache
            layout = self.thumbnail_layout
        else:
            cache = self.image_cache
            layout = self.image_layout

        image_data = None

        # cache hack
        if req.get_header("If-None-Match") and req.get_header("Cache-Control") != "no-cache":
            resp.status = falcon.HTTP_304
            return

        text = clean_text(req.get_param("text", True))
        if len(text) > TEXT_MAX_LEN:
            resp.status = falcon.HTTP_403
            resp.body = "Text too long."
            return

        # if text has no newline, assume space is the newline
        if "\n" not in text:
            text = re.sub(r" +", "\n", text)
        text = re.sub(r"_+", " ", text)

        image_data = cache.get(text)
        cache_hit = image_data is not None
        if not cache_hit:
            time_render_begin = time.time()  # stats
            with layout.base_image.clone() as canvas:
                self.draw_text(layout, canvas, text)
                image_data = canvas.make_blob("png")
            cache.set(text, image_data)
            render_time = time.time() - time_render_begin  # stats
            self.latest_cache_miss_times.append(render_time)  # stats

        resp.set_header("Cache-Control", "public, max-age=3600")
        resp.set_header("Content-Type", "image/png")
        resp.set_header("ETag", hashlib.md5(text.encode("utf-8")).hexdigest())
        resp.body = image_data

        now = time.time()
        if cache_hit:
            cache_hit_time = time.time() - time_response_begin
        response_time = now - time_response_begin

        self.latest_response_times.append(response_time)
        if cache_hit:
            self.latest_cache_hit_times.append(cache_hit_time)
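A hypothetical client-side illustration of the caching behaviour above: the handler answers 304 to any request carrying an If-None-Match header (the value itself is not compared, hence the "# cache hack" comment) unless Cache-Control: no-cache is sent, and the ETag it hands out is the MD5 of the cleaned text.

import hashlib

text = "csiger\nstop"  # invented example text
etag = hashlib.md5(text.encode("utf-8")).hexdigest()
# Revalidation request that will be answered with 304 Not Modified:
headers = {"If-None-Match": etag}
# Forcing a fresh render despite the cached copy:
headers_no_cache = {"If-None-Match": etag, "Cache-Control": "no-cache"}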
Example #13
def write_tweet_info(tweet_object, options):
    debug_print("Writing tweet info")
    output_path = Path(options['output_dir']) / options['search_name'] / (options['search_name'] + '.txt')
    with output_path.open('a') as output_file:
        output_file.write('Tweet info:\n')
        output_file.write('\tID:{}\n'.format(tweet_object['id_str']))
        output_file.write('\tCreated at:{}\n'.format(tweet_object['created_at']))
        output_file.write('\tAuthor:{}\n'.format(tweet_object['user']['screen_name']))
        output_file.write('\tText:{}\n'.format(clean_text(tweet_object['text'])))
        try:
            output_file.write('\tURL:{}\n'.format(tweet_object['entities']['urls'][0]['expanded_url']))
        except Exception:
            pass
Example #14
    def clean(src):
        src['name'] = clean_text(src['name'])
        src['mission'] = deep_try_get(src, 'mission', 'gs_data.mission',
                                      'fb_data.mission')
        src['city'] = deep_try_get(src, 'city', 'source_data.city',
                                   'gs_data.city')
        src['state'] = deep_try_get(src, 'state', 'gs_data.state')
        src['link'] = deep_try_get(src, 'link', 'source_data.website',
                                   'gs_data.website')

        keywords = deep_try_get(src, 'keywords', 'gs_data.exchange.keywords')
        if keywords:
            src['keywords'] = clean_list(keywords.split(','))

        if 'areas' in src:
            src['areas'] = clean_list(src['areas'])

        value = deep_get(src, 'gs_data.geographic_areas_served')
        if value:
            deep_set(src, 'gs_data.geographic_areas_served', clean_list(value))

        value = deep_get(src, 'gs_data.organization_ntee_codes')
        if value:
            deep_set(src, 'gs_data.organization_ntee_codes', clean_list(value))

        value = deep_get(src, 'tw_data.created_at')
        if value:
            deep_set(src, 'tw_data.created_at', parse_date(value).isoformat())

        value = deep_get(src, 'tw_data.status.created_at')
        if value:
            deep_set(src, 'tw_data.last_update_at',
                     parse_date(value).isoformat())
            del src['tw_data']['status']['created_at']

        value = deep_get(src, 'fb_data.location')
        if value:
            deep_set(src, 'fb_data.address', value)
            del src['fb_data']['location']

        value = deep_get(src, 'fb_data.address.latitude')
        if value:
            deep_set(src, 'fb_data.location.lat', value)

        value = deep_get(src, 'fb_data.address.longitude')
        if value:
            deep_set(src, 'fb_data.location.lon', value)

        if 'isSiteDown' in src:
            src['is_site_down'] = src['isSiteDown']
            del src['isSiteDown']
Example #15
def predict_sentiment(texts):
    model = load_model()
    tokenizer = load_tokenizer()

    texts = [clean_text(text) for text in texts]

    texts = np.array(texts)

    sequences = tokenizer.texts_to_sequences(texts)

    data = keras.preprocessing.sequence.pad_sequences(sequences, maxlen=50)

    prediction = model.predict(data)
    keras.backend.clear_session()
    return prediction.tolist()
Example #16
def sentiment_analysis_result(input_review, *args, **kwargs):
    # TODO: return sentiment analysis on input tweet
    # 1: positive, 0: negative
    modelfilename, bowTransformefilename, TfidfTransformefilenam = args

    Model = pickle.load(open(modelfilename, "rb"))
    bow_transformer = pickle.load(open(bowTransformefilename, "rb"))
    tfidf_transformer = pickle.load(open(TfidfTransformefilenam, "rb"))

    input_review = clean_text(input_review)
    input_review = [input_review]  # convert it to a list because transform needs an iterable object

    messages_bow_test = bow_transformer.transform(input_review)
    messages_transformer_test = tfidf_transformer.transform(messages_bow_test)
    result = Model.predict(messages_transformer_test)
    return result[0]
Example #17
def embedding_route():
    response = {
        "text": None,
        "embeddings": None,
        "err": None
    }
    try:
        a = request.args.get("txt")
        x = clean_text(a)

        response['text'] = a
        response['embeddings'] = str(w2v.vector(x))

        return jsonify(response)
    except Exception as err:
        response['err'] = str(err)
        return jsonify(response)
Example #18
def scrape_webpage(root, dir_, format_, lang="", needs_check=False):
    """
    Navigates to input URL and parses HTML for relative links,
    scraping each page for its text
    """
    text_dict = dict()
    queue = ScrapeQueue(root)

    while len(queue) > 0:
        # Get html data from url on queue
        url = queue.dequeue()

        print(f"Scraping {url}...")
        r = requests.get(url)

        # Log bad request to console
        if not r.status_code == 200:
            print(f"Failed to scrape {url}.\n Status code: {r.status_code}\n")
            continue
        
        # Using lmxl parser and utf-8 to account for various charsets
        soup = BeautifulSoup(r.content, "lxml", from_encoding="utf-8")
        hrefs = yank_hrefs(root, url, soup.find_all("a")) # set instance
        if not hrefs:
            print(f"Error occured in scraping links from {url}.\n\
                    Check status of chromedriver executable and try again.")
            return -2

        # Add found links to queue
        for h in hrefs:
            queue.enqueue(h)

        # Add contents to text_dict
        title = soup.title.string if soup.title else url
        text = soup.get_text(separator=",")        
        if not lang: 
            lang = detect_lang(text) # Set lang

        text_dict[title] = clean_text(text) # Text passed as generator to lower mem load
        print()

    if needs_check:
        return check_spelling(text_dict)

    return save_scrapings(root, dir_, format_, text_dict, lang)
Example #19
def gposttl(utterance, identifier="no_id"):
    utterance = clean_text(utterance)
    p = subprocess.Popen(['gposttl', '--silent'], stdout=subprocess.PIPE,
                         stdin=subprocess.PIPE)
    (out, error) = p.communicate(utterance.encode('utf8'))
    if p.returncode == 0:
        if len(out) > 1:
            out = out.decode('utf8')
            tokens = [a.replace(u'\t', u'|')
                      for a in out.split(u'\n')
                      if len(a) > 0]
            return tokens
        else:
            raise Exception('Error: no text to parse')
    else:
        msg = str(error)
        code = str(p.returncode)
        sys.stderr.write(utterance+'\n\n')
        raise Exception(': '.join([identifier, code, msg]))
Example #20
def prediction(review):
    global classifier
    global _cv
    global _tfidf
    cleaned_review = util.clean_text(review)
    cleaned_review = [cleaned_review]
    vect = _cv.transform(cleaned_review).toarray()
    rev = _tfidf.transform(vect)

    rev_sparse_tensor = convert_sparse_matrix_to_sparse_tensor(rev)
    ordered_sparse_tensor = tf.compat.v1.sparse.reorder(rev_sparse_tensor)
    my_prediction = classifier.predict(ordered_sparse_tensor)
    predicted_value = my_prediction.item(0)
    print(predicted_value)

    if predicted_value >= 0.5:
        prediction_status = "Great! This is the Positive review."
        print("Hello1")
    elif predicted_value < 0.5:
        prediction_status = "Oops! This is the Negative review."
        print("Hello2")

    return prediction_status
Example #21
def read_data(file_path):
    with open(file_path, "r", encoding="utf8") as inFile:
        lines = inFile.readlines()

    datas = []
    for index, line in enumerate(tqdm(lines, desc="read_data")):
        # split the input line on \t
        pieces = line.strip().split("\t")

        # check that the line has the expected format
        #assert len(pieces) == 3
        if len(pieces) != 3:
            continue

        if(index == 0):
            continue

        pieces[1] = clean_text(os.path.join(config["root_dir"],"data"), pieces[1])

        id, sequence, label = pieces[0], pieces[1], int(pieces[2])
        datas.append((id, sequence, label))

    return datas
Example #22
def predict():
    params = json.loads(request.data.decode("utf-8"))
    inputs = params['inputs']
    random = params.get('random', False) in (True, 'true')
    temperature = float(params.get('temperature', 1.))
    top_k = int(params.get('topk', 1))
    number = int(params.get('number', 1))
    kind = params.get('kind', 'word')
    if kind == 'para':
        nb_para = number
        nb_sentence = -1
        nb_word = -1
    elif kind == 'sentence':
        nb_para = 1
        nb_sentence = number
        nb_word = -1
    else:
        nb_para = 1
        nb_sentence = -1
        nb_word = number
    config = {
        'inputs': inputs,
        'random': random,
        'temperature': temperature,
        'top_k': top_k,
        'nb_word': nb_word,
        'nb_sentence': nb_sentence,
        'nb_para': nb_para,
    }

    input_words = clean_text(inputs)
    if len(input_words) == 0:
        input_words.append("")
    y = rnn.predict(input_words, config)

    return json.dumps({'config': config, 'output': ' '.join(y)})
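A hypothetical request body for the endpoint above, using the field names the handler reads (all values invented):

payload = {
    "inputs": "once upon a time",
    "random": "true",       # also accepts a JSON boolean
    "temperature": 0.8,
    "topk": 5,
    "number": 2,
    "kind": "sentence",     # one of 'word', 'sentence', 'para'
}
# The response echoes the resolved config and returns the generated text under 'output'.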
Example #23
def de_duplicate(files, output):
    """
    Input a list of JL files, output a single processed JL file
    """
    items = {}
    punct_regex = re.compile('[%s]' % re.escape(string.punctuation))

    for pth in files:
        with open(pth, 'rb') as reader:
            lines_no = 0
            for line in reader:
                try:
                    item = json.loads(line)
                except ValueError:
                    continue
                lines_no += 1
                # Fix and normalize text
                item['text'] = clean_text(item['text'])
                # Lower and strip punctuation
                txt = punct_regex.sub('', item['text'].lower())
                # Drop extra spaces
                txt = ' '.join(txt.split())
                if txt in items:
                    # Merge existing tags
                    item['tags'].extend(items[txt]['tags'])
                    item['tags'] = sorted(set(t.lower() for t in item['tags']))
                else:
                    # Normalize tags
                    item['tags'] = sorted(t.lower() for t in item['tags'])
                # The dict KEY will automatically overwrite duplicates
                items[txt] = item
            print(f'Read {lines_no} items from "{pth}".')

    out_items = sorted(items.values(), key=lambda x: x['author'].lower())
    print(f'Written {len(out_items)} items in "{output}".')
    json.dump(out_items, open(output, 'w'))
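The deduplication key used above, shown in isolation on an invented string: lowercase, strip punctuation, collapse whitespace.

import re
import string

punct_regex = re.compile('[%s]' % re.escape(string.punctuation))
txt = punct_regex.sub('', "Hello,   World!!".lower())
print(' '.join(txt.split()))  # hello world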
Example #24
def GetPredictionOnEvalSet(modelfilename,tokenizer):
    global q_max_words,p_max_words,emb_dim
    max_para_length = 362
    max_query_length = 38
    model = load_model('./trained_model/'+modelfilename+'.h5')
    #f = open(testfile,'r',encoding="utf-8")
    all_scores={} # Dictionary with key = query_id and value = array of scores for respective passages
    queryE, paraE= util.loadEvalData()
    if tokenizer==None:
        with open('./data/tokenizer-Stemed2Lqueries.pickle', 'rb') as handle:
            tokenizer=pickle.load(handle)
    keys = list(queryE.keys())
    print(len(keys))
    for i in range(len(keys)):
        Xquery, Xpara = list(), list()
        query_id = keys[i]
        # retrieve query
        query = queryE[query_id]
        query = util.clean_text([query])
        # retrieve text input
        paralist = paraE[query_id]
        cleanparalist = list()
        for j in range(len(paralist)):
            cleanparalist.append(util.clean_text([paralist[j]]))
        # generate input-output pairs
        query_seq = tokenizer.texts_to_sequences([query])[0]
        para_seq = tokenizer.texts_to_sequences(cleanparalist)
        # split one sequence into multiple X,y pairs
        padded_query_seq = pad_sequences([query_seq], maxlen=max_query_length)[0]
        #print(padded_query_seq)
        padded_para_seq = pad_sequences(para_seq, maxlen=max_para_length)
        for k in range(len(paralist)):
            Xquery.append(padded_query_seq)
            Xpara.append(padded_para_seq[k])

        score = model.predict([array(Xquery), array(Xpara)],verbose=0) # do forward-prop on model to get score
        score=score[:,1] # extract 1 column at index 1
        if(query_id in all_scores):
            all_scores[query_id].append(score)
        else:
            all_scores[query_id] = [score]
        text = "\r{0} {1}".format("Done queries: ", i)
        sys.stdout.write(text)
        sys.stdout.flush()


    fw = open("answer.tsv","w",encoding="utf-8")
    for query_id in all_scores:
        scores = all_scores[query_id]
        s=""
        for sc in scores:
            for value in sc:
                value=format(value,'f')#.replace("\n","").replace("[","").replace("]","")
                s=s+ str(value) + "\t"
        s=re.sub(' {2,}', ' ', s)
        s=re.sub(' ', '\t', s)
        fw.write(str(query_id) + "\t" +s.rstrip("\t") +  "\n")
        '''
        scores_str = [str(sc) for sc in scores] # convert all scores to string values
        
        scores_str = "\t".join(scores_str) # join all scores in list to make it one string with  tab delimiter.
        re.sub("[|]", "", scores_str)
        scores_str.replace(" ","\t")
        fw.write(str(query_id)+"\t"+ scores_str + "\n")
        '''
    fw.close()
Example #25
    def get_action(hit):
        address = hit['_source']['address']
        m = re.search(r'.*[\r\n\t\s]+([\w\W]+?)(tel|EIN):', address, re.M)
        address = clean_text(m.group(1))
        return batch.get_update_action(hit, {'address': address})
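To unpack the regular expression above, here is what it captures on an invented address string: the leading line is skipped, then everything up to the first "tel:" or "EIN:" marker is kept and handed to clean_text.

import re

address = "Acme Foundation\n 123 Main St, Springfield tel: 555-0100"
m = re.search(r'.*[\r\n\t\s]+([\w\W]+?)(tel|EIN):', address, re.M)
print(repr(m.group(1)))  # '123 Main St, Springfield '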
Example #26
    def test_clean_text(self):
        text = 'æœìíîïýÿòóôõöáâãäëñûüx2,X2'
        cleaned_text = util.clean_text(text)

        true_text = ['aeoeiiiiyyoooooaaaaenuu', ',']
        self.assertEqual(cleaned_text, true_text)
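The test above implies util.clean_text transliterates accented characters to ASCII and splits the result into tokens. A minimal sketch of the accent-folding part only, using unicodedata plus explicit handling of the æ/œ ligatures; this is an assumption about the behaviour, not the original implementation.

import unicodedata

def fold_accents(text):
    # NFKD does not decompose these ligatures, so map them first,
    # then drop the combining marks left over from the decomposition.
    text = text.replace('æ', 'ae').replace('œ', 'oe')
    decomposed = unicodedata.normalize('NFKD', text)
    return ''.join(c for c in decomposed if not unicodedata.combining(c))

print(fold_accents('æœìíîïýÿòóôõö'))  # aeoeiiiiyyooooo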
Example #27
from sklearn.decomposition import PCA
import seaborn as sns

sns.set()
sns.set_context('poster')
nltk.download('state_union')
nltk.download('gutenberg')

# Set current corpus (state_union, gutenberg)
corpus = gutenberg

textnames = corpus.fileids()
corpusnames = {gutenberg : 'Gutenberg books', state_union : 'State union documents'}

print("cleaning texts..")
clean_texts = {name : clean_text(corpus.raw(name)) for name in tqdm(textnames)}
print("Calculating frequencies..")
freqs = {name : nltk.FreqDist(text) for name, text in tqdm(clean_texts.items())}

# Create complete vocabulary
wordlist = set()
for freq in freqs.values():
    wordlist.update(freq.keys())
vocabulary = {word : idx for (idx, word) in enumerate(wordlist)}

# Convert texts to wordcount vectors
word_vectors = {name : np.zeros(len(wordlist,)) for name in textnames}
for textname, vector in word_vectors.items():
    for word, freq in freqs[textname].items():
        vector[vocabulary[word]] += freq
Example #28
File: sandbox.py Project: BelkacemB/nltk
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import nltk

import util
# Doesn't work very well in French ...
LANGUAGE = "english"

text_file = util.get_text(LANGUAGE)

text_str = text_file.read()
text_str = nltk.word_tokenize(text_str, language=LANGUAGE)

text_str = util.clean_text(text_str, LANGUAGE)

text_tag = nltk.pos_tag(text_str)

nltk_text = nltk.Text(text_str)

print type(nltk_text)

# CHUNKING

sentence = nltk.word_tokenize("Bouteflika is the president of Algeria.")
sentence = nltk.pos_tag(sentence)

# grammar = "Actor: {<DT>?<JJS>*<NNP>+}" # JJ: adjective
# chunk= nltk.RegexpParser(grammar)

# result = chunk.parse(text_tag)
Example #29
       Proceedings of the ACM SIGKDD International Conference on Knowledge 
       Discovery and Data Mining (KDD-2004), Aug 22-25, 2004, Seattle, 
       Washington, USA, 
"""

import util

negative_stop_words = util.get_file_content_as_list(
    "resources/negative-stop-words.txt")
neutral_stop_words = util.get_file_content_as_list(
    "resources/neutral-stop-words.txt")
positive_stop_words = util.get_file_content_as_list(
    "resources/positive-stop-words.txt")

negative_words = util.get_file_content_as_list("resources/negative-words.txt")
positive_words = util.get_file_content_as_list("resources/positive-words.txt")

original_text = util.format_into_string(
    util.get_file_content_as_list("resources/text.txt"))
text = util.clean_text(util.get_file_content_as_list("resources/text.txt"),
                       neutral_stop_words)

sentiment = util.analyze_text_sentiment(text, negative_stop_words,
                                        positive_stop_words, negative_words,
                                        positive_words)

print("The original text was: " + original_text)
print("\nThe sentiment score was: " + str(sentiment))
print("This corresponds to a " + util.mood(sentiment) + " mood ... " +
      util.emoji(sentiment) + "\n")
Example #30
    "--nb_para",
    default=1,
    type=int,
    help="How many paragraphs should it return (default: %(default)s, -1: no limit)")
# parser.add_argument('--use_server', nargs="?", const=True, default=False, type=bool, help='Should use the Server architecture')
args = parser.parse_args()

results_dir = dir + '/results'
rnn_dir = dir + '/' + args.model_dir

config = vars(args)
config['log_dir'] = rnn_dir
config['restore_embedding'] = False
config['seq_length'] = None
input_words = clean_text(config['inputs'])
# if args.use_server is True:
#     with open('clusterSpec.json') as f:
#         clusterSpec = json.load(f)
#     config['target'] = 'grpc://' + clusterSpec['server'][0]
#     pass

rnn = RNN(config)
y = rnn.predict(input_words, config)
print('__BBB_START__')  # Marker for the Regexp used in the App, do not remove
json = json.dumps({
    'config': {
        'inputs': args.inputs,
        'random': args.random,
        'temperature': args.temperature,
        'top_k': args.top_k,