Python tokenize примеры, utils.tokenize Python примеры использования

Пример #1

0

Показать файл

Файл: 0__glove_predict.py Проект: Sirorezka/a-l-l-e-n-_-m-a-s-t-_-r

def predict_answers(data, word2vec, N):
    """Pick, for every row, the answer whose word-vector sum best matches the question.

    The vectors of all known, non-stop-word tokens of a text are summed and
    the sum is divided by its norm. Each answer is scored by the dot product
    of its vector with the question vector; the best-scoring letter wins.

    Args:
        data: table indexable as ``data[column][i]`` with the columns
            ``question`` and ``answerA`` .. ``answerD``; ``data.shape[0]``
            is the number of rows.
        word2vec: container of known words (tested with ``in`` on the
            lower-cased token) that is handed to ``getword2vecval``.
        N: length of the word vectors.

    Returns:
        ``(pred_answs, pred_probs)``: the predicted letters, and the per-row
        score arrays preceded by a ``["A", "B", "C", "D"]`` header row.
    """
    # A set makes the per-token stop-word test O(1) instead of a list scan.
    stop = set(stopwords.words('english'))
    labels = ["A", "B", "C", "D"]

    def embed(text):
        """Return the normalised sum of the word vectors found in *text*."""
        vec = np.zeros(N, dtype=float)
        for w in tokenize(text):
            word = w.lower()
            if word in word2vec and word not in stop:
                vec = np.add(vec, getword2vecval(N, word, word2vec))
        norm = linalg.norm(vec)
        # A text without any known word has norm 0. Dividing would produce
        # NaN scores, and argmax would then select exactly that answer, so
        # the zero vector (score 0) is kept instead.
        return vec / norm if norm > 0 else vec

    pred_answs = []
    pred_probs = [list(labels)]
    for i in range(data.shape[0]):
        q_vec = embed(data['question'][i])
        answer_vecs = [embed(data['answer' + label][i]) for label in labels]

        # choose the answer based on the dot product with the question vector
        probs = np.concatenate(answer_vecs).reshape(4, N).dot(q_vec)
        pred_answs.append(labels[probs.argmax()])
        pred_probs.append(probs)

    return pred_answs, pred_probs

Пример #2

0

Показать файл

Файл: glove_predict.py Проект: johnkorn/kaggle_allen

def get_glove_features(data, word2vec, N):
    """Score the four answers of every row against the question.

    The vectors of all known, non-stop-word tokens of a text are summed and
    the sum is divided by its norm. The score of an answer is the dot product
    of its vector with the question vector.

    Args:
        data: table indexable as ``data[column][i]`` with the columns
            ``question`` and ``answerA`` .. ``answerD``; ``data.shape[0]``
            is the number of rows.
        word2vec: mapping from lower-cased word to its vector.
        N: length of the word vectors.

    Returns:
        A list with one array of four scores (A, B, C, D) per row.
    """
    # A set makes the per-token stop-word test O(1) instead of a list scan.
    stop = set(stopwords.words('english'))
    labels = ("A", "B", "C", "D")

    def embed(text):
        """Return the normalised sum of the word vectors found in *text*."""
        vec = np.zeros(N)
        for w in tokenize(text):
            word = w.lower()
            if word in word2vec and word not in stop:
                vec += word2vec[word]
        norm = linalg.norm(vec)
        # A text without any known word has norm 0; keep the zero vector
        # (score 0) rather than dividing and filling the scores with NaN.
        return vec / norm if norm > 0 else vec

    scores = []
    for i in range(data.shape[0]):
        q_vec = embed(data['question'][i])
        answer_vecs = [embed(data['answer' + label][i]) for label in labels]
        scores.append(np.concatenate(answer_vecs).reshape(4, N).dot(q_vec))

    return scores

Пример #3

0

Показать файл

Файл: lesk.py Проект: finiteautomata/wisdom

def __build_dictionary(synset, hyperhypo):
    """Collect the lower-cased, stop-word-free words describing *synset*.

    Args:
        synset: sense object exposing ``definition``, ``lemma_names``,
            ``hypernyms()`` and ``hyponyms()``.
        hyperhypo: when true, the definitions and lemma names of the
            hypernyms and hyponyms are included as well.

    Returns:
        A list of lower-cased words (duplicates are kept).
    """
    lesk_dictionary = []
    # Includes definition.
    lesk_dictionary += tokenize(synset.definition)
    # Includes lemma_names.
    lesk_dictionary += synset.lemma_names
    # Optional: includes lemma_names of hypernyms and hyponyms.
    if hyperhypo:
        for related_sense in synset.hypernyms() + synset.hyponyms():
            lesk_dictionary += tokenize(related_sense.definition)
            lesk_dictionary += [lemma.name for lemma in related_sense.lemmas]

    # The stop-word test is applied to the word as written and to its
    # lower-cased form, so a capitalised stop word (e.g. at the start of a
    # definition) no longer survives the filter. A real list is returned so
    # the result can be indexed and re-iterated on Python 3 too.
    return [
        word.lower()
        for word in lesk_dictionary
        if word not in english_stopwords
        and word.lower() not in english_stopwords
    ]

Пример #4

0

Показать файл

Файл: execute.py Проект: codekansas/citation-generator

def generate_citations(lines, vocab, index):
    """Print every long enough sentence of the first 100 lines with a citation.

    Each line is tokenized twice (plain and capitalized form). A '.' token
    ends the current sentence; a sentence of more than 10 tokens is printed
    in its capitalized form followed by the citation generated from the
    vocabulary indices of its known words. Tokens after the last '.' of a
    line are discarded.
    """
    word2idx = {word: position for position, word in enumerate(vocab)}
    for line in lines[:100]:
        sentence_tokens = []
        sentence_display = []
        plain = utils.tokenize(line, periods=True)
        cased = utils.tokenize(line, periods=True, capitalized=True)
        for word, cap in zip(plain, cased):
            if word != '.':
                sentence_tokens.append(word)
                sentence_display.append(cap)
                continue

            # end of sentence: emit it when long enough, then start afresh
            if len(sentence_tokens) > 10:
                known_ids = [word2idx[w] for w in sentence_tokens if w in word2idx]
                citation = generate_citation(known_ids, index)
                print(' '.join(sentence_display) + ' (%s).' % citation)
            sentence_tokens = []
            sentence_display = []

Пример #5

0

Показать файл

Файл: doc2vecpredict.py Проект: Evanc123/allen_ai

def predict_segmented_tf_idf(data, docs_per_q, ids_and_categories):
    """Predict an answer letter per question using per-category TF-IDF data.

    For every row the TF-IDF tables of the question's category are loaded
    (once per category), the documents returned by
    ``utils.get_docs_importance_for_question`` are visited, and each answer
    is scored by summing ``tf * idf`` of its words over those documents.

    Args:
        data: object whose ``iterrows()`` yields ``(index, row)`` pairs; a row
            provides ``id``, ``question`` and ``answerA`` .. ``answerD``.
        docs_per_q: passed through to
            ``utils.get_docs_importance_for_question``.
        ids_and_categories: mapping from ``str(row['id'])`` to the category
            name used as sub-directory of ``wiki_docs_dir``.

    Returns:
        A list with one of 'A', 'B', 'C', 'D' per row (the first letter wins
        on ties, including the case where no document is returned).
    """
    labels = ['A', 'B', 'C', 'D']
    res = []
    category_tf_idfs = {}  # category -> (docs_tf, words_idf), filled lazily
    for index, row in data.iterrows():
        current_id = str(row['id'])
        print(current_id)
        current_category = ids_and_categories[current_id]

        if category_tf_idfs.get(current_category) is None:
            category_tf_idfs[current_category] = utils.get_docstf_idf(
                wiki_docs_dir + '/%s' % current_category)

        docs_tf, words_idf = category_tf_idfs[current_category]

        # words of each answer, in the order A, B, C, D
        answer_words = [set(utils.tokenize(row['answer' + label])) for label in labels]
        scores = [0, 0, 0, 0]

        ranked = utils.get_docs_importance_for_question(
            row['question'], docs_tf, words_idf, docs_per_q)
        # Only the first element of every ranked entry (the document key) is
        # needed. Iterating directly works on Python 2 and 3 and copes with
        # an empty ranking, where zip(*ranked)[0] raised IndexError.
        for entry in ranked:
            doc_tf = docs_tf[entry[0]]
            for k, words in enumerate(answer_words):
                for w in words:
                    if w in doc_tf:
                        # term count in the document times the word's idf
                        scores[k] += 1. * doc_tf[w] * words_idf[w]

        res.append(labels[np.argmax(scores)])

    return res

Пример #6

0

Показать файл

Файл: cli.py Проект: NYTimes/ingredient-phrase-tagger

    def matchUp(self, token, ingredientRow):
        """
        Returns our best guess of the match between the tags and the
        words from the display text.

        This problem is difficult for the following reasons:
            * not all the words in the display name have associated tags
            * the quantity field is stored as a number, but it appears
              as a string in the display name
            * the comment is often a compilation of different comments in
              the display name

        The result is a list with the upper-cased key of every field that
        matches the token: once per matching word for string fields, once
        for a non-string field equal to the token parsed as a number.
        """
        ret = []

        # strip parens from the token, since they often appear in the
        # display_name, but are removed from the comment.
        token = utils.normalizeToken(token)
        decimalToken = self.parseNumbers(token)

        for key, val in ingredientRow.iteritems():
            if isinstance(val, basestring):

                for vt in utils.tokenize(val):
                    if utils.normalizeToken(vt) == token:
                        ret.append(key.upper())

            elif decimalToken is not None:
                try:
                    if val == decimalToken:
                        ret.append(key.upper())
                except Exception:
                    # Best effort: a value that cannot be compared with the
                    # number is treated as "no match". Catching Exception
                    # (not everything) lets KeyboardInterrupt/SystemExit
                    # propagate.
                    pass

        return ret

Пример #7

0

Показать файл

Файл: TestTokenizer.py Проект: engina/jn-cpu

 def testTokens(self):
     """Check text and type of each of the 11 tokens produced for self.str3."""
     tokens = utils.tokenize(self.str3)
     self.assertEqual(11, len(tokens))

     # (expected type, expected text) for every token, in order
     expected = [
         (utils.TOKEN_NORMAL, '\n  two empty spaces and some escaped chars \\\"\\\' in normal textfollowed by a '),
         (utils.TOKEN_DBL_Q, '"dbl quote"'),
         (utils.TOKEN_NORMAL, ' and then a '),
         (utils.TOKEN_SNG_Q, "'single quote'"),
         (utils.TOKEN_NORMAL, '\nwait there is more!! '),
         (utils.TOKEN_DBL_Q, '"\'signle quotes\' inside a double quote"'),
         (utils.TOKEN_NORMAL, ' and '),
         (utils.TOKEN_SNG_Q, '\'"double quotes" inside a single quote\''),
         (utils.TOKEN_NORMAL, '\nwait! there\\\'s more!! '),
         (utils.TOKEN_DBL_Q, '"escaped double quotes \\" and escaped single quotes\\\' "'),
         (utils.TOKEN_NORMAL, ' '),
     ]

     # all token texts are verified first, then all token types
     for position, (_, text) in enumerate(expected):
         self.assertEqual(text, tokens[position]['token'])
     for position, (kind, _) in enumerate(expected):
         self.assertEqual(kind, tokens[position]['type'])

Пример #8

0

Показать файл

Файл: key_word_extractor.py Проект: subhasis256/ml_code_completion

def FrequentWords(data_dirs, suffixes, max_key_words):
  """
  Counts the tokens of all files matching data_dirs/suffixes.

  Returns a list of at most max_key_words (token, count) pairs, most
  frequent token first. Empty tokens are ignored. A progress message is
  printed after every 5000 files.
  """
  matches = matchingFiles(data_dirs, suffixes)

  token_count = Counter()
  files_done = 0
  for file_name in matches:
    # A Counter treats unseen keys as 0, so tokens can be added directly;
    # no try/except fallback is needed.
    token_count.update(
        token for token in tokenize(file_name) if len(token) != 0)
    files_done += 1
    if files_done % 5000 == 0:
      print("Completed parsing %d files ..." % files_done)

  return token_count.most_common(max_key_words)

Пример #9

0

Показать файл

Файл: taggers.py Проект: attardi/nlpnet

    def tag(self, text=None):
        """
        Tags the given text.

        When no text is given, tokens are read from stdin instead: one token
        per line (only the first whitespace-separated field is used), with a
        blank line ending each sentence.

        :param text: a string or unicode object. Strings assumed to be utf-8
        :returns: a list of lists (sentences with tokens).
            Each sentence has (token, tag) tuples.
        """
        result = []
        if text:
            for sent in utils.tokenize(text, clean=False):
                tags = self.tag_tokens(sent)
                result.append(list(zip(sent, tags)))
            return result

        # read tsv from stdin
        sent = []
        for line in sys.stdin:
            # byte lines (Python 2 stdin) are decoded; text lines are used as-is
            if isinstance(line, bytes):
                line = line.decode('utf-8')
            line = line.strip()
            if line:
                sent.append(line.split()[0])
            else:
                tags = self.tag_tokens(sent)
                result.append(list(zip(sent, tags)))
                sent = []

        # Input that does not end with a blank line still carries a pending
        # sentence; tag it instead of silently dropping it.
        if sent:
            tags = self.tag_tokens(sent)
            result.append(list(zip(sent, tags)))

        return result

Пример #10

0

Показать файл

Файл: rlogin.py Проект: tegola-hubs/dendria

    def bird_info(self):
        """Return daemon name, version and OSPF neighbours reported by birdc.

        The first two space-separated words of birdc's first output line
        (with " ready." removed) are used as daemon name and version. Every
        neighbour line after the first two lines of the
        "show ospf neighbors" output yields a dict with "routerid", "ifname"
        and "v4addr".

        :returns: dict with the keys "daemon", "version" and "ospf"
            (``{"neighbours": [...]}``).
        """
        birdv = self.machine.run("echo | birdc | head -1").strip().replace(" ready.", "")
        birdv = birdv.split(" ")
        info = {
            "daemon":  birdv[0],
            "version": birdv[1],
            "ospf": {}
            }

        log.info("[%s] getting OSPF neighbours" % self.hostname())
        output = self.machine.run("echo show ospf neighbors | birdc | sed '/^bird[^ ] .*/d'")
        neighbours = []
        for line in splitlines(output)[2:]:
            toks = tokenize(line)
            # Blank or truncated lines do not have the six fields read
            # below; skip them instead of failing with IndexError.
            if len(toks) < 6:
                continue
            neighbour = {
                "routerid": toks[0]
                }
            # Fields 4 and 5 hold interface name and address in either
            # order; the one starting with a letter is the interface name.
            if toks[4][0] in ascii_letters:
                neighbour["ifname"] = toks[4]
                neighbour["v4addr"] = toks[5]
            else:
                neighbour["v4addr"] = toks[4]
                neighbour["ifname"] = toks[5]
            neighbours.append(neighbour)
        info["ospf"]["neighbours"] = neighbours
        return info

Пример #11

0

Показать файл

Файл: cli.py Проект: NYTimes/ingredient-phrase-tagger

    def generate_data(self, count, offset):
        """
        Prints training data in the CRF++ format for the ingredient
        tagging task.

        Rows ``offset`` .. ``offset + count - 1`` of the CSV file at
        ``self.opts.data_path`` are processed. One line is printed per token
        of a row's "input" text (token, features, best tag) and an empty line
        follows every row.
        """
        frame = pd.read_csv(self.opts.data_path).fillna("")

        first = int(offset)
        last = first + int(count)

        for index, row in frame.iloc[first:last].iterrows():
            try:
                # extract the display name
                display_input = utils.cleanUnicodeFractions(row["input"])
                tokens = utils.tokenize(display_input)
                del row["input"]

                tagged = [(t, self.matchUp(t, row)) for t in tokens]
                rowData = self.addPrefixes(tagged)

                # token positions passed to getFeatures are 1-based
                for position, (token, tags) in enumerate(rowData, 1):
                    features = utils.getFeatures(token, position, tokens)
                    print(utils.joinLine([token] + features + [self.bestTag(tags)]))

            # ToDo: deal with this
            except UnicodeDecodeError:
                pass

            # an empty line separates the rows
            print("")

Пример #12

0

Показать файл

Файл: jc_model.py Проект: Lonesome-George/nlp_project1

 def classify_proba(self, text):
     """Return the first row of the classifier's predict_proba output for *text*.

     The text is tokenized, stop words from self.stopset are removed, word
     frequencies are weighted by tfidf() with self.idf_dict, and the result
     is vectorized with self.vectorizer before classification.
     """
     tokens = del_stopwords(tokenize(text), self.stopset)
     weighted = tfidf(stat_wordfreq(tokens), self.idf_dict)
     matrix = self.vectorizer.transform(weighted).toarray()
     return self.classifier.predict_proba(matrix)[0]

Пример #13

0

Показать файл

Файл: rlogin.py Проект: tegola-hubs/dendria

 def macaddr(self, iface):
     """Return the upper-cased MAC address of *iface*, or None.

     None is returned when the "link/ether" line is missing from the
     "ip link show" output or when the address consists only of zeros.
     """
     output = self.machine.run("ip link show dev %s | grep link/ether" % iface).strip()
     if output:
         # the address is the second token of the link/ether line
         mac = tokenize(output)[1].upper()
         # nothing left after removing "0" and ":" means an all-zero address
         if mac.replace("0", "").replace(":", ""):
             return mac
     return None

Пример #14

0

Показать файл

Файл: similar_articles.py Проект: fchantrel/habeascorpus

def find_similar_articles(corpus_name, method, content, data_dir=os.getcwd(), index=None):

    """
    Return the 5 articles of corpus_name that are closest to the given content.

    - corpus_name : name of the corpus being used (.tsv file without the
        .tsv extension)

    - method : ldan (n = number of topics), lsin or tfidf

    - content : a text

    - data_dir : directory holding the corpus, index, dictionary and model
        files (note: the default is evaluated once, when the function is
        defined)

    - index : an already loaded similarity index; loaded from disk when not
        given

    The result is a JSON string: a list of {'id': ..., 'score': ...} objects,
    best match first.

    Raises IOError when the corpus, index or dictionary file cannot be
    loaded, and ValueError when method is none of the supported kinds.

    """

    corpus_file = os.path.join(data_dir, corpus_name + '_' + method + '.mm')
    index_file = os.path.join(data_dir, corpus_name + '_' + method + '_index')
    docid_file = os.path.join(data_dir, corpus_name + '_docid.txt')

    # Load the corpus (the object is not used below; loading it verifies
    # that the corpus file can be read)
    try:
        corpus = corpora.mmcorpus.MmCorpus(corpus_file)
    except Exception:
        raise IOError('Impossible de charger le fichier %s. Avez-vous bien appliqué le script corpus_to_matrix.py ?' % (corpus_file))

    # Load the index file, unless an index was passed as argument
    if not index:
        try:
            index = similarities.docsim.Similarity.load(index_file)
        except Exception:
            # the file that failed to load comes first, then the script name
            raise IOError("""Impossible de charger le fichier %s. Avez-vous bien appliqué le script %s avec l'option --saveindex ?""" % (index_file, method))

    dico_file = os.path.join(data_dir, corpus_name + '_wordids.txt')

    # Load the dictionary
    try:
        id2word = corpora.dictionary.Dictionary.load_from_text(dico_file)
    except Exception:
        raise IOError("Impossible de charger le fichier %s" % (dico_file))

    # Load the model matching the method requested by the user. The file name
    # is built from the `method` argument (the original referred to an
    # undefined `args.method`).
    if method == 'tfidf':
        model_file = os.path.join(data_dir, corpus_name + '_tfidf_model')
        model = models.tfidfmodel.TfidfModel.load(model_file)

    elif method.startswith('lsi'):
        model_file = os.path.join(data_dir, corpus_name + '_' + method + '_model')
        model = models.lsimodel.LsiModel.load(model_file)

    elif method.startswith('lda'):
        model_file = os.path.join(data_dir, corpus_name + '_' + method + '_model')
        model = models.ldamodel.LdaModel.load(model_file)

    else:
        # without this branch `model` would be unbound below
        raise ValueError('Méthode inconnue : %s' % (method))

    tokens = model[id2word.doc2bow(utils.tokenize(content))]

    # Return the 5 closest articles
    sims = index[tokens]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    return json.dumps([{'id': utils.get_article_by_corpus_number(x[0], docid_file), 'score': round(x[1], 2)} for x in sims[:5]])

Пример #15

0

Показать файл

Файл: extract_features.py Проект: Lonesome-George/nlp_project1

def word_freq(filenames, stopset):
    """Collect the vocabulary and per-document word counts of labelled texts.

    Every line of every file is decoded as UTF-8 and split by ``proc_line``
    into id, label and text; the text is tokenized and stop words are
    removed. Documents labelled '1' (positive) and '-1' (negative) are
    recorded; for any other label a notice is printed and the document is
    not recorded (its words still enter the vocabulary).

    When at least one file was processed, the vocabulary is written to
    ./Training/WordSet.txt and the raw counts to
    ./Training/WordFreq_Orig.txt (once, after all files were read, instead
    of being rewritten after every input file).

    Args:
        filenames: paths of the input files.
        stopset: stop words handed to ``del_stopwords``.

    Returns:
        ``(wordset, freqset_list)``: the set of all words, and a two-element
        list holding the negative (index 0) and positive (index 1) documents,
        each document being ``[id, label, {word: count}]``.
    """
    wordset = set()           # every word seen
    freqset_list = [[], []]   # word counts of negative / positive texts
    processed_any = False
    for filename in filenames:
        processed_any = True
        # the with-block closes the input file even if a line fails to parse
        with open(filename, 'r') as fr:
            for raw_line in fr:
                line = raw_line.decode("utf-8")
                doc_id, label, text = proc_line(line)
                token_list = del_stopwords(tokenize(text), stopset)
                wordfreq_dict = {}
                for token in token_list:
                    wordset.add(token)
                    wordfreq_dict[token] = wordfreq_dict.get(token, 0) + 1
                # each document is recorded as id, label and word counts
                doc = [doc_id, label, wordfreq_dict]
                if label == '1':
                    freqset_list[1].append(doc)
                elif label == '-1':
                    freqset_list[0].append(doc)
                else:
                    print('tag-unknown text')

    if not processed_any:
        # nothing was read, so no output file is created
        return wordset, freqset_list

    # save the vocabulary
    with open('./Training/WordSet.txt', 'w') as f:
        for word in wordset:
            f.write((word + '\n').encode("utf-8"))

    # save the raw word counts, negative documents first
    with open('./Training/WordFreq_Orig.txt', 'w') as f:
        for freqset in freqset_list[0] + freqset_list[1]:
            doc_id, label, freq_list = freqset
            fields = ''.join(
                word + ',' + str(freq_list[word]) + ';' for word in freq_list)
            f.write((doc_id + '\t' + label + '\t' + fields + '\n').encode('utf-8'))

    return wordset, freqset_list

Пример #16

0

Показать файл

Файл: rlogin.py Проект: tegola-hubs/dendria

 def v4addr(self, iface):
     """Return the IPv4 addresses of *iface* listed by "ip addr show".

     The address is the second token of every "inet" line; an address
     without a prefix length gets "/32" appended.
     """
     output = self.machine.run("ip addr show dev %s | grep '^ *inet '" % iface).strip()
     token_rows = [tokenize(l) for l in splitlines(output)]

     addresses = []
     for toks in token_rows:
         if len(toks) == 0:
             continue
         addr = toks[1].strip()
         addresses.append(addr if "/" in addr else addr + "/32")
     return addresses

Пример #17

0

Показать файл

Файл: export.py Проект: donvel/affiliations

def find_word_freq(li):
    """Count the normalized (case-preserving) tokens of all elements in *li*.

    Returns a defaultdict(int) mapping every token to its number of
    occurrences.
    """
    freq = defaultdict(int)
    for aff in li:
        for t in tokenize(text_in_element(aff), split_alphanum=split_alphanum):
            freq[normalize(t, lowercase=False)] += 1
    return freq

Пример #18

0

Показать файл

Файл: export.py Проект: donvel/affiliations

def dict_from_file(filename, match_case=True):
    """Index the tokens of every line of a UTF-8 dictionary file.

    Each line of DICTS_DIR + filename is normalized (lower-cased unless
    match_case is true) and tokenized. Every token is mapped to the list of
    (tokens of the line, position of the token in the line) pairs where it
    occurs.

    Returns the tuple (token index, match_case).
    """
    lowercase = not match_case
    occurrences = defaultdict(list)
    with codecs.open(DICTS_DIR + filename, 'rb', encoding='utf8') as f:
        for line in f:
            tokens = tokenize(normalize(line, lowercase=lowercase),
                              split_alphanum=split_alphanum)
            for position, token in enumerate(tokens):
                occurrences[token].append((tokens, position))
    return (occurrences, match_case)

Пример #19

0

Показать файл

Файл: glove_predict.py Проект: 5vision/kaggle_allen

def predict_answers(data, word2vec, N):
    """Pick, for every row, the answer whose word-vector sum best matches the question.

    The vectors of all known, non-stop-word tokens of a text are summed and
    the sum is divided by its norm. Each answer is scored by the dot product
    of its vector with the question vector; the best-scoring letter wins.

    Args:
        data: table indexable as ``data[column][i]`` with the columns
            ``question`` and ``answerA`` .. ``answerD``; ``data.shape[0]``
            is the number of rows.
        word2vec: mapping from lower-cased word to its vector.
        N: length of the word vectors.

    Returns:
        A list with one of "A", "B", "C", "D" per row.
    """
    # A set makes the per-token stop-word test O(1) instead of a list scan.
    stop = set(stopwords.words('english'))
    labels = ["A", "B", "C", "D"]

    def embed(text):
        """Return the normalised sum of the word vectors found in *text*."""
        vec = np.zeros(N)
        for w in tokenize(text):
            word = w.lower()
            if word in word2vec and word not in stop:
                vec += word2vec[word]
        norm = linalg.norm(vec)
        # A text without any known word has norm 0. Dividing would produce
        # NaN scores, and argmax would then select exactly that answer, so
        # the zero vector (score 0) is kept instead.
        return vec / norm if norm > 0 else vec

    pred_answs = []
    for i in range(data.shape[0]):
        q_vec = embed(data['question'][i])
        answer_vecs = [embed(data['answer' + label][i]) for label in labels]

        # choose the answer based on the dot product with the question vector
        idx = np.concatenate(answer_vecs).reshape(4, N).dot(q_vec).argmax()
        pred_answs.append(labels[idx])

    return pred_answs

Пример #20

0

Показать файл

Файл: build.py Проект: codekansas/citation-generator

def build_vocab(docs, save_as):
    """Build the vocabulary of a corpus and pickle it.

    Args:
        docs: passed to ``utils.iterate_corpus``, which yields the paths of
            the corpus files.
        save_as: path of the pickle file that receives the vocabulary.

    Returns:
        The vocabulary as a list of distinct tokens (in set order).
    """
    start = time.time()
    vocab = set()
    for path in utils.iterate_corpus(docs):
        with open(path, 'r') as f:
            for line in f:
                vocab.update(utils.tokenize(line))
    vocab = list(vocab)
    # the with-block makes sure the pickle file is flushed and closed
    with open(save_as, 'wb') as out:
        pkl.dump(vocab, out)
    print('Built vocabulary and saved it to "%s" in %s' % (save_as, utils.strtime(time.time() - start)), file=sys.stderr)
    return vocab

Пример #21

0

Показать файл

Файл: ck12_wiki_predict.py Проект: johnkorn/kaggle_allen

def predict(data, docs_per_q):
    """Predict an answer letter per question and dump the scores as features.

    The TF-IDF tables of ``wiki_docs_dir`` are loaded once. For every row
    the documents returned by ``utils.get_docs_importance_for_question`` are
    visited and each answer is scored by summing ``tf * idf`` of its words
    over those documents. The four scores of every row are written, together
    with the row id, to 'features_ck12.csv' (columns id, fA, fB, fC, fD).

    Args:
        data: table whose ``iterrows()`` yields ``(index, row)`` pairs; a row
            provides ``question`` and ``answerA`` .. ``answerD``, and
            ``data['id']`` provides the ids.
        docs_per_q: passed through to
            ``utils.get_docs_importance_for_question``.

    Returns:
        A list with one of 'A', 'B', 'C', 'D' per row (the first letter wins
        on ties).
    """
    # index docs
    docs_tf, words_idf = utils.get_docstf_idf(wiki_docs_dir)

    labels = ['A', 'B', 'C', 'D']
    res = []
    f = []
    for index, row in data.iterrows():
        # words of each answer, in the order A, B, C, D
        answer_words = [set(utils.tokenize(row['answer' + label])) for label in labels]
        scores = [0, 0, 0, 0]

        ranked = utils.get_docs_importance_for_question(
            row['question'], docs_tf, words_idf, docs_per_q)
        # Only the first element of every ranked entry (the document key) is
        # needed. Iterating directly works on Python 2 and 3 and copes with
        # an empty ranking, where zip(*ranked)[0] raised IndexError.
        for entry in ranked:
            doc_tf = docs_tf[entry[0]]
            for k, words in enumerate(answer_words):
                for w in words:
                    if w in doc_tf:
                        scores[k] += 1. * doc_tf[w] * words_idf[w]

        res.append(labels[np.argmax(scores)])
        f.append(scores)

    # reshape keeps the four columns addressable even when data has no rows
    features = np.array(f).reshape(-1, 4)
    pd.DataFrame({'id': list(data['id']),'fA': features[:,0], 'fB': features[:,1], 'fC': features[:,2], 'fD': features[:,3]})[['id', 'fA', 'fB', 'fC', 'fD']].to_csv('features_ck12.csv', index = False)

    return res

Пример #22

0

Показать файл

Файл: models.py Проект: NUKnightLab/neighborhood-buzz

    def __init__(self, data, minimum_vocab_fraction=.02, include_ngrams=True):
        """Count document frequencies and select the valid features.

        :param data: iterable of (label, text) pairs.
        :param minimum_vocab_fraction: accepted but not used in this method.
        :param include_ngrams: passed on to utils.tokenize.

        Sets doc_freq (number of documents containing each token), doc_count,
        min_vocab_freq, max_vocab_freq (95% of the document count) and
        features (sorted tokens accepted by _is_valid_feature).
        """
        self.doc_freq = FreqDist()
        # Initialised so that empty data yields a count of 0 instead of a
        # NameError after the loop.
        count = 0
        for count, (label, text) in enumerate(data, start=1):
            # set(): a token counts once per document
            for word in set(utils.tokenize(text, include_ngrams, limit_ngrams=True)):
                self.doc_freq.inc(word)
        self.doc_count = count

        self.min_vocab_freq = 1
        self.max_vocab_freq = .95 * self.doc_count
        print('Min/max vocabulary frequency: %s %s' % (self.min_vocab_freq, self.max_vocab_freq))

        self.features = sorted(filter(self._is_valid_feature, self.doc_freq))

Пример #23

0

Показать файл

Файл: load_data.py Проект: vinodrajendran001/ml_research

def load_mol_data(calc_set, opt_set, struct_set, prop_set=None):
    '''
    Load data from data sets and return lists of structure names, full paths
    to the geometry data, the properties, and the meta data.

    One text file is read for every combination of optimization method
    (opt_set), structure set (struct_set) and calculation method (calc_set).
    Each line holds a structure name followed by its numeric properties.

    prop_set is only echoed in the printed summary.

    Returns the tuple (names, datasets, geom_paths, prop_out, meta, lengths):
    prop_out pairs each property (name, unit) with the tuple of its values
    over all structures, meta holds one feature row per structure (one-hot
    optimization method, one-hot calculation method, one-hot structure set,
    bias), and lengths holds the number of name tokens found in ARYL.
    '''
    print("Dataset options used")
    print("\tCalculation methods: %s" % (calc_set,))
    print("\tOptimization methods: %s" % (opt_set,))
    print("\tStructure sets: %s" % (struct_set,))
    print("\tProperties: %s" % (prop_set,))
    names = []
    datasets = []
    geom_paths = []
    properties = []
    meta = []
    lengths = []

    for j, base_path in enumerate(opt_set):
        for i, file_path in enumerate(calc_set):
            for m, atom_set in enumerate(struct_set):
                path = os.path.join(DATA_BASE_DIR, "mol_data", base_path, atom_set, file_path)
                with open(path + ".txt", 'r') as f:
                    for line in f:
                        temp = line.split()
                        name, props = temp[0], temp[1:]

                        names.append(name)
                        datasets.append((base_path, file_path, atom_set))

                        geom_path = os.path.join(DATA_BASE_DIR, "mol_data", base_path, 'geoms', 'out', name + '.out')
                        geom_paths.append(geom_path)

                        properties.append([float(x) for x in props])

                        # One-hot part for the optimization method. j is the
                        # index into opt_set (the original compared i, the
                        # calc_set index, which is wrong whenever the two
                        # sets differ in length or order).
                        base_part = [j == k for k, x in enumerate(opt_set)]
                        # One-hot part for the calculation method (index i).
                        method_part = [i == k for k, x in enumerate(calc_set)]
                        # One-hot part for the structure set (index m).
                        atom_part = [m == k for k, x in enumerate(struct_set)]
                        # Add bias feature
                        bias = [1]
                        meta.append(base_part + method_part + atom_part + bias)

                        tokens = tokenize(name, explicit_flips=True)
                        aryl_count = sum([1 for x in tokens if x in ARYL])
                        lengths.append(aryl_count)

    # "HOMO" was mangled to "H**O" in this copy of the source; restored.
    prop_desc = (("HOMO", "eV"), ("LUMO", "eV"), ("Excitation", "eV"))
    prop_vals = zip(*properties)
    prop_out = [(x, y, z) for ((x, y), z) in zip(prop_desc, prop_vals)]
    return names, datasets, geom_paths, prop_out, meta, lengths

Пример #24

0

Показать файл

Файл: rlogin.py Проект: tegola-hubs/dendria

    def quagga_info(self):
        """Return daemon name, version and OSPF neighbours reported by Quagga.

        The version is the last token of the first line printed by
        "zebra --version". Every non-empty line of the vtysh neighbour
        listing yields a dict with "routerid" (field 0), "v4addr" (field 4)
        and "ifname" (field 5 up to the first ":").

        :returns: dict with the keys "daemon", "version" and "ospf"
            (``{"neighbours": [...]}``).
        """
        version_output = self.machine.run("zebra --version")
        info = {
            "daemon": "Quagga",
            "version": tokenize(splitlines(version_output)[0])[-1],
            "ospf": {}
            }

        log.info("[%s] getting OSPF neighbours" % self.hostname())
        neighbour_output = self.machine.run("echo show ip ospf neighbor | vtysh | grep '^[1-9]'")
        token_rows = [tokenize(l) for l in splitlines(neighbour_output)]
        info["ospf"]["neighbours"] = [
            {
                "routerid": toks[0],
                "v4addr":   toks[4],
                "ifname":   toks[5].split(":")[0]
            }
            for toks in token_rows
            if len(toks) != 0
        ]
        return info

Пример #25

0

Показать файл

Файл: build.py Проект: codekansas/citation-generator

def build_index(docs, vocab, save_as):
    """Build term-frequency postings and document frequencies, then pickle them.

    Args:
        docs: passed to ``utils.iterate_corpus``, which yields the paths of
            the corpus files.
        vocab: sequence of words; a word's position is its index. Every
            token of the corpus must be part of it (KeyError otherwise).
        save_as: path of the pickle file that receives the result.

    Returns:
        ``(tf, df, files)``: ``tf`` maps each word index to a list of
        ``(weight, document number)`` pairs sorted by descending weight,
        ``df`` counts the documents containing each word index, and
        ``files`` lists the corpus paths in document-number order.
    """
    start = time.time()
    word2idx = {word: idx for idx, word in enumerate(vocab)}
    tf = {i: [] for i in range(len(vocab))}
    df = Counter()
    # the corpus is listed once; the list provides both count and order
    files = list(utils.iterate_corpus(docs))
    n_docs = len(files)
    for i, path in enumerate(files):
        print('%d/%d %s' % (i+1, n_docs, utils.strtime(time.time() - start)), file=sys.stderr, end='\r')
        with open(path, 'r') as f:
            text = f.read()
        # tokenize once and reuse the indices for both statistics
        indices = [word2idx[w] for w in utils.tokenize(text)]
        word_counts = Counter(indices)
        df.update(set(indices))
        n_words = utils.counter_sum(word_counts)
        # log(1) is 0, so a one-word document would divide by zero; its
        # count is left unscaled instead.
        scale = math.log(n_words) if n_words > 1 else 1.0
        for word, count in word_counts.items():
            tf[word].append((count / scale, i))
    for postings in tf.values():
        postings.sort(key=lambda x: x[0], reverse=True)
    tfidf = tf, df, files
    # the with-block makes sure the pickle file is flushed and closed
    with open(save_as, 'wb') as out:
        pkl.dump(tfidf, out)
    print('Processed %d documents in %s' % (n_docs, utils.strtime(time.time() - start)), file=sys.stderr)
    return tfidf

Пример #26

0

Показать файл

Файл: taggers.py Проект: chrisleewashere/nlpnet

 def tag(self, text):
     """
     Tags the given text.

     The text is split into sentences of tokens by utils.tokenize and each
     sentence is tagged with self.tag_tokens.

     :param text: a string or unicode object. Strings assumed to be utf-8
     :returns: a list with one entry per sentence, each pairing the tokens
         of the sentence with their tags via zip.
     """
     sentences = utils.tokenize(text, clean=False)
     return [zip(sent, self.tag_tokens(sent)) for sent in sentences]

Пример #27

0

Показать файл

Файл: sentenceClusterer.py Проект: ddeeps2610/Planner

 def clusterSentence(self, sentence):
     """
     Adds the given sentence to the clusters via self.clusterize and
     saves the clusters afterwards.

     The lower-cased sentence is tokenized, lemmatized and stripped of
     stop words; the resulting term counts are passed to clusterize
     together with the original sentence.

     sentence - sentence to be clustered
     """
     lowered = sentence.lower()
     terms = utils.filterStopWords(utils.lemmatize(utils.tokenize(lowered)))
     term_counts = dict(Counter(terms))
     self.clusterize(term_counts, sentence)

     # persist the clusters after every newly clustered sentence
     self.saveClusters()

Пример #28

0

Показать файл

Файл: 1_ck12_wiki_predict.py Проект: Sirorezka/a-l-l-e-n-_-m-a-s-t-_-r

def predict(data, docs_per_q):
    """
    Pick an answer (A-D) for every row of ``data`` by scoring answer words
    against the documents ranked for the question.

    For each document returned by ``utils.get_docs_importance_for_question``
    every answer accumulates ``tf * idf`` of its words found in that document.

    :param data: table with 'question' and 'answerA'..'answerD' columns,
        iterated with ``iterrows()``
    :param docs_per_q: passed through to
        ``utils.get_docs_importance_for_question``
    :returns: ``(res, doc_score)`` -- ``res`` is the list of chosen letters,
        ``doc_score`` is a header row ``["A","B","C","D"]`` followed by the
        four scores of every question
    """
    # index docs
    docs_tf, words_idf = utils.get_docstf_idf(wiki_docs_dir)

    labels = ['A', 'B', 'C', 'D']
    res = []
    doc_score = [["A", "B", "C", "D"]]
    for index, row in data.iterrows():
        # get answers words (one set per answer, in A..D order)
        answer_words = [set(utils.tokenize(row['answer' + label])) for label in labels]
        scores = [0] * len(labels)

        q = row['question']

        ranked = utils.get_docs_importance_for_question(q, docs_tf, words_idf, docs_per_q)
        # Walk the ranked entries directly: the first element of each entry is
        # the document key. Unlike ``list(zip(*ranked))[0]`` this does not
        # raise IndexError when no document is returned for the question.
        for entry in ranked:
            d = entry[0]
            for k, words in enumerate(answer_words):
                for w in words:
                    if w in docs_tf[d]:
                        scores[k] += 1. * docs_tf[d][w] * words_idf[w]

        # argmax keeps the first label on ties
        res.append(labels[np.argmax(scores)])
        doc_score.append(scores)
    return res, doc_score

Пример #29

0

Показать файл

Файл: similar_articles.py Проект: lewer/scripts-diplo

def find_similar_articles(corpus_name, method, id=None, content=None):
    """
    Return the five articles most similar to a reference article.

    The reference is either an article already in the corpus (``id``) or a
    free text (``content``) which is tokenized, converted to a bag of words
    and projected with the model matching ``method``.

    :param corpus_name: prefix shared by all corpus/index/model files
    :param method: 'tfidf', or a name starting with 'lsi' or 'lda'
    :param id: identifier of an article of the corpus (takes precedence)
    :param content: raw text, used when ``id`` is None
    :returns: list of at most five ``(article, similarity)`` tuples, most
        similar first
    :raises IOError: when the corpus, index or dictionary file cannot be loaded
    :raises ValueError: when ``content`` is given with an unsupported ``method``
    :raises Exception: when neither ``id`` nor ``content`` is provided
    """

    corpus_file = corpus_name + '_' + method + '.mm'
    index_file = corpus_name + '_' + method + '_index'
    docid_file = corpus_name + '_docid.txt'

    try:
        corpus = corpora.mmcorpus.MmCorpus(corpus_file)
    except Exception:
        raise IOError('Impossible de charger le fichier %s' % (corpus_file))

    try:
        index = similarities.docsim.Similarity.load(index_file)
    except Exception:
        raise IOError('Impossible de charger le fichier %s' % (index_file))

    if id is not None:
        corpus_id = utils.get_article_by_id(id, docid_file)
        tokens = corpus[corpus_id]

    elif content is not None:
        dico_file = corpus_name + '_wordids.txt'

        try:
            id2word = corpora.dictionary.Dictionary.load_from_text(dico_file)
        except Exception:
            raise IOError("Impossible de charger le fichier %s" % (dico_file))

        # The model file name is built from the ``method`` parameter; the
        # previous code read a global ``args.method`` here, which only worked
        # when the function was called from the command-line script.
        if method == 'tfidf':
            model_file = corpus_name + '_tfidf_model'
            model = models.tfidfmodel.TfidfModel.load(model_file)

        elif method.startswith('lsi'):
            model_file = corpus_name + '_' + method + '_model'
            model = models.lsimodel.LsiModel.load(model_file)

        elif method.startswith('lda'):
            model_file = corpus_name + '_' + method + '_model'
            model = models.ldamodel.LdaModel.load(model_file)

        else:
            # Previously fell through and failed later on an unbound ``model``.
            raise ValueError("Unsupported method: %s" % (method,))

        tokens = model[id2word.doc2bow(utils.tokenize(content))]

    else:
        raise Exception("Il faut fournir un id ou un contenu")

    sims = index[tokens]
    # Sort (corpus position, similarity) pairs by decreasing similarity.
    sims = sorted(enumerate(sims), key=lambda item: -item[1])

    return [(utils.get_article_by_corpus_number(x[0], docid_file), x[1]) for x in sims[:5]]

Пример #30

0

Показать файл

Файл: taggers.py Проект: chrisleewashere/nlpnet

 def tag(self, text, no_repeats=False):
     """
     Run the SRL process on the given text.

     :param text: unicode or str encoded in utf-8.
     :param no_repeats: whether to prevent repeated argument labels
     :returns: a list with one ``tag_tokens`` result per tokenized sentence
         (SRLAnnotatedSentence objects, per the original description)
     """
     # NOTE(review): ``no_repeats`` is accepted but never forwarded to
     # ``tag_tokens`` -- confirm whether that is intentional.
     sentences = utils.tokenize(text, clean=False)
     return [self.tag_tokens(sentence) for sentence in sentences]

Пример #31

0

Показать файл

from collections import Counter

import nltk
import numpy as np
import pandas

# noinspection PyUnresolvedReferences
from utils import tokenize

# importing corpus as resume
resume_file = open('../assets/resume.txt', 'r')
resume = resume_file.read().lower()
resume_file.close()

# tokenizing the resume
tokens = tokenize(resume)

# dividing corpus into 6 documents
k = len(tokens) // 6
documents = []
for i in range(5):
    documents.append(tokens[i * k:(i + 1) * k])
documents.append(tokens[5 * k:])

# calculating most common 5 tokens from each document and storing frequency tables for each document
most_common = set()
document_frequencies = []
for document in documents:
    frequencies = Counter(document)
    document_frequencies.append(frequencies)
    for word, frequency in frequencies.most_common(5):

Пример #32

0

Показать файл

    def read_training_dataset(self, input_path):
        """
        Load a JSON training set and fill the sentence, intent and slot lists.

        The JSON object is indexed by the stringified sample number
        (``"0"``, ``"1"``, ...); every entry provides ``"text"``,
        ``"intent"`` and ``"slots"`` (slot name -> target words).

        Appends to ``self.stcs_literals``, ``self.stcs``, ``self.lengths``,
        ``self.intents`` and ``self.slots`` and finally exposes the encoded
        sentences and slots as ``self.X`` / ``self.Y``.

        :param input_path: path of the JSON file to read
        :raises IndexError: when a slot word matches no token, not even as a
            substring
        """
        with open(input_path) as f:

            data = json.load(f)
            self.no_samples = len(data)

            # Register the special symbols first (per the original comment,
            # '<PAD>' is there for padding).
            self.words_converter.T2id('<PAD>')

            self.words_converter.T2id('<SOS>')

            self.slots_converter.T2id('<PAD>')
            self.slots_converter.T2id('<SOS>')

            self.slots_converter.T2id('-')

            for i in tqdm(range(self.no_samples)):

                entry = data[str(i)]

                text = entry["text"]
                text = normalizeString(text)
                tokens = tokenize(text)
                self.stcs_literals.append(tokens)
                tokens_id = [self.words_converter.T2id(token) for token in tokens]
                tokens_id.append(self.words_converter.T2id('</s>'))
                self.stcs.append(tokens_id)
                self.lengths.append(len(tokens_id))

                intent = entry["intent"]

                self.intents.append(self.intent_converter.T2id(intent))

                slots_dictionary = entry["slots"]
                # One slot id per encoded token; position 0 holds <SOS>, so
                # the slot of token k is stored at k + 1.
                slots_id = [self.slots_converter.T2id('-')] * len(tokens_id)
                slots_id[0] = self.slots_converter.T2id('<SOS>')

                no_slots_in_stc = 0
                for slot, target_words in slots_dictionary.items():
                    target_words = normalizeString(target_words)
                    target_word_list = tokenize(target_words)
                    for word in target_word_list:
                        no_slots_in_stc += 1
                        try:
                            idx = tokens.index(word)
                        except ValueError:
                            # No exact token: fall back to the first token
                            # that contains the word as a substring.
                            idx = [
                                pos for pos, s in enumerate(tokens) if word in s
                            ][0]

                        # +1 account for <SOS>
                        slots_id[idx + 1] = self.slots_converter.T2id(slot)

                # keep count of no slots: one extra T2id('-') call per
                # position that did not receive a slot
                for j in range(len(tokens_id) - no_slots_in_stc):
                    self.slots_converter.T2id('-')

                self.slots.append(slots_id)
                # self.slots.append(torch.tensor(slots_id, dtype=torch.long, device=self.device))

            # add padding
            # NOTE(review): ``ncols`` is not used in the visible code.
            ncols = max(self.lengths)

            self.X = self.stcs
            self.Y = self.slots

Пример #33

0

Показать файл

Файл: generate_wordcloud.py Проект: tomasborrella/disaster-response-pipeline

# import libraries
import pandas as pd
from sqlalchemy import create_engine
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from utils import tokenize


print('Loading data...')
# Read the whole message table from the local SQLite database.
engine = create_engine('sqlite:///data/DisasterResponse.db')
df = pd.read_sql_table('disaster_message_category', engine)

print('Tokenizing words...')
# Concatenate all messages, tokenize them, and re-join the tokens so that
# WordCloud receives a single space-separated string.
word_string = " ".join(df['message'])
word_string_final = " ".join(tokenize(word_string))

print('Creating wordcloud...')
cloud_options = {
    'width': 800,
    'height': 400,
    'background_color': 'white',
    'max_words': 300,
}
wordcloud = WordCloud(**cloud_options).generate(word_string_final)

print('Generating png image...')
# Render the cloud without axes or padding and save it as a PNG.
plt.figure(figsize=(8, 4), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.savefig('app/static/images/wordcloud.png', dpi=105)

Пример #34

0

Показать файл

Файл: questplusplus.py Проект: zouharvi/ptakopet-server

    def qe(self, sourceLang, targetLang, sourceText, targetText):
        """
        Performs translation quality estimation on sourceText to targetText using QuEst++ and fast_align
        It's ok to raise Exceptions here. They are handled upstream.

        Steps: sentence-align both texts, tokenize them, word-align them with
        fast_align, extract word-level features with QuEst++, then run the
        learning script and read the predictions from ``predicted.csv``.
        Intermediate files live in ``data/tmp`` and are removed at the end.

        :param sourceLang: source language code
        :param targetLang: target language code
        :param sourceText: source text
        :param targetText: translated text
        :returns: ``{'status': 'OK', 'qe': [float, ...]}`` with one value per
            line of ``predicted.csv``
        :raises Exception: for an unsupported language pair or when the
            feature extractor produced no output file
        """
        os.makedirs('data/tmp', exist_ok=True)

        if [sourceLang, targetLang] not in self.supportedPairs:
            raise Exception("{}-{} language pair not supported".format(
                sourceLang, targetLang))

        # Sanitize input
        aligned = hunalign(sourceText, targetText)
        sourceText = [tokenize(x[0], sourceLang, False) for x in aligned]
        # The target side must be tokenized with the target language (the
        # previous code passed sourceLang here as well).
        targetText = [tokenize(x[1], targetLang, False) for x in aligned]
        sourceTextPlain = '\n'.join([' '.join(x) for x in sourceText])
        targetTextPlain = '\n'.join([' '.join(x) for x in targetText])

        alignments = fast_align.FastAlign().align(sourceLang, targetLang,
                                                  sourceTextPlain,
                                                  targetTextPlain)['alignment']
        with open('data/tmp/alignments', 'w') as fileAlignments:
            fileAlignments.write(alignments)

        with open('data/tmp/source', 'w') as fileSource:
            fileSource.write(sourceTextPlain)

        with open('data/tmp/target', 'w') as fileTarget:
            fileTarget.write(targetTextPlain)

        with DirCrawler('qe/questplusplus'):
            print("Extracting features")
            # NOTE(review): the extractor is always invoked with
            # "-lang english spanish", whatever language pair was requested.
            (_output, _error) = bash("""
                 java -cp QuEst++.jar:lib/* shef.mt.WordLevelFeatureExtractor
                 -lang english spanish
                 -input ../../data/tmp/source ../../data/tmp/target
                 -alignments ../../data/tmp/alignments
                 -config ../questplusplus-config/config.word-level.properties
                 """)

            outputFile = 'output/test/output.txt'
            if not os.path.isfile(outputFile):
                raise Exception('Server Processing Error')
            with open(outputFile, 'r') as outputFileR:
                features = outputFileR.readlines()

        os.remove('data/tmp/alignments')
        os.remove('data/tmp/source')
        os.remove('data/tmp/target')

        # Each line holds tab-separated "name=value" pairs; keep the values.
        features = [[
            x.split('=')[1] for x in line.rstrip('\n').rstrip('\t').split('\t')
        ] for line in features]
        with open('data/tmp/features', 'w') as fileFeatures:
            fileFeatures.write('\n'.join(['\t'.join(x) for x in features]))
        # One constant label '1' per feature line.
        with open('data/tmp/labels', 'w') as fileLabels:
            fileLabels.write('\n'.join(['1'] * len(features)))

        with DirCrawler('qe/questplusplus'):
            print("Removing output directory structure for feature extractor")
            os.remove(outputFile)
            os.rmdir('output/test')
            os.rmdir('output')

            print("Machine Learning")
            (_output, _error) = bash(f"""
                python learning/src/learn_model.py ../questplusplus-config/svr_{sourceLang}_{targetLang}.cfg
                """)

            # The prediction is the second tab-separated column of each line.
            with open('predicted.csv', 'r') as predictedFile:
                output = [
                    float(x.rstrip('\n').split('\t')[1])
                    for x in predictedFile.readlines()
                ]
            os.remove('predicted.csv')

        os.remove('data/tmp/features')
        os.remove('data/tmp/labels')
        os.rmdir('data/tmp')
        return {'status': 'OK', 'qe': output}

Пример #35

0

Показать файл

Файл: main.py Проект: simtony/Neural-Poetry-Generator

    # Evaluation setup
    'sample': '如',
    'max_sample_length': 50,
    'sample_range':
    2  # how many words in the dictionary to be considered when sampling
}

# -------------------------Data feeding preparation---------------
# Read and tokenize data
texts = [
    './data/qts_tab.txt', './data/qsc_tab.txt', './data/qtais_tab.txt',
    './data/qss_tab.txt'
]
# max and min length of poem sequence
maxlen = 100
minlen = 7
poems = []
# for t in texts:
#     poems.extend(utils.read_poem(t))
for t in texts:
    poems.extend(utils.read_regular_poem(t))

poems = utils.chop_poems(poems, maxlen, minlen)
data, count, dictionary, reverse_dictionary = utils.tokenize(
    poems, params['vocabulary_size'])

rnnlm = language_model.RNNLM(params, data, count, dictionary,
                             reverse_dictionary)
rnnlm.train(sample_interval=100, save_interval=5000, logger=None)
# rnnlm.sample(sample_len=100, checkpoint_dir='./tmp/rnndata/')

Пример #36

0

Показать файл

data = pd.read_csv('../data/data.csv', skiprows=0)
filtered = data[[
    'REGI', 'TYPO', 'VISUAL_SIMILARITY', 'SOUNDEX_DISTANCE'
]][(data['EDIT_DISTANCE'] == 1) & (data['IS_TYPO'] == 1)
   & ((data['VISUAL_SIMILARITY'] >= 0.8) | (data['SOUNDEX_DISTANCE'] <= 1))]
filtered = filtered[filtered.TYPO.map(lambda x: x.count('.')) == 2]
filtered = filtered[filtered.REGI.map(lambda x: x.count('.')) == 2]
filtered.reset_index(drop=True, inplace=True)

reg_list = list()
typo_list = list()
for i in range(t.shape[0]):
    reg_list.append(filtered['REGI'][i].split('.')[0])
    typo_list.append(filtered['TYPO'][i].split('.')[0])

in_list, out_list = utils.tokenize(reg_list, typo_list, token_size)

in_vocab = set()
out_vocab = set()
for name in in_list:
    for char in name:
        in_vocab.add(char)
for name in out_list:
    for char in name:
        out_vocab.add(char)
vocab = in_vocab.union(out_vocab)
num_encoder_tokens = len(in_vocab)
num_decoder_tokens = len(out_vocab)
max_encoder_seq_length = max([len(name) for name in in_list])
max_decoder_seq_length = max([len(name) for name in out_list])

Пример #37

0

Показать файл

Файл: html_to_json.py Проект: catwy/mayors

def html_to_json(url):
    """
    Scrape the tables of the page at ``url`` into a nested dictionary.

    ``tokenize(url)`` yields a category that selects the JSON schema
    ``schema/<category>.json``. The schema maps a table title to its
    extraction rules: 'ignore image', 'column index', 'strict match',
    'regex match' and 'terminate on mismatch'.

    :param url: address of the page to download and parse
    :returns: dict mapping each recognised table title to a dict of
        row key -> list of ``{'text': ..., 'link': ...}`` cells
    """
    category, uid = tokenize(url)
    schema_name = 'schema/{}.json'.format(category)
    with open(schema_name, 'rb') as fp:
        template = json.load(fp)
    html_doc = get_html(url)
    soup = BeautifulSoup(html_doc, 'html.parser')

    # State carried from row to row: the table currently being filled (None
    # while outside any known table) and whether cells with images are skipped.
    table_title = None
    result = {}
    ignore_image = True
    for tr in soup.find_all('tr'):
        # keep only the innermost rows: skip any <tr> that contains other <tr>
        if tr.find_all('tr'):
            continue
        is_title_row = False
        row_content = []
        for td in tr.find_all('td'):
            if ignore_image and td.find_all('img'):
                continue
            text = clean_up(td.text)
            if text in template:
                # A cell whose text is a schema key starts a new table; the
                # rest of this row is ignored.
                table_title = text
                is_title_row = True
                row_titles = template[table_title]
                ignore_image = row_titles['ignore image']
                result[table_title] = {}
                break
            # Keep the href of the last <a> in the cell ('' when there is none).
            link = ''
            for a in td.find_all('a'):
                link = a.get('href')
            row_content.append({'text': text, 'link': link})

        if is_title_row:
            continue

        # Nothing to store for empty rows or rows outside a known table.
        if not row_content or not table_title:
            continue

        column_index = row_titles['column index']
        strict_match = row_titles['strict match']
        regex_match = row_titles['regex match']
        terminate_on_mismatch = row_titles['terminate on mismatch']

        matched = False
        # The row needs a title cell at column_index and at least one cell after it.
        if len(row_content) > column_index + 1:
            candidate_row_title = row_content[column_index]['text']
            # Exact titles first; each one is stored only once per table.
            for s in strict_match:
                if s == candidate_row_title and s not in result[table_title]:
                    matched = True
                    result[table_title][s] = row_content[column_index + 1:]
                    break
            if not matched:
                for s in regex_match:
                    # Substring hit: stored under the fixed 'Certified Votes' key.
                    if s in candidate_row_title:
                        matched = True
                        result[table_title][u'Certified Votes'] = row_content[column_index + 1:]
                        break
                    # Regex hit: keyed by the id extracted from the link of the
                    # next cell; the title cell itself is kept in the stored row.
                    if re.match(s, candidate_row_title):
                        matched = True
                        category, race_id = tokenize(row_content[column_index + 1]['link'])
                        result[table_title][race_id] = row_content[column_index:]
                        break
        # An unmatched row closes the current table when the schema asks for it.
        if terminate_on_mismatch and not matched:
            table_title = None
            ignore_image = True
    return result

Пример #38

0

Показать файл

Файл: construct_vocab.py Проект: cgl/turkish-parliament-texts

    args = parser.parse_args()

    logging.basicConfig(filename=args.log_filepath,
                        format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    if args.command == "construct_vocab":

        dictionary = corpora.Dictionary()

        count = 0

        line = sys.stdin.readline()
        while line:

            tokens = tokenize(line)

            dictionary.add_documents([tokens], prune_at=None)
            count += 1

            if count % 100000 == 0:
                print_err("line %d %d" % (count, len(dictionary)))

            line = sys.stdin.readline()

        dictionary.save(args.vocabulary_filename)
        dictionary.save_as_text(args.vocabulary_filename + ".txt")

    elif args.command == "construct_corpus":
        # use glob to recurse under data/TXTs directory

Пример #39

0

Показать файл

Файл: convert_utils.py Проект: amazingguni/commits-dataset

def convert_filename(filename):
    """Return the tokenized file name (with '/' replaced by '.') wrapped
    between ``constants.FILE_START`` and ``constants.FILE_END``, newline-terminated."""
    dotted_path = filename.replace('/', '.')
    tokens = tokenize(dotted_path)
    return '{} {} {}\n'.format(constants.FILE_START, tokens, constants.FILE_END)

Пример #40

0

Показать файл

Файл: semcor.py Проект: aseeng/Semcor

from nltk.corpus import semcor
import utils

# Evaluate utils.find_synset on nouns of one SemCor file: for each sentence
# the last eligible noun is disambiguated and compared with its gold sense key.
count = 0
num_sentences = 0
# Parse the tag file once instead of re-reading it on every iteration.
sentences = semcor.xml('brown2/tagfiles/br-n12.xml').findall('context/p/s')
# Look at the first 100 sentences at most (slicing also copes with shorter files).
for sent in sentences[:100]:

    sentence = ""
    name = ""

    # Iterate the element directly: Element.getchildren() was removed in Python 3.9.
    for wordform in sent:
        sentence += wordform.text + " "
        if wordform.get('pos') == "NN" and wordform.text != "anyone":
            name = wordform.text
            sense_key = wordform.get('lexsn')

    context = utils.tokenize(sentence)

    # Compare by value; ``is not ""`` tested object identity.
    if name != "":
        best_sense = utils.find_synset(context, name)
        num_sentences += 1
        # The last 9 characters of the lemma key are compared with 'lexsn'.
        if sense_key == best_sense.lemmas()[0].key()[-9:]:
            count += 1
    # Stop once 50 sentences have been evaluated.
    if num_sentences == 50:
        break

print("accuracy = " + str(count*100/num_sentences) + " %")

Пример #41

0

Показать файл

Файл: train_val.py Проект: Niksv98/keras_speller_eng

# http://arxiv.org/abs/1410.4615
# "Sequence to Sequence Learning with Neural Networks"
# https://arxiv.org/abs/1409.3215
reverse = True

data_path = './data'
train_books = [
    'nietzsche.txt', 'pride_and_prejudice.txt', 'shakespeare.txt',
    'war_and_peace.txt', 'botanical_2.txt', 'botanical_terms.txt'
]
val_books = ['wonderland.txt', 'botanical_1.txt']

if __name__ == '__main__':
    # Prepare training data.
    text = read_text(data_path, train_books)
    vocab = tokenize(text)
    vocab = list(filter(None, set(vocab)))

    # `maxlen` is the length of the longest word in the vocabulary
    # plus two SOS and EOS characters.
    maxlen = max([len(token) for token in vocab]) + 2
    train_encoder, train_decoder, train_target = transform(
        vocab, maxlen, error_rate=error_rate, shuffle=False)
    print(train_encoder[:10])
    print(train_decoder[:10])
    print(train_target[:10])

    input_chars = set(' '.join(train_encoder))
    target_chars = set(' '.join(train_decoder))
    nb_input_chars = len(input_chars)
    nb_target_chars = len(target_chars)

Пример #42

0

Показать файл

Файл: russian.rt.com.py Проект: fostroll/ru_corner

        #                                  .replace('ё', 'ё') \
        #                                  .strip()
        line = utils.norm_text2(re2.sub('', line))
        if line:
            lines.append(' '.join(line.split()))
    if len(lines) >= _utils.MIN_TEXT_LINES:
        texts_total += 1
        if link_no > start_link_idx:
            with open(page_fn, 'wt', encoding='utf-8') as f:
                print(link, file=f)
                f.write(page)
        with open(text_fn, 'wt', encoding='utf-8') as f:
            print(link, file=f)
            print(header, file=f)
            f.write('\n'.join(lines))
        print('\r{} (of {})'.format(texts_total,
                                    min(utils.TEXTS_FOR_SOURCE, num_links)),
              end='')
        need_enter = True
    #exit()
if need_enter:
    print()
'''===========================================================================
Chunks creation
==========================================================================='''
_utils.make_chunks(num_links)
'''===========================================================================
Tokenization
==========================================================================='''
utils.tokenize(num_links, isdialog=False)

Пример #43

0

Показать файл

def generate_repo_dataset(fullname, branch, sha_list, repo_dir, writer):
    """
    Write one dataset record per usable commit of a repository.

    A commit is kept when the first sentence of its message, its line diff
    and its word diff all survive cleaning and the length limits from
    ``constants``; the line diff must also overlap with the message words.

    :param fullname: repository name, used in the record index and the log line
    :param branch: not used by this function
    :param sha_list: commit hashes to process
    :param repo_dir: path of the local clone
    :param writer: object whose ``write`` receives index, original and
        cleaned message, original and cleaned line diff, original and cleaned
        word diff
    :returns: number of records written
    """
    repo = Repo(repo_dir)
    total_cnt, current_cnt, msg_skip, diff_skip, word_skip = 0, 0, 0, 0, 0
    for sha in sha_list:
        commit = repo.commit(sha)
        total_cnt += 1
        commit_msg = commit.message
        sentences = split_sentence(commit_msg)
        if not sentences:
            continue
        # Only the first sentence of the message is used as the target.
        commit_msg = sentences[0].strip()
        commit_msg_lower = commit_msg.lower()
        # Skip reverts and merge commits.
        if 'revert' in commit_msg_lower or commit_msg_lower.startswith('merge '):
            msg_skip += 1
            continue
        commit_msg = remove_redundant_white_space(commit_msg.strip())
        origin_commit_msg = commit_msg
        if not commit_msg:
            msg_skip += 1
            continue

        commit_msg = tokenize(commit_msg)
        commit_msg = remove_last_special_char(commit_msg.strip())
        commit_msg = remove_no_english_str(commit_msg)
        commit_msg = remove_redundant_white_space(commit_msg.strip())
        commit_msg = commit_msg.strip()
        if not commit_msg:
            msg_skip += 1
            continue
        commit_words = commit_msg.split()
        # if not starts_with_verb(commit_words):
        #     msg_skip += 1
        #     continue
        if len(commit_words) > constants.TARGET_SEQ_LEN_MAX:
            msg_skip += 1
            continue

        line_diff = get_line_diff(repo_dir, sha)
        if not line_diff:
            diff_skip += 1
            continue
        origin_line_diff = line_diff
        line_diff = remove_no_english_str(line_diff)
        line_diff = remove_redundant_white_space(line_diff.strip())
        line_diff_words = line_diff.split()
        # The diff must overlap with the message words.
        if not overlap_two_seq(line_diff_words, commit_words):
            diff_skip += 1
            continue

        if len(line_diff_words) > constants.SOURCE_SEQ_LEN_MAX:
            diff_skip += 1
            continue

        word_diff = get_word_diff(repo_dir, sha)
        if not word_diff:
            word_skip += 1
            continue
        origin_word_diff = word_diff
        word_diff = remove_no_english_str(word_diff)
        word_diff = remove_redundant_white_space(word_diff.strip())
        if not word_diff:
            word_skip += 1
            continue
        index = f'{fullname} {sha}'
        writer.write(index, origin_commit_msg, commit_msg, origin_line_diff, line_diff, origin_word_diff, word_diff)
        current_cnt += 1
    print(f'{fullname}:  {current_cnt}/{total_cnt}')
    return current_cnt

Пример #44

0

Показать файл

def search(*arguments):
    """
    Answer every query of a query file and write the results to
    ``./query_op.txt``.

    Each query line has the form ``<number of results>, <query text>``. A
    query containing one of the field prefixes ``t: b: i: c: r: l:`` is run
    as a field query, any other one as a plain query. For each query the
    output holds one ``key,title`` line per result, then the elapsed time
    and the elapsed time divided by the requested number of results.

    :param arguments: ``arguments[0]`` is the path of the query file
    """
    # Local import keeps the block self-contained.
    from contextlib import ExitStack

    print("Loading Files")
    # ExitStack closes the output, vocabulary and title files even when a
    # query fails half-way (they used to stay open).
    with ExitStack() as stack:
        outfile = stack.enter_context(open("./query_op.txt", 'w'))
        with open(arguments[0], 'r') as f:
            queries = f.readlines()
        with open("./inverted_index/titleOffset.txt", 'r') as f:
            titleOffSet = [int(line.strip()) for line in f]
        with open("./inverted_index/offset.txt", 'r') as f:
            offset = []
            for line in f:
                try:
                    offset.append(int(line.strip()))
                except ValueError:
                    # Ignore lines that do not hold an integer.
                    continue
        vocabFile = stack.enter_context(open("./inverted_index/vocab.txt", 'r'))
        titleFile = stack.enter_context(open("./inverted_index/title.txt", 'r'))
        with open("./inverted_index/fileNumbers.txt", 'r') as f:
            nFiles = int(f.read().strip())
        key_words = ['t:', 'b:', 'i:', 'c:', 'r:', 'l:']
        print("Starting Queries")
        numQueries = 0
        for query in queries:
            startTime = time.time()
            numQueries += 1
            query = query.strip().lower()
            # Split on the first comma only, so the query text may itself
            # contain commas.
            numResults, query = query.split(",", 1)
            query = query.strip()
            numResults = int(numResults)
            queryType = "Plain"
            for w in key_words:
                if w in query:
                    queryType = "Field"
                    break

            if queryType == "Field":
                # re.split with capture groups yields the prefixes and the
                # texts between them (plus None/'' entries, dropped below);
                # they are then read as (prefix, text) pairs.
                q = re.split("(t:)|(b:)|(i:)|(c:)|(r:)|(l:)", query)
                q = [i.strip() for i in q if i is not None and i != ""]
                queryDict = defaultdict(list)
                for idx in range(0, len(q), 2):
                    data = tokenize(q[idx + 1].lower())
                    data = [w for w in data if w not in stopWords]
                    data = stemmer.stemWords(data)
                    queryDict[q[idx].split(":")[0]].extend(data)
                results, docFreq = fieldQuery(queryDict, vocabFile, offset)
                results = rank(results, docFreq, nFiles)
            else:
                q = tokenize(query)
                q = [w for w in q if w not in stopWords]
                q = stemmer.stemWords(q)
                t = simpleQuery(q, vocabFile, offset)
                results, docFreq = t[0], t[1]
                results = rank(results, docFreq, nFiles)

            if len(results) > 0:
                # Keep the numResults keys with the highest score.
                results = sorted(results, key=results.get, reverse=True)
                results = results[:numResults]
                for key in results:
                    title, _ = fileBinarySearch(
                        0, len(titleOffSet), titleOffSet, key, titleFile, 'int')
                    print(','.join([key, ' '.join(title)]), file=outfile)
            endTime = time.time()
            # NOTE(review): a query asking for 0 results makes this division fail.
            print(
                "{0}, {1}".format(
                    endTime - startTime,
                    (endTime - startTime) / numResults),
                file=outfile)

            print('\n', file=outfile)

Пример #45

0

Показать файл

                text = None
            break
        if not res:
            if not SILENT:
                if not text:
                    print('no text')
                    #if nop:
                    #    exit()
                else:
                    print('text beyond limits:')
                    print(text)
            continue
        texts_total += 1
        with open(text_fn, 'wt', encoding='utf-8') as f:
            print(link, file=f)
            f.write(text)
        print('\r{} (of {})'.format(texts_total, utils.TEXTS_FOR_SOURCE),
              end='')
        need_enter = True
        #exit()
    if need_enter:
        print()
'''===========================================================================
Chunks creation
==========================================================================='''
_utils.make_chunks(utils.TEXTS_FOR_SOURCE)
'''===========================================================================
Tokenization
==========================================================================='''
utils.tokenize(utils.TEXTS_FOR_SOURCE, isdialog=False)

Пример #46

0

Показать файл

Файл: run.py Проект: khushmeeet/code-rnn

            model.cuda()

        gen_text = generation(embedding, model, state, options.n,
                              options.primer)
        print(gen_text)
    else:
        lr = model_settings['learning_rate']
        layers = model_settings['layers']
        batch_size = model_settings['batch_size']
        rnn_size = model_settings['rnn_size']
        embed_size = model_settings['embed_size']
        seq_length = model_settings['seq_length']
        dropout = model_settings['dropout']
        data_size = 256  # ???

        train_x = utils.tokenize(options.train_data)
        train_x = utils.batchify(train_x, batch_size)
        num_batches = train_x.size(0) // seq_length

        if len(options.load_model) > 0:
            checkpoint = torch.load(options.load_model)
            embedding = checkpoint['embed']
            model = checkpoint['rnn']
        else:
            embedding = nn.Embedding(256, embed_size)
            model = Stacked_mLSTM(mLSTM, layers, embed_size, rnn_size,
                                  data_size, dropout)

        loss_fn = nn.CrossEntropyLoss()
        embed_optimizer = optim.Adam(embedding.parameters(), lr=lr)
        model_optimizer = optim.Adam(model.parameters(), lr=lr)

Пример #47

0

Показать файл

Файл: interactive-eval.py Проект: schang8000/multiffn-nli

    logger = utils.get_logger()

    logger.info('Reading model')
    sess = tf.InteractiveSession()
    model = multimlp.MultiFeedForward.load(args.load, sess)
    word_dict, embeddings = readdata.load_embeddings(args.embeddings, args.vocab,
                                                     generate=False,
                                                     load_extra_from=args.load)
    embeddings = utils.normalize_embeddings(embeddings)
    model.initialize_embeddings(sess, embeddings)
    number_to_label = {v: k for (k, v) in utils.label_map.items()}

    while True:
        sent1 = raw_input('Type sentence 1: ')
        sent2 = raw_input('Type sentence 2: ')
        tokens1 = utils.tokenize(sent1)
        tokens2 = utils.tokenize(sent2)
        vector1 = convert_tokens(tokens1, word_dict, model.max_time_steps1)
        vector2 = convert_tokens(tokens2, word_dict, model.max_time_steps2,
                                 prepend=word_dict[utils.GO])

        feeds = {model.sentence1: vector1,
                 model.sentence2: vector2,
                 model.sentence1_size: [len(tokens1)],
                 model.sentence2_size: [len(tokens2)+1],
                 model.dropout_keep: 1.0}

        answer = sess.run(model.answer, feed_dict=feeds)
        print('Model answer:', number_to_label[answer[0]])

        print()

Пример #48

0

Показать файл

Файл: words_count.py Проект: mpipet/words_count

#!/usr/bin/python

from utils import tokenize, stdin

# Count every word read from standard input; lines are split on space, tab
# and hyphen.
words_count = {}
for line in stdin():
    for word in tokenize(line, [' ', '\t', '-']):
        if word in words_count:
            words_count[word] += 1
        else:
            words_count[word] = 1

# Most frequent first; the sort is stable, so ties keep first-seen order.
sorted_words_count = sorted(words_count.items(),
                            key=lambda pair: pair[1],
                            reverse=True)

for word, occurrences in sorted_words_count:
    print("%i %s" % (occurrences, word))

Пример #49

0

Показать файл

from model import NerModel
import tensorflow_addons as tf_ad
import os
import numpy as np
from args_help import args
from my_log import logger

# Build the vocabulary/tag files only when at least one of them is missing;
# otherwise reuse what is already on disk.
if not (os.path.exists(args.vocab_file) and os.path.exists(args.tag_file)):
    logger.info("building vocab file")
    # presumably writes args.vocab_file and args.tag_file from the training
    # data -- confirm in build_vocab
    build_vocab([args.train_path], args.vocab_file, args.tag_file)
else:
    logger.info("vocab file exits!!")

# read_vocab returns a pair of mappings for each file (token/tag -> id and
# id -> token/tag, judging by the names).
vocab2id, id2vocab = read_vocab(args.vocab_file)
tag2id, id2tag = read_vocab(args.tag_file)
# Convert the training file into parallel text and label sequences.
text_sequences, label_sequences = tokenize(args.train_path, vocab2id, tag2id)

# Pair each text sequence with its label sequence, shuffle with a buffer as
# large as the dataset, and batch; drop_remainder=True discards a final
# batch that is smaller than args.batch_size.
train_dataset = tf.data.Dataset.from_tensor_slices(
    (text_sequences, label_sequences))
train_dataset = train_dataset.shuffle(len(text_sequences)).batch(
    args.batch_size, drop_remainder=True)

logger.info("hidden_num:{}, vocab_size:{}, label_size:{}".format(
    args.hidden_num, len(vocab2id), len(tag2id)))
# Model dimensions come from the vocab/tag sizes and the CLI arguments.
model = NerModel(hidden_num=args.hidden_num,
                 vocab_size=len(vocab2id),
                 label_size=len(tag2id),
                 embedding_size=args.embedding_size)
optimizer = tf.keras.optimizers.Adam(args.lr)

# Track both the optimizer and the model in one checkpoint object.
ckpt = tf.train.Checkpoint(optimizer=optimizer, model=model)

Пример #50

0

Показать файл

    # for tweet in tokenized_tweets:
    #     tweets.append(tweet['clean'])
    #     labels.append(tweet['class'])

    # train = pd.read_csv("../Data/imdb/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
    train = pd.read_csv("../Data/imdb/train.tsv",
                        header=0,
                        delimiter="\t",
                        quoting=3)
    # test = pd.read_csv("../Data/imdb/testData.tsv", header=0, delimiter="\t", quoting=3)

    tokenized_train = []

    for idx, text in train.iterrows():
        # tokenized_train.append(ut.tokenize(text['review'], text['sentiment'])) # for labeledTrainData.tsv
        tokenized_train.append(ut.tokenize(text['Phrase'],
                                           text['Sentiment']))  # for train.tsv

    tweets = []
    labels = []
    for tweet in tokenized_train:
        tweets.append(tweet['clean'])
        labels.append(tweet['class'])

    partition = 5
    train_tweets, test_tweets, train_labels, test_labels = ut.crossValidation2(
        tweets, labels, partition)

    # kf = cv.KFold(n=len(tweets), n_folds=3, shuffle=True, indices=False)

    accuracyLR, precisionLR, recallLR, f_measureLR = [], [], [], []
    accuracyRF, precisionRF, recallRF, f_measureRF = [], [], [], []

Пример #51

0

Показать файл

from torch.utils.data import Dataset, DataLoader
from model import NeuralNet

# Load the intent definitions used to build the training vocabulary.
with open('intents.json', 'r') as f:
    intents = json.load(f)

all_words = []  # every token of every pattern, as one flat list
tags = []       # one tag per intent (deduplicated further down)
xy = []         # (tokenised pattern, tag) pairs

# NOTE(review): the top-level key is read as 'intent' here -- confirm that it
# matches the key actually used in intents.json.
for intent in intents['intent']:
    # Read the tag from the intent being processed. The previous code used
    # `intents['tag']` (the top-level dict), which cannot vary per intent.
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        w = tokenize(pattern)
        # extend rather than append: all_words must stay a flat list of
        # tokens, not a list of token lists
        all_words.extend(w)
        xy.append((w, tag))

# Drop punctuation tokens, then stem what is left.
ignore_words = ['?', '!', '[', ']', '.', ',']
all_words = [stem(w) for w in all_words if w not in ignore_words]

# Deduplicate and sort so that word/tag positions are deterministic.
all_words = sorted(set(all_words))
tags = sorted(set(tags))

X_train = []
y_train = []

for (sen, tag) in xy:
    bag = bow(sen,all_words)

Пример #52

0

Показать файл

Файл: preprocess_questions.py Проект: GeraldHan/TRN

def main(args):
    """Encode questions and their noun-chunk spans, then pickle the result.

    Steps:
      1. Load the questions from ``args.input_questions_json``.
      2. Build a vocabulary from the questions and/or load one from
         ``args.input_vocab_json``; with ``args.expand_vocab == 1`` the
         loaded vocabulary is extended with new question words.
      3. Optionally write the vocabulary to ``args.output_vocab_json``.
      4. Encode every question, collect up to ``max_entity_length`` noun-chunk
         start/end positions per question, and pad everything to fixed sizes.
      5. Pickle the resulting dictionary to ``args.output_pt_file``.

    Args:
        args: Namespace providing ``input_questions_json``,
            ``input_vocab_json``, ``expand_vocab``, ``unk_threshold``,
            ``output_vocab_json``, ``encode_unk`` and ``output_pt_file``.
    """

    nlp = spacy.load('en')
    print('Loading data')
    with open(args.input_questions_json, 'r') as f:
        questions = json.load(f)['questions']

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        # Start from an empty answer vocab so that unlabeled data (first
        # question has no 'answer') no longer hits an unbound name when
        # `vocab` is assembled below.
        answer_token_to_idx = {}
        if 'answer' in questions[0]:
            answer_token_to_idx = build_vocab((q['answer'] for q in questions))
        question_token_to_idx = build_vocab((q['question'] for q in questions),
                                            min_token_count=args.unk_threshold,
                                            punct_to_keep=[';', ','],
                                            punct_to_remove=['?', '.'],
                                            add_special=True)
        # Program tokens are not encoded by this script, so the vocab holds
        # only question and answer tokens.
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,  # no special tokens
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            # Keep the freshly built vocab; it is merged into the loaded one.
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    # New words are appended after the existing indices.
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f, indent=4)

    # Encode all questions and entities
    print('Encoding data')
    questions_encoded = []
    orig_idxs = []
    image_idxs = []
    answers = []
    questions_len = []
    questions_mask = []
    noun_chunk_starts = []
    noun_chunk_ends = []
    entity_masks = []
    max_entity_length = 5

    for orig_idx, q in enumerate(questions):
        # Strip '?' and '.', and put a space before ';' and ',' before the
        # text is parsed and tokenised.
        question = q['question'].replace('?', '').replace('.', '').replace(
            ';', ' ;').replace(',', ' ,')

        doc = nlp(question)
        start, end = find_noun_chunks(doc)
        # Keep at most max_entity_length noun chunks per question.
        noun_chunk_starts.append(start[:max_entity_length])
        noun_chunk_ends.append(end[:max_entity_length])

        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        question_tokens = tokenize(question)

        question_encoded = encode(question_tokens,
                                  vocab['question_token_to_idx'],
                                  allow_unk=args.encode_unk == 1)

        questions_encoded.append(question_encoded)
        questions_len.append(len(question_encoded))

        if 'answer' in q:
            answers.append(vocab['answer_token_to_idx'][q['answer']])
        else:
            # -1 marks a question without a ground-truth answer.
            answers.append(-1)

    # Pad encoded questions and entities
    max_question_length = max(len(x) for x in questions_encoded)

    for st, ed, qe in zip(noun_chunk_starts, noun_chunk_ends,
                          questions_encoded):
        # The mask is computed before padding: 1 for real chunks, 0 for pads.
        entity_masks.append(
            (np.arange(max_entity_length) < len(st)).astype(int))
        if len(st) < max_entity_length:
            # Missing starts are filled with len(qe) - 1; qe is still
            # unpadded here, so this is the real question length minus one.
            padding = [len(qe) - 1] * (max_entity_length - len(st))
            st += padding

        if len(ed) < max_entity_length:
            # Missing ends are filled with the unpadded question length.
            padding = [len(qe)] * (max_entity_length - len(ed))
            ed += padding

        questions_mask.append(
            (np.arange(max_question_length) < len(qe)).astype(int))
        if len(qe) < max_question_length:
            # Right-pad the encoded question in place with the <NULL> index.
            padding = [vocab['question_token_to_idx']['<NULL>']
                       ] * (max_question_length - len(qe))
            qe += padding
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    questions_len = np.asarray(questions_len, dtype=np.int32)
    print(questions_encoded.shape)

    entity_starts = np.asarray(noun_chunk_starts, dtype=np.int32)
    entity_ends = np.asarray(noun_chunk_ends, dtype=np.int32)
    print(entity_starts.shape)

    print('Writing')
    obj = {
        'questions': questions_encoded,
        'image_idxs': np.asarray(image_idxs),
        'orig_idxs': np.asarray(orig_idxs),
        'answers': answers,
        'questions_len': questions_len,
        'questions_mask': questions_mask,
        'e_starts': entity_starts,
        'e_ends': entity_ends,
        'e_masks': entity_masks
    }
    with open(args.output_pt_file, 'wb') as f:
        pickle.dump(obj, f)

Пример #53

0

Показать файл

# Pull the vocabulary, the tag list and the trained weights out of `data`
# (loaded earlier in the file, outside this excerpt).
all_words = data["all_words"]
tags = data["tags"]
model_state = data["model_state"]

# Rebuild the network, load the saved weights and switch to evaluation mode.
model = NeuralNet(input_size, hidden_size, output_size).to(device)
model.load_state_dict(model_state)
model.eval()

bot_name = "Bryant's Coffee shop"
print('Type quit to exit')

# Interactive loop: read a sentence, classify it, answer when confident.
while True:
    sentence = input("You: ")
    if sentence == "quit":
        break
    sentence = tokenize(sentence)
    X = bag_of_words(sentence, all_words)
    # Reshape to 2-D with X.shape[0] columns; assumes bag_of_words returns a
    # 1-D numpy array, which makes this a single-row batch -- TODO confirm
    X = X.reshape(-1, X.shape[0])
    X = torch.from_numpy(X)

    output = model(X)
    # print(output)
    # Index of the highest-scoring class along dim 1.
    _, predicted = torch.max(output, dim=1)
    tag = tags[predicted.item()]
    # Softmax over dim 1 turns the scores into probabilities; take the
    # probability of the predicted class for the first row.
    probs = torch.softmax(output, dim=1)
    prob = probs[0][predicted.item()]

    # Only answer when the predicted class has probability above 0.75.
    if prob.item() > 0.75:
        for intent in intents["intents"]:
            if tag == intent["tag"]:
                # Reply with a random response registered for this intent.
                print(f"{bot_name}: {random.choice(intent['responses'])}")

Пример #54

0

Показать файл

Файл: predict.py Проект: lixuanhng/NLP_related_projects

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import tensorflow as tf
from model import NerModel
from utils import tokenize, read_vocab, format_result, build_embedding_matrix
import tensorflow_addons as tf_ad
from args_help import args
import json
import numpy as np

# For the test set: load the vocabulary dictionary and the tag dictionary,
# build the text sequences and initialise the word embeddings.
vocab2id, id2vocab = read_vocab(args.vocab_file)
tag2id, id2tag = read_vocab(args.tag_file)
print(id2tag)
text_sequences, label_sequences, text_origin, label_origin = tokenize(
    args.test_path, vocab2id, tag2id)
# text_sequences has shape (159, 110)
embedded_matrix = build_embedding_matrix(args.pretrain_embedding_vec, vocab2id)

# print('Inspect the value and shape of text_sequences:')
# print(text_sequences.shape)
# print(type(text_sequences))

# Load the model
optimizer = tf.keras.optimizers.Adam(args.lr)
model = NerModel(hidden_num=args.hidden_num,
                 vocab_size=len(vocab2id),
                 label_size=len(tag2id),
                 embedding_size=args.embedding_size,
                 embedding_matrix=embedded_matrix)
# restore model

Пример #55

0

Показать файл

def main(args):
    """Encode questions, programs and answers, then pickle the result.

    Steps:
      1. Load the questions from ``args.input_questions_json``.
      2. Build a vocabulary (question, program and answer tokens) and/or load
         one from ``args.input_vocab_json``; with ``args.expand_vocab == 1``
         the loaded vocabulary is extended with new question words.
      3. Optionally write the vocabulary to ``args.output_vocab_json``.
      4. Encode every question, its program and the program's value inputs,
         and pad each group to a common length with the ``<NULL>`` index.
      5. Pickle the resulting dictionary to ``args.output_pt_file``.

    Args:
        args: Namespace providing ``input_questions_json``,
            ``input_vocab_json``, ``expand_vocab``, ``unk_threshold``,
            ``mode``, ``output_vocab_json``, ``encode_unk`` and
            ``output_pt_file``.
    """
    print('Loading data')
    with open(args.input_questions_json, 'r') as f:
        questions = json.load(f)['questions']

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        # Start from an empty answer vocab so that unlabeled data (first
        # question has no 'answer') no longer hits an unbound name when
        # `vocab` is assembled below.
        answer_token_to_idx = {}
        if 'answer' in questions[0]:
            answer_token_to_idx = build_vocab((q['answer'] for q in questions))
        question_token_to_idx = build_vocab((q['question'] for q in questions),
                                            min_token_count=args.unk_threshold,
                                            punct_to_keep=[';', ','],
                                            punct_to_remove=['?', '.'],
                                            add_special=True)
        # Collect the program string of every question that has a program.
        all_program_strs = []
        for q in questions:
            if 'program' not in q:
                continue
            program_str = program_to_strs(q['program'], args.mode)[0]
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs, add_special=True)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,  # no special tokens
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            # Keep the freshly built vocab; it is merged into the loaded one.
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    # New words are appended after the existing indices.
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f, indent=4)

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    programs_encoded = []
    # value_inputs, encoded by question_token_to_idx in CLEVR
    # because all valid inputs are in question vocab
    program_inputs_encoded = []
    orig_idxs = []
    image_idxs = []
    answers = []
    for orig_idx, q in enumerate(questions):
        question = q['question']

        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        question_tokens = tokenize(question,
                                   punct_to_keep=[';', ','],
                                   punct_to_remove=['?', '.'])
        question_encoded = encode(question_tokens,
                                  vocab['question_token_to_idx'],
                                  allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)

        if 'program' in q:
            program = q['program']
            program_str, input_str = program_to_strs(program, args.mode)
            program_tokens = tokenize(program_str)
            program_encoded = encode(program_tokens,
                                     vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)
            # program value_inputs
            input_tokens = tokenize(input_str)
            input_encoded = encode(input_tokens,
                                   vocab['question_token_to_idx'])
            assert len(input_encoded) == len(
                program_encoded)  # input should have the same len with func
            program_inputs_encoded.append(input_encoded)
        else:
            # [-1] is the placeholder for a question without a program.
            programs_encoded.append([-1])
            program_inputs_encoded.append([-1])

        if 'answer' in q:
            answers.append(vocab['answer_token_to_idx'][q['answer']])
        else:
            # -1 marks a question without a ground-truth answer.
            answers.append(-1)

    # Pad encoded questions and programs
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])

    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            while len(pe) < max_program_length:
                pe.append(vocab['program_token_to_idx']['<NULL>'])
        # Value inputs are padded to the program length, using the question
        # vocab's <NULL> index because they were encoded with that vocab.
        for ie in program_inputs_encoded:
            while len(ie) < max_program_length:
                ie.append(vocab['question_token_to_idx']['<NULL>'])

    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    program_inputs_encoded = np.asarray(program_inputs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(programs_encoded.shape)
    print(program_inputs_encoded.shape)
    print('Writing')
    obj = {
        'questions': questions_encoded,
        'image_idxs': np.asarray(image_idxs),
        'orig_idxs': np.asarray(orig_idxs),
        'programs': programs_encoded,
        'program_inputs': program_inputs_encoded,
        'answers': answers,
    }
    with open(args.output_pt_file, 'wb') as f:
        pickle.dump(obj, f)

Пример #56

0

Показать файл

                        default='dictionary.pkl',
                        type=str,
                        help='path to the dictionary')

    args = parser.parse_args()

    # Turns on logging.
    import logging
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)

    dictionary, rev_dict = utils.get_dictionary(args.text, args.dictionary)
    num_classes = len(dictionary)

    iterator = utils.tokenize(args.text,
                              dictionary,
                              batch_size=args.batch_size,
                              seq_len=args.seq_len)

    sess = tf.Session()
    model = SeqGAN(sess,
                   num_classes,
                   logdir=args.logdir,
                   learn_phase=args.learn_phase,
                   only_cpu=args.only_cpu)
    model.build()
    model.load(ignore_missing=True)

    for epoch in range(1, args.num_epochs + 1):
        for step in range(1, args.num_steps + 1):
            logging.info('epoch %d, step %d', epoch, step)
            model.train_batch(next(iterator))

Пример #57

0

Показать файл

def add_code_into_document(document, body):
    """Add the code features extracted from ``body`` as fields of ``document``.

    ``transform_body(body)`` yields a list of AST dictionaries and a list of
    code hints. Non-empty entries of each AST list become fields named after
    their key, comments (when present) are added unescaped and unstored, and
    the distinct hint tokens of length 2..19 become ``code_hints`` fields.

    Returns:
        True when at least one typed method call, method, called method or
        class-instance creation was added; False otherwise.
    """
    asts, code_hints = transform_body(body)

    # Fields added before the comments, in order, paired with whether a
    # non-empty value marks the document as containing code.
    leading_fields = (
        ("typed_method_call", True),
        ("extends", False),
        ("used_classes", False),
        ("methods", True),
        ("methods_called", True),
    )

    has_code = False
    for tree in asts:
        for field_name, counts_as_code in leading_fields:
            for value in tree[field_name]:
                if not value:
                    continue
                document.add(
                    Field(field_name, value, Field.Store.YES,
                          Field.Index.ANALYZED))
                if counts_as_code:
                    has_code = True

        # Comments are optional; every entry is added (even an empty one),
        # HTML-unescaped and not stored.
        if "comments" in tree:
            for comment in tree["comments"]:
                document.add(
                    Field("comments", utils.unescape_html(comment),
                          Field.Store.NO, Field.Index.ANALYZED))

        for creation in tree["class_instance_creation"]:
            if creation:
                document.add(
                    Field("class_instance_creation", creation,
                          Field.Store.YES, Field.Index.ANALYZED))
                has_code = True

        # Literals use a StringField rather than an analyzed Field.
        for literal in tree["literals"]:
            if literal:
                document.add(StringField("literals", literal, Field.Store.YES))

    # Keep only hint tokens whose length is strictly between 1 and 20.
    kept_tokens = [
        token for hint_text in code_hints
        for token in utils.tokenize(hint_text) if 1 < len(token) < 20
    ]

    # Each distinct token is indexed once.
    for hint in set(kept_tokens):
        document.add(
            Field("code_hints", hint, Field.Store.YES, Field.Index.ANALYZED))

    return has_code

Пример #58

0

Показать файл

Файл: evaluate.py Проект: shivam13juna/seq2seq_spelling_correction

error_rate = 0.6
reverse = True
model_path = './models/seq2seq.h5'
hidden_size = 512
sample_mode = 'argmax'
data_path = './data'
books = [
    'nietzsche.txt', 'pride_and_prejudice.txt', 'shakespeare.txt',
    'war_and_peace.txt'
]

test_sentence = 'The rabbit-hole went straight on like a tunnel for some way, and then dipped suddenly down, so suddenly that Alice had not a moment to think about stopping herself before she found herself falling down a very deep well.'

# Script entry point: build the vocabulary from the book corpus, then prepare
# the test sentence with the same maximum token length.
if __name__ == '__main__':
    text = read_text(data_path, books)
    vocab = tokenize(text)
    # set() removes duplicate tokens; filter(None, ...) drops falsy ones
    # (e.g. empty strings).
    vocab = list(filter(None, set(vocab)))
    # `maxlen` is the length of the longest word in the vocabulary
    # plus two SOS and EOS characters.
    maxlen = max([len(token) for token in vocab]) + 2
    # shuffle=False keeps the vocabulary order; error_rate is the module-level
    # setting (presumably the rate of injected spelling errors -- confirm in
    # transform).
    train_encoder, train_decoder, train_target = transform(
        vocab, maxlen, error_rate=error_rate, shuffle=False)

    # Tokenise the test sentence, dropping falsy tokens but keeping duplicates
    # and order.
    tokens = tokenize(test_sentence)
    tokens = list(filter(None, tokens))
    nb_tokens = len(tokens)
    # Same transform as above; the second returned value is unused here.
    misspelled_tokens, _, target_tokens = transform(tokens,
                                                    maxlen,
                                                    error_rate=error_rate,
                                                    shuffle=False)

Пример #59

0

Показать файл

Файл: vocab.py Проект: davidsvaughn/dts-tf

 def tokenize(line, lower=True, flat=False, clean=True):
     """Tokenize ``line`` with ``U.tokenize``, optionally cleaning it first.

     When ``clean`` is true the line is first passed through
     ``Vocab.clean_line``; ``lower`` and ``flat`` are forwarded unchanged to
     ``U.tokenize``, whose result is returned.
     """
     prepared = Vocab.clean_line(line) if clean else line
     return U.tokenize(prepared, lower=lower, flat=flat)

Пример #60

0

Показать файл

    # choose the first half of files based on a deterministic random range
    robj = random.Random(12345)
    robj.shuffle(files)

    if args.command == 'train':
        fileSubset = files[:len(files) / 2]
    elif args.command == 'test':
        fileSubset = files[len(files) / 2:]
    else:
        fileSubset = files[len(files) / 2:]

    if args.command == 'train' or args.command == 'test':
        for i, name in enumerate(fileSubset):
            if i % 1000 == 0:
                print '%d files done' % i
            filesAndTokens.append((name, utils.tokenize(name)))
        print len(filesAndTokens)
        print sum([len(tokens) for name, tokens in filesAndTokens])

#    model = PositionDependentVectorModel(keywords, winSize=args.win,
#                                         wdim=args.dim, stepsize=args.lr,
#                                         reg=args.reg)
#    model = ConstantAttentionVectorModel(keywords, winSize=args.win,
#                                         wdim=args.dim, stepsize=args.lr,
#                                         reg=args.reg)
#    model = NonLinearVectorModel(keywords, winSize=args.win,
#                                 wdim=args.dim, zdim=args.zdim,
#                                 stepsize=args.lr,
#                                 reg=args.reg)
#    model = RnnDense(keywords, winSize=args.win,
#                    wdim=args.dim, zdim=args.zdim,

Python tokenize примеры использования