def ngram_mro(self):
    graph = NNCommonManager().get_nn_node_name(self.nn_id)
    for net in graph:
        if net['fields']['graph_node'] == 'netconf_data':
            data_node = net['fields']['graph_node_name']
        elif net['fields']['graph_node'] == 'netconf_node':
            net_node = net['fields']['graph_node_name']

    netconf = NNCommonManager().get_nn_node_info(
        self.nn_id, str(self.ver), net_node)[0]['fields']['node_config_data']

    self.param['list'] = []
    self.param['standard'] = float(netconf['standard'])
    self.param['datatype'] = netconf['datatype']
    self.param['conninfo'] = netconf['conninfo']

    if self.param['datatype'] == 'file':
        self.get_file_data(data_node)
    elif self.param['datatype'] == 'db':
        self.get_db_data()

    item = []
    for val in self.param['list']:
        try:
            item_tuple = (val['item_code'].strip(),
                          val['item_leaf'].strip(),
                          val['item_desc'].strip())
            item.append(item_tuple)
        except Exception:
            logging.info('Error Data ' + val['item_code'])

    dataset = ngram.NGram(item, key=lambda x: x[2])
    dataset = sorted(dataset, key=lambda x: x[0])
    findset = ngram.NGram(item, key=lambda x: x[2])
    logging.info('=' * 96)

    return_data = {}
    for data in dataset:
        findset.remove(data)
        result = findset.search(data[2], self.param['standard'])
        for r in range(len(result)):
            if return_data.get(data[0]) is None:
                return_data[data[0]] = {}
                return_data[data[0]]['desc'] = data[2]
                # logging.info(str(data[0]) + ':' + str(data[2]))
            return_data[data[0]][result[r][0][0]] = {
                'item_desc': result[r][0][2],
                'item_perc': result[r][1]
            }
            # logging.info(' - ' + str(result[r][0][0]) + '(' + str(result[r][1]) + ')' + ':' + str(result[r][0][2]))
            logging.info(str(data[0]) + '-' + str(result[r][0][0]) +
                         '(' + str(result[r][1]) + ')')
    return return_data
def main():
    words = get_words('E:/DataSet/NLP/中文分词语料(山西大学提供)/' + '训练语料(528250词,Unicode格式).txt')
    seg_train = get_seg_sentences('E:/DataSet/NLP/中文分词语料(山西大学提供)/' + '训练语料(528250词,Unicode格式).txt')
    seg_test = get_sentences('E:/DataSet/NLP/中文分词语料(山西大学提供)/' + '测试语料(Unicode格式).txt')
    seg_test_answer = get_seg_sentences('E:/DataSet/NLP/中文分词语料(山西大学提供)/' + '测试语料答案(Unicode格式).txt')
    pos_train = get_pos_sentences('E:/DataSet/NLP/人民日报语料199801/' + '199801.txt', 'ansi')
    pos_test = []
    # Sentences in the POS-tagging corpus also serve as segmentation training data.
    for data in pos_train:
        pos_test.append([dat[0] for dat in data])
        seg_train.append([dat[0] for dat in data])
        for dat in data:
            words.add(dat[0])

    print('forward maximum match:')  # forward maximum-match segmentation
    print_ratio(seg_test_answer, mm.fmm(words, seg_test))
    print('\nbackward maximum match:')  # backward maximum-match segmentation
    print_ratio(seg_test_answer, mm.bmm(words, seg_test))
    print('\nshortest path segmentation:')  # shortest-path segmentation
    print_ratio(seg_test_answer, sp.divide(words, seg_test))

    print('\n2-gram:')  # bigram segmentation and POS tagging
    gram2 = ngram.NGram(seg_train, pos_train, n=2)
    print_ratio(seg_test_answer, gram2.seg(seg_test))
    print('pos correct ratio:\t',
          get_pos_correct_ratio(pos_train[:200], gram2.pos(pos_test[:300])))

    print('\n3-gram:')  # trigram segmentation and POS tagging
    gram3 = ngram.NGram(seg_train, pos_train, n=3)
    print_ratio(seg_test_answer, gram3.seg(seg_test))
    print('pos correct ratio:\t',
          get_pos_correct_ratio(pos_train[:200], gram3.pos(pos_test[:300])))

    print('\n4-gram:')  # 4-gram segmentation and POS tagging
    gram4 = ngram.NGram(seg_train, pos_train, n=4)
    print_ratio(seg_test_answer, gram4.seg(seg_test))
    print('pos correct ratio:\t',
          get_pos_correct_ratio(pos_train[:200], gram4.pos(pos_test[:300])))

    # A few test cases
    print('A few test cases:')
    print(gram3.seg(['大连港年吞吐量超七千万吨', '今天同事问了我一道面试题']))
    print(gram3.pos([['迈向', '充满', '希望', '的', '新', '世纪', '——', '一九九八年',
                      '新年', '讲话', '(', '附', '图片', '1', '张', ')'],
                     ['希望', '是', '什么', '东西']]))
def learn():
    Dict = []
    num = 1000
    # In the future, a language detector could be used to reduce the amount of data loaded in memory.
    Dict.append(ngram.NGram(readRealNames('english')))
    Dict.append(ngram.NGram(readForeignRealNames('spanish')))
    Dict.append(ngram.NGram(readSyntheticNames('english')))
    Dict.append(ngram.NGram(readForeignSyntheticNames('spanish')))
    # Dict.append(ngram.NGram(readForeignScrappedNames('english')))
    # Dict.append(ngram.NGram(readForeignScrappedNames('spanish')))
    return Dict
def getNGrams(self, domain):
    uni_index = ngram.NGram(N=1)
    bi_index = ngram.NGram(N=2)
    tri_index = ngram.NGram(N=3)
    quad_index = ngram.NGram(N=4)
    unigrams = list(uni_index.ngrams(domain))
    bigrams = list(bi_index.ngrams(domain))
    trigrams = list(tri_index.ngrams(domain))
    quadgrams = list(quad_index.ngrams(domain))
    return unigrams, bigrams, trigrams, quadgrams
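# Hedged usage sketch (not from the original source): the python-ngram package's
# NGram.ngrams() yields overlapping length-N slices of its input, so a string
# produces character n-grams. The expected value is my reading of that API,
# not output captured from the original project.
import ngram

bi_index = ngram.NGram(N=2)
print(list(bi_index.ngrams("example")))  # expected: ['ex', 'xa', 'am', 'mp', 'pl', 'le']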
def __init__(self, dictionary, N, match_threshold=1.0):
    """
    match_threshold ranges from 0.0 to 1.0; a higher value requires a closer
    n-gram distance for a word to be considered a possible match.
    """
    GenericAlgorithm.__init__(self, dictionary)
    # self.nGram = ngram.NGram([], key=lambda x: x.lower(), N=N)
    self.n = N
    self.nGramDict = {}
    self.threshold = match_threshold
    self.name = "NGram Distance"
    # Save words in a dictionary of NGram indexes keyed by word length.
    for word in dictionary:
        key = str(len(word))
        if key in self.nGramDict:
            self.nGramDict[key].add(word)
        else:
            self.nGramDict[key] = ngram.NGram(key=lambda x: x.lower(), N=self.n)
            self.nGramDict[key].add(word)
    self.match_threshold = match_threshold
    tFinish = int(time.time() * 1000)
    self.stats['runtime'] += tFinish - self.timeStart
def bleu(answer, references, theta=0.0001):
    references_grams = []
    result = []
    for reff in references:
        G = ngram.NGram(tokenizer(reff))
        _, expp = M_bleu(answer, [G])
        result.append(expp)
    maxScore = max(result)
    maxIndex = np.argmax(result)
    refMax = []
    for index, refs in enumerate(result):
        if abs(refs - maxScore) < theta:
            refLength = len(tokenizer(references[index]))
            candLength = len(tokenizer(answer))
            refMax.append((refs, index, brevity_penalty(candLength, refLength)))
    # Find the selected reference answer (SRA): among the tied references,
    # pick the one with the highest brevity penalty.
    SRA_m_bleu = 0
    SRA_bp = 0
    SRA_index = 0
    for (m_bleu, index, bp) in refMax:
        if bp > SRA_bp:
            SRA_bp = bp
            SRA_index = index
            SRA_m_bleu = m_bleu
    # print((SRA_m_bleu, SRA_bp, SRA_index))
    # return maxScore, maxIndex, refMax
    return SRA_m_bleu, SRA_bp, SRA_index
def match_levenshtein(token):
    dictSet = getDict()
    candidates = []
    candidatesG = []
    bestMatch = ""
    minDistance = 3
    for item in dictSet:
        distance = Levenshtein.distance(token.lower(), item.lower())
        if distance == 0:
            return item, [], []
        elif distance < minDistance:
            minDistance = distance
            candidates = []
        if distance == minDistance:
            candidates.append(item.lower())
    if len(candidates) > 1:
        G = ngram.NGram(candidates)
        candidatesG = G.search(token)
        if len(candidatesG) > 0:
            bestMatch = candidatesG[0][0]
    elif len(candidates) == 1:
        bestMatch = candidates[0]
    return bestMatch, candidates, candidatesG
def generate_file(file_name, output_file_name):
    sheet_input = open_excel(file_name=file_name)
    end_pos = 10000
    faq_data = dict()
    standard_question_idx = 0
    standard_question_to_id = dict()
    for i in range(2, end_pos):
        if sheet_input.cell(i, 1).value is None and sheet_input.cell(i, 2).value is None:
            break
        standard_question = replace_str(sheet_input.cell(i, 4).value)
        if standard_question is None or sheet_input.cell(i, 5).value is None:
            continue
        if standard_question not in standard_question_to_id:
            standard_question_to_id[standard_question] = standard_question_idx
            standard_question_idx += 1
        if standard_question not in faq_data:
            faq_data[standard_question] = []
        sim_questions = sheet_input.cell(i, 5).value.split('\n')
        sim_questions.append(standard_question)
        faq_data[standard_question].extend(
            list(set(map(replace_str, filter(lambda x: x != "", sim_questions)))))
    print(len(faq_data))

    faq_keys = dict()
    for i, (k, vs) in enumerate(faq_data.items()):
        for v in vs:
            faq_keys[v] = standard_question_to_id[k]

    k = 30
    total_groups = len(faq_data)
    training_data = []
    for i, key in enumerate(list(faq_data.keys())):
        if i % 1 == 0:
            print("{}/{}".format(i, total_groups))
        label = standard_question_to_id[key]
        questions = faq_data[key]
        corpus_list = []
        for items in filter(lambda x: x[0] != key, faq_data.items()):
            corpus_list.extend(items[1])
        if questions[0] in corpus_list:
            print("impossible")
        G = ngram.NGram(corpus_list, N=2)
        cache = dict()
        for positive_question, anchor_question in combinations(questions, 2):
            if anchor_question in cache:
                negative_questions = cache[anchor_question]
            else:
                negative_questions = G.search(anchor_question, threshold=0.1)
                cache[anchor_question] = negative_questions
            ans_len = min(len(negative_questions), 30)
            for j in range(ans_len):
                negative_question = negative_questions[j][0]
                training_data.append(
                    "{} {} {}\t{}\t{}\t{}".format(label, label,
                                                  faq_keys[negative_question],
                                                  positive_question,
                                                  anchor_question,
                                                  negative_question))
    with open(output_file_name, "w", encoding="utf-8") as fw:
        for value in training_data:
            fw.write(value)
            fw.write("\r\n")
def query_search_test():
    print 'generate queries in train set...'
    sys.stdout.flush()
    train_queries = []
    with open(train_click_log, 'r') as f:
        for line in f:
            arr = line.strip().split('\t')
            query = arr[1].strip()
            train_queries.append(query)

    print 'generate queries in dev set...'
    sys.stdout.flush()
    dev_queries = []
    with open(dev_label, 'r') as f:
        for line in f:
            arr = line.strip().split('\t')
            query = arr[0].strip()
            dev_queries.append(query)

    print 'conduct a search...'
    sys.stdout.flush()
    G = ngram.NGram(train_queries)
    print G.search(dev_queries[0], threshold=0.3)
def partial_words_overlap(answer, thresh=0.7):
    import ngram
    # Inspiration -> SemSim: a multi-feature approach to semantic text
    # similarity (Adebayo, Guidi, Boella).
    # avg 24
    facit = "know amount vinegar container type sort cup material long time minute temperature experiment sample rinse distilled water size surface area drying method"
    len_answer = len(tokenizer(answer))
    answer = remove_shit(answer)
    answer = remove_stops(tokenizer(answer))
    answer = tokenizer(answer)
    # print("Clean answer : ", answer)
    facit = tokenizer(facit)
    len_answer_tokenized = len(answer)
    len_facit = len(facit)
    G = ngram.NGram(answer)
    candidate = 0
    cand = []
    matches = 0
    for word in facit:
        candidate = G.search(word, threshold=thresh)
        if len(candidate) > 0:
            cand.append(candidate)
            matches = matches + 1
    # return cand, matches, matches / (24 + len_facit), matches / (len_answer_tokenized + len_facit)
    return matches / (24 + len_facit), matches / (len_answer_tokenized + len_facit)
def __init__(self, robot_code, version, interpreter, _nlu_data_path=None):
    self.interpreter = interpreter
    self.version = version
    self.robot_code = robot_code
    if not _nlu_data_path:
        nlu_data_path = get_nlu_data_path(robot_code, version)
    else:
        nlu_data_path = _nlu_data_path
    with open(nlu_data_path, "r") as f:
        raw_training_data = json.load(f)
    regx = raw_training_data['regex_features']
    examples = raw_training_data["rasa_nlu_data"]["common_examples"]
    intent = {}
    for example in examples:
        if example["intent"] not in intent:
            intent[example["intent"]] = [example["text"]]
        else:
            intent[example["intent"]].append(example["text"])
    self.intent = intent
    # One NGram index per intent, built from that intent's example texts.
    self.intent_matcher = {intent_id: ngram.NGram(examples)
                           for intent_id, examples in self.intent.items()}
    self.regx = {key: [re.compile(item) for item in value]
                 for key, value in regx.items()}
    self.key_words = raw_training_data['key_words']
    self.intent_rules = raw_training_data['intent_rules']
    self.intent_id2name = raw_training_data.get("intent_id2name", {})
def corpus_based_sim(answer, floor=0.8):
    import ngram
    # print(read_corp())
    facit = "need know amount vinegar container type material time temperature experiment sample rinse distilled water size surface area drying method"
    answer = remove_shit(answer)
    answer = remove_stops(tokenizer(answer))
    answer = tokenizer(answer)
    G = ngram.NGram(answer)
    # print("Clean answer : ", answer)
    facit = tokenizer(facit)
    count = 0
    for word in facit:
        candidate = G.search(word, threshold=floor)
        if len(candidate) > 0:
            count = count + 1
        else:
            # Fall back to corpus synonyms when the word itself is not found.
            syn = read_corp(word)
            for s in syn:
                candidate = G.search(s, threshold=floor)
                if len(candidate) > 0:
                    count = count + 1
    # print("count = ", count)
    sim1 = count / max(len(facit), 24)
    sim2 = count / max(len(facit), len(answer))
    return sim1, sim2
def to_ngrams(n, string):
    '''Convert string to sequence of ngrams.'''
    # TODO: need a LOWERCASE preprocessing step? Maybe in the Sample class?
    index = ngram.NGram(pad_len=(n - 1), N=n)
    ngrams = tuple(index.split(string))
    return ngrams
def ngramed_list(lst: list, n: int = 3) -> list:
    """
    Convert a list into n-grams.

    :param lst: list to convert into n-grams
    :param n: N (default: N = 3)
    :return: list of n-grams
    """
    index = ngram.NGram(N=n)
    return [term for term in index.ngrams(lst)]
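# Hedged usage sketch (assumes ngramed_list above is in scope): slicing a list
# of tokens gives overlapping token windows rather than character n-grams. The
# expected output is my reading of the python-ngram API, not verified here.
print(ngramed_list(["a", "b", "c", "d"], n=3))  # expected: [['a', 'b', 'c'], ['b', 'c', 'd']]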
def number_of_bigrams(n, r=2):
    '''
    Returns the number of n-grams for a string of length n.

    :param n: length of the string
    :param r: n-gram size
    :return: number of n-grams of the padded string
    '''
    index = ngram.NGram(N=r)
    return len(list(index.ngrams(index.pad(str('a' * n)))))
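# Hedged sanity check (assumes number_of_bigrams above is in scope, and that
# python-ngram's default pad_len is N - 1): a string of length n padded with
# one '$' on each side should yield n + 1 bigrams.
print(number_of_bigrams(4))  # expected: 5, from '$aaaa$' -> '$a', 'aa', 'aa', 'aa', 'a$'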
def street_to_numbers_ngrams(self):
    """Returns a mapping from lower-case street name to an NGram index of that
    street's possible building/apartment numbers."""
    str_to_addr_ngram = defaultdict(lambda: NOT_FOUND_)
    street_groups = self.addr_book.groupby(by=STR_LC, sort=False, group_keys=True)
    for street_name, group in street_groups:
        street_addresses = group[BUILD_LC] + '$' + group[APT_LC]
        st_ngr = ngram.NGram(street_addresses.tolist(), N=3)
        str_to_addr_ngram[street_name] = st_ngr
    return str_to_addr_ngram
def break_ties(misspell, matches):
    # Keep only the candidates whose n-gram similarity to the misspelling ties
    # for the highest score (search results are sorted by decreasing similarity).
    G = ngram.NGram(matches)
    filtered = groupby(G.search(misspell), lambda x: x[1])
    filtered_matches = []
    for score, group in islice(filtered, 0, 1):
        for word in group:
            filtered_matches.append(word[0])
    return sorted(filtered_matches)
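# Hedged usage sketch (assumes break_ties above and its itertools/ngram imports
# are in scope): only the top-scoring tie group survives; the exact result
# depends on python-ngram's similarity scores.
print(break_ties("acress", ["actress", "across", "access", "caress"]))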
def run(slicer, slice, first, mhperm, rowsize, encrypt_flag, bf_size, bigrams_flag):
    mhs = []
    for i in range(0, rowsize):
        mh = MinHash(num_perm=mhperm)
        mhs.append(mh)
    sdata = StringIO(slicer.read(slice))
    reader = csv.reader(sdata, delimiter=',', quotechar='"',
                        quoting=csv.QUOTE_ALL, skipinitialspace=True)
    # reader = csv.reader(sdata, delimiter=',', quotechar='"', quoting=csv.QUOTE_NONNUMERIC, skipinitialspace=True)
    if first:
        next(reader, None)
    dics = []
    for i in range(0, rowsize):
        dics.append({})
    for row in reader:
        row_size = len(row)
        if rowsize == row_size:
            for column in range(0, row_size):
                mh = mhs[column]
                local_data = ''
                if encrypt_flag:
                    # TODO: fix the Bloom filters
                    if bigrams_flag:
                        bf = encrypt.encryptDataBigram(str(row[column]), bf_size)
                    else:
                        bf = encrypt.encryptDataWords(str(row[column]), bf_size)
                    local_data = str(bf)
                else:
                    # Revisit this, possible problem (not used).
                    import ngram
                    index = ngram.NGram(N=2)
                    bigrams_list = list(index.ngrams(index.pad(str(row[column]))))
                    local_data = str(row[column])
                    if bigrams_flag:
                        for bigram in bigrams_list:
                            mh.update(bigram.encode('utf8'))
                mh.update(local_data.encode('utf8'))
    # return data_stats
    return mhs
def ngramed(lis):
    n = 3
    index = ngram.NGram(N=n)
    ngram_return = []
    for term in index.ngrams(lis):
        ngram_return.append(term)
    return ngram_return
def __init__(self):
    self.PlurarShapes = open('Dictionaries\\PlurarShape.txt', 'r', encoding="utf8").read().split('\n')
    self.Shapes = open('Dictionaries\\ShapesTerms.txt', 'r', encoding="utf8").read().split('\n')
    self.geo = open('Dictionaries\\GeometryTerms.txt', 'r', encoding="utf8").read().split('\n')
    self.claim = open('Dictionaries\\ClaimTerms.txt', 'r', encoding="utf8").read().split('\n')
    self.conclusion = open('Dictionaries\\ConclusionTerms.txt', 'r', encoding="utf8").read().split('\n')
    self.structure = open('Dictionaries\\TaskStructure.txt', 'r', encoding="utf8").read().split('\n')
    self.software = open('Dictionaries\\SoftwareUsage.txt', 'r', encoding="utf8").read().split('\n')
    self.nonMath = open('Dictionaries\\NMDTerms.txt', 'r', encoding="utf8").read().split('\n')
    self.tech = open('Dictionaries\\TechTerms.txt', 'r', encoding="utf8").read().split('\n')
    self.negTech = open('Dictionaries\\NegativeTechTerms.txt', 'r', encoding="utf8").read().split('\n')
    self.shorts = open('Dictionaries\\ShortTerms.txt', 'r', encoding="utf8").read().split('\n')
    self.context = open('Dictionaries\\ContextTerms.txt', 'r', encoding="utf8").read().split('\n')
    self.stopWords = open('Dictionaries\\StopWords.txt', 'r', encoding="utf8").read().split('\n')
    self.tecSentences = open('simSentences\\tec.txt', 'r', encoding="utf8").read().split('\n')
    self.dsSentences = open('simSentences\\ds.txt', 'r', encoding="utf8").read().split('\n')
    self.gTec = ngram.NGram(self.tecSentences)
    self.gDs = ngram.NGram(self.dsSentences)
def generate_ngram_model(listoffilenames, N):
    ngram_model = ngram.NGram(N=N)
    for filename in listoffilenames:
        with open(filename, "r") as text_file:
            for domain_name in text_file:
                second_level_domain = domain_name.split(".")[0]
                ngram_model.add(second_level_domain)
    return ngram_model
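# Hedged usage sketch (not from the original source): the returned object is a
# plain python-ngram NGram set, so fuzzy lookups work via search(). The domains
# and threshold below are illustrative assumptions, not project data.
import ngram

model = ngram.NGram(["google", "facebook", "youtube"], N=3)
print(model.search("gooogle", threshold=0.2))  # list of (domain, similarity) pairs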
def motif(sequences, L_Left, L_UP):
    len_motifs_sup = {}
    for i in xrange(L_Left, L_UP + 1):
        motif = Counter()
        for x in sequences:
            G = ngram.NGram(N=i)
            motif.update(G.ngrams(x))
        len_motifs_sup[i] = motif
    # print len_motifs_sup
    return len_motifs_sup
def top_k_similarity(self, input_text, candidates, k=1):
    # print(list(zip(range(len(candidates)), candidates)))
    ngram_index = ngram.NGram(zip(range(len(candidates)), candidates),
                              N=2, key=lambda x: x[1])
    ret = ngram_index.search(input_text)
    ret_list = [(v[0], score) for v, score in ret[:k]]
    if len(ret_list) == 0:
        ret_list = [(0, 0)]
    return ret_list
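# Hedged standalone sketch of the same idea (not the original class): index
# (position, text) pairs by their text via key=, then search() returns
# ((position, text), similarity) pairs ranked by similarity. Candidates and
# query are illustrative assumptions.
import ngram

candidates = ["reset password", "change email", "close account"]
idx = ngram.NGram(zip(range(len(candidates)), candidates), N=2, key=lambda x: x[1])
print(idx.search("resett pasword")[:1])  # expected best hit: ((0, 'reset password'), <score>)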
def master_process(comm, misspellfilepath, correctfilepath, dictpath, size, start_time):
    # count how many words are to be corrected
    count = 0
    print 'start'
    n = 2
    # send the n-gram index of the dictionary to each slave_process
    with open(dictpath, 'r') as f:
        words = f.readlines()
    for i in range(len(words)):
        words[i] = words[i].strip()
    G = ngram.NGram(words, N=n)
    for i in range(size - 1):
        comm.send(G, dest=(i % (size - 1) + 1), tag=(i % (size - 1) + 1))
    # send misspelled words and correct words to slave_process
    with open(misspellfilepath) as mf:
        with open(correctfilepath) as cf:
            for mline in mf:
                mline = mline.strip()
                cline = cf.readline().strip()
                count = count + 1
                comm.send(mline, dest=(count % (size - 1) + 1), tag=(count % (size - 1) + 1))
                comm.send(cline, dest=(count % (size - 1) + 1), tag=(count % (size - 1) + 1))
    print 'all words sent'
    # ask slave_process to exit
    for i in range(size - 1):
        comm.send('exit', dest=(i % (size - 1) + 1), tag=(i % (size - 1) + 1))
        comm.send('exit', dest=(i % (size - 1) + 1), tag=(i % (size - 1) + 1))
    # used to receive data from slave_process
    correctcount = 0.0
    # if one single prediction is correct, count it
    accuracycount = 0.0
    # if the correct word is in the list of predictions, count it
    recallcount = 0.0
    # count the number of predictions
    predictcount = 0.0
    for i in range(size - 1):
        # receive results from slave_process
        correctcount = comm.recv(source=(i + 1), tag=(i + 1))
        accuracycount = accuracycount + correctcount[0]
        recallcount = recallcount + correctcount[1]
        predictcount = predictcount + correctcount[2]
    # print results
    print str(n) + '-gram on ' + str(size) + ' cores'
    print 'accuracy: ' + str(accuracycount / count)
    print 'precision: ' + str(recallcount / predictcount)
    print 'recall: ' + str(recallcount / count)
    print 'predict: ' + str(predictcount)
    end_time = time.time()
    print 'spent ' + str(end_time - start_time) + 's'
def main():
    dictFp = open(sys.argv[1], 'r')
    inputFp = open(sys.argv[2], 'r')
    customN = int(sys.argv[3])
    threshold = float(sys.argv[4])
    d = []
    # print("Dictionary Loading")
    for line in dictFp:
        d.append(line.strip())
    D = ngram.NGram(d, N=customN)
    # print("Dictionary Loaded")
    outputDict = defaultdict(list)
    counter = 0
    for line in inputFp:
        print("#{}\t{}".format(counter, line), file=sys.stderr, end="")
        try:
            uid, tid, tweet, d = line.split('\t')
        except ValueError:
            continue
        words = tweet.split()
        for word in words:
            if len(word) < 4:
                continue
            res = D.search(word, threshold=threshold)
            valid = True
            for found, prob in res:
                # if found != word:
                #     output.append(found)
                if found == word:
                    valid = False
                    break
            if len(res) and valid:  # and len(output)
                res = ""
                # for w in output:
                #     res += (w + " ")
                # res += ("\t" + tid)
                print("> Identified {} as typo".format(word), file=sys.stderr)
                outputDict[word].append(tid)
        counter += 1
    for (k, v) in outputDict.items():
        print("{}\t".format(k), end="")
        for tid in v:
            print(tid, end=" ")
        print("")
def ssdeep_to_int_ngram(ssdeep_hash):
    G = ngram.NGram(N=7, pad_len=0)
    ssdeep_7grams = []
    # An ssdeep hash has the form "blocksize:chunk:double_chunk"; take 7-grams
    # of the two chunk parts and map each to an integer.
    ssdeep_parts = ssdeep_hash.split(":")
    for seven_gram in G.split(ssdeep_parts[1]):
        ssdeep_7grams.append(seven_gram_to_int(seven_gram))
    for seven_gram in G.split(ssdeep_parts[2]):
        ssdeep_7grams.append(seven_gram_to_int(seven_gram))
    return ssdeep_7grams
def apply_ngram(misspell, dictionary):
    G = ngram.NGram(dictionary)
    count = 0
    result = []
    for mis_word in misspell:
        if mis_word not in dictionary:
            if '/' not in mis_word:
                # Search a list of approximate words using ngram search.
                pred_words = []
                search_result = G.search(mis_word, threshold=0.4)
                if search_result:
                    try:
                        search_result[0][1]
                    except IndexError:
                        search_result = (mis_word, 1)
                    else:
                        highest_score = search_result[0][1]
                        for (w, s) in search_result:
                            if math.isclose(s, highest_score):
                                pred_words.append(w)
                        if len(pred_words) == 1:
                            result.append(pred_words[0])
                        else:
                            result.append(pred_words)
                else:
                    result.append(G.find(mis_word))
            else:
                multi_words = mis_word.split('/')
                tmp = ''
                for w in multi_words:
                    if w:
                        approx_w = G.find(w)
                        tmp += (approx_w + '/')
                    elif len(w) == 3:  # for w like /i/
                        tmp = w
                    else:
                        continue
                tmp = tmp[:-1]
                result.append(tmp)
        else:
            result.append(mis_word)
        count += 1
        print("Processing: {} / {}".format(count, len(misspell)), end='\r')
    return result
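# Hedged usage sketch (assumes apply_ngram above and its math/ngram imports are
# in scope): corrects misspellings against a small, made-up dictionary. The
# exact corrections depend on python-ngram's similarity scores.
sample_dictionary = ["where", "were", "wear", "hello"]
print(apply_ngram(["wher", "helo", "where"], sample_dictionary))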
def match_double_metaphone(token):
    dictSet = getDict()
    candidates = []
    candidatesG = []
    class1 = []
    class2 = []
    class3 = []
    hasClass1 = False
    hasClass2 = False
    bestMatch = ""
    dmeta = fuzzy.DMetaphone()
    dm_token = dmeta(token)
    dm_token_pk = dm_token[0]
    dm_token_sk = dm_token[1]
    for match in dictSet:
        dm_match = dmeta(match)
        dm_match_pk = dm_match[0]
        dm_match_sk = dm_match[1]
        # Class 1: primary key matches primary key.
        if (dm_token_pk is not None) and (dm_token_pk == dm_match_pk):
            hasClass1 = True
            class1.append(match)
            continue
        # Class 2: primary key matches secondary key (either direction).
        if (not hasClass1) and ((dm_token_pk is not None and dm_token_pk == dm_match_sk) or
                                (dm_token_sk is not None and dm_token_sk == dm_match_pk)):
            hasClass2 = True
            class2.append(match)
            continue
        # Class 3: secondary key matches secondary key.
        if (not hasClass2) and (dm_token_sk is not None and dm_token_sk == dm_match_sk):
            class3.append(match)
    if hasClass1:
        candidates = class1
    elif hasClass2:
        candidates = class2
    else:
        candidates = class3
    if len(candidates) > 1:
        # Break ties between candidates with n-gram similarity to the token.
        G = ngram.NGram(candidates)
        candidatesG = G.search(token)
        if len(candidatesG) > 0:
            bestMatch = candidatesG[0][0]
    elif len(candidates) == 1:
        bestMatch = candidates[0]
    return bestMatch, candidates, candidatesG
def try_all(self, answer):
    compound = list()
    for f in self.facit:
        f = nltk.word_tokenize(f)
        G = ngram.NGram(f)
        fas = list()
        for a in answer:
            fas.extend(G.search(a))
        s = 0
        for word, weight in fas:
            s = s + weight
        compound.append(s)
    return compound
def myfunction(seq):
    if isinstance(seq, str):
        print("=== character bi-grams ===")
        index = ngram.NGram(N=2)
        for term in index.ngrams(index.pad(seq)):
            print(term)
    elif isinstance(seq, list):
        print("=== word bi-grams ===")
        for (word, count) in zip(seq, range(len(seq))):
            sec_word = ""
            if len(seq) >= count and count <= len(seq) - 2:
                sec_word = "-" + seq[count + 1]
            print(word + sec_word)