def find_all(self):
    cursor = self.find()
    software = cursor.distinct('software_name')
    os = cursor.distinct('os_name')
    company = cursor.distinct('company_name')
    cursor.close()
    return flatten(flatten(software, os), company)
def compare(matrix, possible_matrix_list):
    cost = []
    for possible_matrix in possible_matrix_list:
        matrix_cost = 0
        possible_matrix = flatten(possible_matrix)
        final_matrix = flatten(matrix)
        for i, j in zip(final_matrix, possible_matrix):
            if i == b:  # `b` is assumed to be a module-level blank/wildcard tile marker
                pass
            elif i != j:
                matrix_cost += 1
        cost.append(matrix_cost)
    return cost
def compare(matrix, possible_matrix):
    cost = []
    for possible in possible_matrix:
        mat_cost = 0
        possible = flatten(possible)
        matrix = flatten(matrix)
        for i, j in zip(matrix, possible):
            if i == b:  # `b` is assumed to be a module-level blank/wildcard tile marker
                pass
            elif i != j:
                mat_cost += 1
        cost.append(mat_cost)
    return cost
def __iter__(self):
    n = len(self.uniblocks)
    indreorder = torch.randperm(n).tolist()
    batch = []
    for idx in indreorder:
        indnew = list(range(idx * self.blocksize, (idx + 1) * self.blocksize))
        batch.append(indnew)
        if len(batch) == self.nblock:  # number of blocks in each minibatch
            yield flatten(batch)
            batch = []
    if len(batch) > 0:
        yield flatten(batch)
def extract_zip_code(texts):
    '''
    Concat the address without the zip code and the zip code returned by the
    zip_code_submethod method.

    Args:
        texts (string): addresses with and without zip codes, separated by a delimiter.

    Returns:
        concat_address (string): addresses concatenated with the delimiter
        concat_postal_code (list): flattened list of zip codes
    '''
    delimeter = '|'
    address_list, postal_code_list = list(), list()
    if str(texts) != 'nan' and texts != 'N/A' and texts is not None:
        if delimeter in texts:
            for text in texts.split(delimeter):
                address_list_val, postal_code_val = zip_code_submethod(
                    Constants.zip_code_regex, text)
                address_list.append(address_list_val)
                postal_code_list.append(postal_code_val)
        else:
            address_list_val, postal_code_val = zip_code_submethod(
                Constants.zip_code_regex, texts)
            address_list.append(address_list_val)
            postal_code_list.append(postal_code_val)
    concat_address = delimeter.join(address_list)
    concat_postal_code = flatten(postal_code_list)
    return concat_address, concat_postal_code
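# Illustrative, self-contained sketch of the split-then-extract pattern used by
# extract_zip_code above. The 5-digit regex and the _demo_extract_zip name are
# assumptions for this example only; the real pattern lives in
# Constants.zip_code_regex and the real extraction is done by zip_code_submethod.
import re

def _demo_extract_zip(texts, delimiter='|'):
    zip_regex = r'\b\d{5}\b'  # placeholder for Constants.zip_code_regex
    addresses, zips = [], []
    for text in texts.split(delimiter):
        zips.extend(re.findall(zip_regex, text))                # collect zip codes
        addresses.append(re.sub(zip_regex, '', text).strip())   # address without zip
    return delimiter.join(addresses), zips

# _demo_extract_zip('12 Main St 90210|5 Oak Ave 10001')
# -> ('12 Main St|5 Oak Ave', ['90210', '10001'])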
def process_natural_text(body):
    toks_ = [[
        l.strip() for l in re.findall(CODE_TOKENISATION_REGEX, line.strip())
        if len(l.strip()) > 0
    ] for line in body.split('\n')]
    toks_ = flatten(toks_)
    return [(w, t, 0) for w, t in pos_tag(toks_, tagset='universal')]
def getSortedCountMap(col, rx):
    backupColName = 'backup_' + col
    train[backupColName] = train[col].apply(
        lambda x: toJson(x, rx) if type(x) is str else [])
    sortedList = sorted(flatten(list(train[backupColName].values)))
    countMap = [(i, len(list(c))) for i, c in groupby(sortedList)]
    return sorted(countMap, key=lambda x: x[1], reverse=True)
def preprocess(self, text):
    check = re.search(r'([a-z])\1+', text)
    if check:
        if len(check.group()) > 2:
            # collapse elongated characters such as "hayyy", "ngonnnn", ...
            text = re.sub(r'([a-z])\1+', lambda m: m.group(1), text, flags=re.IGNORECASE)
    text = text.strip()  # strip leading whitespace
    text = text.lower()  # lowercase everything
    text = re.sub('< a class.+</a>', ' ', text)
    for k, v in self.replace_list.items():  # replace words listed in replace_list
        text = text.replace(k, v)
    text = re.sub(r'_', ' ', text)
    text = ' '.join(i for i in flatten(tokenize(text).split(" ")))  # join compound words
    for i in self.Pos_list:  # add a "positive" feature token
        if re.search(' ' + i + ' ', text):
            text = re.sub(i, i + ' positive ', text)
    for i in self.Neg_list:  # add a "negative" feature token
        if re.search(' ' + i + ' ', text):
            text = re.sub(i, i + ' negative ', text)
    return text
def get_traindata(input_files):
    if len(input_files) == 0:
        raise Exception('empty values')
    list_filedata = []
    list_filepaths = []
    for eachfile in input_files:
        for filename in eachfile:
            filepaths = glob.glob(str(filename))
            list_filepaths.append(filepaths)
    filepaths_list = nltk.flatten(list_filepaths)
    filepaths_list = filepaths_list[:200]
    print('Text files in Train data', len(filepaths_list))
    for filepath in filepaths_list:
        fileopen = open(filepath)
        file_data = fileopen.read()
        file_data = file_data.replace("<br />", " ")
        # file_data = re.sub(r'[^a-zA-Z0-9]', " ", file_data)
        list_filedata.append(file_data)
        fileopen.close()
    return list_filedata
def Update_Output(inputfiles, files_data, outputpath):
    filenames = []
    files = nltk.flatten(inputfiles)
    for i in range(len(files)):
        input_files = glob.glob(files[i])
        for j in range(len(input_files)):
            if '.txt' in input_files[j]:
                input_files[j] = input_files[j].replace(".txt", ".redacted.txt")
            if '.md' in input_files[j]:
                input_files[j] = input_files[j].replace(".md", ".redacted.txt")
            if '\\' in input_files[j]:
                input_files[j] = input_files[j].split("\\")
                input_files[j] = input_files[j][1]
            filenames.append(input_files[j])
    for i in range(len(files_data)):
        for j in range(len(filenames)):
            if i == j:
                file_data = files_data[i]
                path1 = os.getcwd()
                path2 = outputpath + '/' + filenames[j]
                final_file = open(os.path.join(path1, path2), "w", encoding="utf-8")
                final_file.write(file_data)
                final_file.close()
    return len(filenames)
def get_partial_template(formId, count, key, value):
    s = ''
    for i in value:
        if not i:
            pass
        elif len(i) == 1:  # single <p> paragraph
            count += 1
            s += f"\n\n<p>{get_message_with_affinity(formId, count)}</p>"
        elif isinstance(i[0], str) and i[0].endswith(':'):  # a <ul> list
            if len(i) == 2:
                result, count = get_list_template(i, formId, count, False)
                s += f"\n\n<p>{result}</p>"
            else:
                result, count = get_list_template(i[:2], formId, count, False)
                s += f"\n\n<p>{result}</p>"
                count, ts = get_partial_template(formId, count, key, [i[2:]])
                s += f"{ts}"
        elif i == flatten(i) and index_of_urls(i):  # paragraph <p> with <a> links
            count, ts = get_link_template(i, formId, count + 1, index_of_urls(i))
            if len(index_of_urls(i)) == 1:
                s += f"\n\n<p>@genericLink(params, \"{formId}\", {ts})</p>"
            else:
                s += f"\n\n{ts}"
        else:
            print(
                "ERROR : An unhandled variation was encountered when generating "
                "GUIDE PAGE TEMPLATE. This is most likely due to a missing html "
                "tag in guide page messages in dfs-frontend."
            )
    return count, s
def get_hypernyms(synset):
    hypernyms = set()
    for hypernym in synset.hypernyms():
        hypernyms |= set(get_hypernyms(hypernym))
    result_syns = hypernyms | set(synset.hypernyms())
    result = set(flatten([list(x.lemmas()) if isinstance(x, Synset) else x
                          for x in result_syns]))
    return result
def get_all_tokens(c):
    res = c.copy()
    children = nltk.flatten([token.children for token in c])
    if len(children) > 0:
        res += UDPipeKeywordsExtractor.get_all_tokens(children)
    return res
def get_antonyms(synset):
    antonyms = set()
    new_antonyms = set()
    for lemma in synset.lemmas():
        new_antonyms |= set(lemma.antonyms())
    antonyms |= new_antonyms
    for antonym in new_antonyms:
        antonyms |= set(flatten([list(x.lemmas())
                                 for x in antonym.synset().similar_tos()]))
    return antonyms
def get_all_tokens(x):
    res = x.copy()
    children = nltk.flatten([token.children for token in x])
    if len(children) > 0:
        res += UDPipeModel.get_all_tokens(children)
    return res
def input(files):
    Read_data = []
    x = []
    for j in files:
        for file in j:
            data = glob.glob(str(file))
            x.append(data)
    y = nltk.flatten(x)
    for i in y:
        Read_data.append(open(i).read())
    return Read_data
def evaluate(self, X):
    """
    Evaluate the classifier.
    :param X: data to test on
    :return: evaluation results
    """
    features, labels = separate_labels_from_features(X)
    # get predictions for data
    y = self.predict(features)
    n_sent_correct = 0
    num_sent = len(y)
    for i in range(len(labels)):
        if labels[i] == y[i]:
            n_sent_correct += 1
    labels = nltk.flatten(labels)
    y = nltk.flatten(y)
    print("F1 score:")
    print(sklearn.metrics.precision_recall_fscore_support(labels, y, average='micro'))
    print()
    print("Accuracy:")
    print(sklearn.metrics.accuracy_score(labels, y))
    print()
    print("Sentence level accuracy:")
    print(n_sent_correct / num_sent)
    print()
    print("F1 score per class:")
    print(sklearn.metrics.precision_recall_fscore_support(labels, y))
    print()
    print("Confusion matrix:")
    cfm = sklearn.metrics.confusion_matrix(labels, y)
    plot_confusion_matrix(cfm, np.unique(labels))
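# Hedged, self-contained sketch of the evaluation idea in evaluate() above:
# per-sentence label lists are compared whole for sentence-level accuracy, then
# flattened with nltk.flatten before token-level sklearn metrics are computed.
# The tiny hand-made gold/pred lists are illustrative only.
import nltk
import sklearn.metrics

gold = [['B', 'I', 'O'], ['O', 'B']]
pred = [['B', 'O', 'O'], ['O', 'B']]
sentence_accuracy = sum(g == p for g, p in zip(gold, pred)) / len(gold)  # 0.5
flat_gold, flat_pred = nltk.flatten(gold), nltk.flatten(pred)
token_accuracy = sklearn.metrics.accuracy_score(flat_gold, flat_pred)    # 0.8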
def tokenizeSentence(input):
    sentences = [
        re.sub(pattern=r'[\!"#$%&\*+,-./:;<=>?@^_`()|~=]', repl=' ', string=x).strip().split(' ')
        for x in input.split('\n') if not x.endswith('writes:')
    ]
    sentences = [x for x in sentences if x != ['']]
    flat_list = flatten(sentences)
    flat_list = [x for x in flat_list if x != '']
    stopwords_german = set(stopwords.words('german'))
    filtered_tokens = [w for w in flat_list if w not in stopwords_german]
    return filtered_tokens
def extract(self, keywords, threshold=3, sampler=Sampler.last.value):
    """
    Extracts the relevant phrases to use for comparison with an incoming query
    for computing veracity.

    keywords (list) - Query of keywords
    threshold (int) - Controls number of sentences to return (-1 for all sentences)
    sampler (Sampler) - Returns first-n, last-n, random-n or best-n number of sentences (heuristic)

    return (list) - List of phrases and sentences to use for veracity computation
    """
    sentences = TextBlob(self.body).sentences  # Extract sentences from article body
    relevant = {}
    for sentence in sentences:
        sentence = str(sentence).lower().strip()
        tokens = word_tokenize(sentence)
        tokens = [t.lower() for t in tokens if t not in punctuation]
        common = len([value for value in keywords if value in tokens])
        if common == 0:
            continue
        if common not in relevant.keys():
            relevant[common] = []
        relevant[common].append(' '.join(tokens))
    if threshold == -1:
        return flatten(list(relevant.values()))
    if sampler == Sampler.last.value:
        return flatten(list(relevant.values()))[-threshold:]
    elif sampler == Sampler.first.value:
        return flatten(list(relevant.values()))[:threshold]
    elif sampler == Sampler.random.value:
        pool = flatten(list(relevant.values()))
        shuffle(pool)
        return pool[-threshold:]
    elif sampler == Sampler.best.value:
        result = []
        for key in sorted(relevant.keys(), reverse=True):  # Sort by keyword hits
            result.extend(relevant[key])
            if len(result) >= threshold:
                break
        return result[:threshold]
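# Minimal sketch of the relevance bucketing used by extract() above: sentences
# are grouped by how many query keywords they contain, and a sampler then picks
# a subset. The keywords/sentences below are made up for illustration; the real
# method tokenises TextBlob sentences and strips punctuation first.
keywords = ['storm', 'damage']
sentences = ['the storm caused damage', 'sunny all day', 'storm warnings issued']
relevant = {}
for s in sentences:
    common = len([k for k in keywords if k in s.split()])
    if common:
        relevant.setdefault(common, []).append(s)
# relevant == {2: ['the storm caused damage'], 1: ['storm warnings issued']}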
def get_pertainyms(synset):
    pertainyms = set()
    new_pertainyms = set()
    for lemma in synset.lemmas():
        new_pertainyms |= set(lemma.pertainyms())
    pertainyms |= new_pertainyms
    for pertainym in new_pertainyms:
        pertainyms |= set(
            flatten([
                list(x.pertainyms())
                for x in pertainym.synset().similar_tos()
            ]))
    return pertainyms
def generate_guide_template(formId, userType, stats):
    folder = (templateUrl
              + '/app/uk/gov/hmrc/dfstemplaterenderer/templates/guidePageTemplates'
              + f"/{formId}")
    if not os.path.exists(folder):
        os.mkdir(folder)
        pInfo(f"Folder {formId} created")
    if not os.path.exists(folder + f"/{userType}"):
        os.mkdir(folder + f"/{userType}")
        pInfo(f"Folder {userType} created")
    if os.path.isfile(folder + f"/{userType}/{formId}.scala.html"):
        pWarn('Guide page template already exists')
    else:
        f = open(folder + f"/{userType}/{formId}.scala.html", 'w')
        f.writelines(get_copyright())
        f.writelines([
            "\n\n@import play.twirl.api.Html",
            "\n@import uk.gov.hmrc.dfstemplaterenderer.templates.guidePageTemplates.helpers.genericHelpers.html._",
            "\n@import uk.gov.hmrc.dfstemplaterenderer.utils._",
            "\n@import uk.gov.hmrc.dfstemplaterenderer.models.LinkTemplate",
            "\n\n@(params: Map[String, Any])",
            f"\n\n@baseGenericGuidePageHeader(params, \"{formId}\")"
        ])
        print(f"\n\nstats:{stats}")
        count = 0
        for key, value in stats.items():
            print(f"k = {key}, v = {value}")
            if key == 'list':
                f.write(f"\n\n@noLinkList(params, \"{formId}\", Seq(")
                for i in range(1, len(flatten(value))):
                    if i > 1:
                        f.write(", ")
                    count += 1
                    f.write(f"\"guide.{count:02d}\"")
                f.write("))")
            elif key == 'extraInfo' or key == 'beforeStart':
                if key == 'beforeStart':
                    f.write(f"\n\n@baseGenericGuidePageBody(params, \"{formId}\")")
                count, text = get_partial_template(formId, count, key, value[1:])
                f.write(text)
            else:
                pError(
                    "The detected set of messages are not of types: header, list, "
                    "extraInfo or beforeStart."
                )
        if userType == 'Individual':
            f.write(
                "\n\n<p>@MessagesUtils.getCommonMessages(\"page.guide.youCanTrack\", "
                "{params(\"langLocaleCode\")}.toString) <a href=\"@Links.ptaLink\">"
                "@MessagesUtils.getCommonMessages(\"abandon.pta.link.msg\", "
                "{params(\"langLocaleCode\")}.toString)</a> </p>"
            )
        f.close()
def get_derivationally_related_forms(synset):
    derivationally_related_forms = set()
    new_derivationally_related_forms = set()
    for lemma in synset.lemmas():
        new_derivationally_related_forms |= set(
            lemma.derivationally_related_forms())
    derivationally_related_forms |= new_derivationally_related_forms
    for derivationally_related_form in new_derivationally_related_forms:
        derivationally_related_forms |= set(
            flatten([
                list(x.derivationally_related_forms())
                for x in derivationally_related_form.synset().similar_tos()
            ]))
    return derivationally_related_forms
def create_multi_batch(titles, bodies, padding_id, pad_left, generated_questions):
    questions_count = [len(gq) + 1 for gq in generated_questions]
    titles = flatten(zip(titles, generated_questions))
    bodies = flatten([[b] * questions_count[i] for (i, b) in enumerate(bodies)])
    assert len(titles) == len(bodies)
    assert sum(questions_count) == len(titles)
    max_title_len = max(1, max(len(x) for x in titles))
    max_body_len = max(1, max(len(x) for x in bodies))
    if pad_left:
        titles = np.column_stack([
            np.pad(x, (max_title_len - len(x), 0), 'constant',
                   constant_values=padding_id) for x in titles
        ])
        bodies = np.column_stack([
            np.pad(x, (max_body_len - len(x), 0), 'constant',
                   constant_values=padding_id) for x in bodies
        ])
    else:
        titles = np.column_stack([
            np.pad(x, (0, max_title_len - len(x)), 'constant',
                   constant_values=padding_id) for x in titles
        ])
        bodies = np.column_stack([
            np.pad(x, (0, max_body_len - len(x)), 'constant',
                   constant_values=padding_id) for x in bodies
        ])
    return titles, bodies, questions_count
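# Self-contained sketch of the padding step in create_multi_batch above:
# variable-length id sequences are padded to the batch maximum with np.pad and
# stacked column-wise, so each column is one sequence. padding_id = 0 and the
# toy sequences are assumptions for the example.
import numpy as np

seqs = [[3, 7], [5], [2, 4, 6]]
max_len = max(len(x) for x in seqs)
batch = np.column_stack([
    np.pad(x, (0, max_len - len(x)), 'constant', constant_values=0)  # right-pad
    for x in seqs
])
# batch.shape == (3, 3); batch[:, 1] == [5, 0, 0]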
def wordnet_lookup_xnyms(index_to_tokens, fun):
    xnym_dict = OrderedDict()
    lemma_vocab = set(porter.stem(word) for word in index_to_tokens.values())
    for token in lemma_vocab:
        xnyms_syns = set()
        for syn in wordnet.synsets(token):
            xnyms_syns |= fun(syn)
        lemmas = set(flatten([list(x.lemmas()) if isinstance(x, Synset) else x
                              for x in xnyms_syns]))
        strings = [split_multi_word(x.name()) for x in lemmas]
        xnym_dict[(token,)] = strings
    return xnym_dict
def pair_spans(self, spans: List[Dict[str, Any]]) -> List[List[Dict[str, Any]]]:
    indices = sorted(
        list(
            set(
                flatten([
                    list(range(d['start'], d['end'])) for d in spans
                    if d['able']
                ]))))
    rs = ranges(indices)
    paired_spans = [[
        d for d in spans
        if d['start'] >= r_start and d['end'] <= r_end and d['able']
    ] for r_start, r_end in rs]
    return paired_spans
def output(files, data, name):
    allfiles = []
    for i in files:
        for file in i:
            allfiles.append(glob.glob(file))
    flattenf = nltk.flatten(allfiles)
    newfilepath = os.path.join(os.getcwd(), name)
    for j in range(len(flattenf)):
        getpath = os.path.splitext(flattenf[j])[0]
        getpath = os.path.basename(getpath) + '.redacted'
        if not os.path.exists(newfilepath):
            os.makedirs(newfilepath)
        with open(os.path.join(newfilepath, getpath), 'w') as temp:
            temp.write(data[j])
def main():
    papers_df = readData(["pdf_json", "pmc_json"], cfg["data_path"])
    papers_df["abstract_fullText"] = papers_df["abstract"] + papers_df["full_text"]
    papers_df.drop(columns=['title', 'abstract', 'full_text'], inplace=True)
    text_preprocessor = Preprocessor()
    papers_df['abstract_fullText_Cleaned'] = papers_df['abstract_fullText'].map(
        lambda text: text_preprocessor.clean_and_tokenize(text,
                                                          stop=False,
                                                          lowercase=True,
                                                          removeUrls=False,
                                                          remove_html=False,
                                                          lemmatize=False,
                                                          remove_numbers=False,
                                                          tokenize=False))
    terms = papers_df['abstract_fullText_Cleaned'].map(
        lambda text: text_preprocessor.clean_and_tokenize(text,
                                                          stop=True,
                                                          lowercase=False,
                                                          removeUrls=True,
                                                          remove_html=True,
                                                          lemmatize=False,
                                                          remove_numbers=True,
                                                          tokenize=True))
    arrayTerm = terms.ravel().tolist()
    flattenedTerms = flatten(arrayTerm)

    # Create the vocabulary
    vocabulary = set(flattenedTerms)
    vocabulary = list(vocabulary)

    # Initialise the TF-IDF model
    tfidf = TfidfVectorizer(vocabulary=vocabulary,
                            dtype=np.float32,
                            min_df=0,
                            max_df=0.8,
                            use_idf=True,
                            smooth_idf=True,
                            sublinear_tf=True)

    # Fit the TF-IDF model
    tfidf.fit(papers_df.abstract_fullText_Cleaned)
    dictIDF = dict(zip(tfidf.vocabulary_, tfidf.idf_))
    save_obj(dictIDF, "dictIDF", 'txt')
    print("Dictionary successfully created")
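# Minimal sketch of the IDF-dictionary step in main() above: fit a
# TfidfVectorizer on cleaned documents and pair each vocabulary term with its
# learned idf_ weight. The two toy documents are illustrative only;
# get_feature_names_out() (sklearn >= 1.0) keeps terms aligned with idf_ by
# feature index.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ['virus spread model', 'virus vaccine trial']
tfidf = TfidfVectorizer(dtype=np.float32, use_idf=True, smooth_idf=True)
tfidf.fit(docs)
idf_by_term = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))
# idf_by_term['virus'] == 1.0; rarer terms such as 'vaccine' get a higher weight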
def Reading_input(inputfiles):
    files_data = []
    files = nltk.flatten(inputfiles)
    for i in range(len(files)):
        input_files = glob.glob(files[i])
        for j in range(len(input_files)):
            data = open(input_files[j]).read()
            files_data.append(data)
    return files_data
def data_input(input_files):
    if len(input_files) == 0:
        raise Exception('empty values')
    list_filedata = []
    list_filepaths = []
    for eachfile in input_files:
        for filename in eachfile:
            filepaths = glob.glob(str(filename))
            list_filepaths.append(filepaths)
    filepaths_list = nltk.flatten(list_filepaths)
    for filepath in filepaths_list:
        fileopen = open(filepath)
        file_data = fileopen.read()
        list_filedata.append(file_data)
        fileopen.close()
    return list_filedata
def run(self, min_wd_cnt=5, stem=True, spelling_correct=True, folds=10):
    """ We don't want to remove stop words """
    sentences, tagged_sentences = self.load_tagged_sentences()
    processed_sentences = self.process_sentences(sentences, spelling_correct,
                                                 stem, min_wd_cnt)
    sentence_features = np.asarray(
        list(map(self.features_for_sentence, processed_sentences)))
    cross_validation_ixs = cross_validation(range(len(sentences)), folds)
    codes = sorted(set(flatten(tagged_sentences)))
    for code in codes:
        code_tags = self.tags_for_code(code, tagged_sentences)
        pass
    pass
def file_output(files, data, filename):
    list_files = []
    for i in files:
        for file in i:
            list_files.append(glob.glob(file))
    flattenfiles = nltk.flatten(list_files)
    newfilepath = os.path.join(os.getcwd(), filename)
    for j in range(len(flattenfiles)):
        getpath = os.path.splitext(flattenfiles[j])[0]
        getpath = os.path.basename(getpath) + '.redacted'
        if not os.path.exists(newfilepath):
            os.makedirs(newfilepath)
        with open(os.path.join(newfilepath, getpath), 'w') as outputfile:
            outputfile.write(data[j])
def chunks(file):
    f = open(file)
    raw = f.read()
    tokens = nltk.word_tokenize(raw)
    tagged_tokens = nltk.pos_tag(tokens)
    # The pattern for this grammar is repeated twice in order to only find
    # noun phrases with two or more words
    grammar = "NP: {<JJ.*>*<NN.*>+ <JJ.*>*<NN.*>+}"
    # Other possible grammars:
    # grammar = "NP: {<DT>?<JJ.*>*<NN.*>+}"
    # grammar = r"""
    #     NP: {<DT|PP\$>?<JJ>*<NN>}   # chunk determiner/possessive, adjectives and nouns
    #         {<NNP>+}                # chunk sequences of proper nouns
    # """
    # grammar = r"""
    #     NP: {<DT><NN.*><.*>*<NN>}
    #         }<VB.*>{
    # """
    cp = nltk.RegexpParser(grammar)
    chunks_tree = cp.parse(tagged_tokens)
    # The result from the chunk parser is a tree. Here I'm finding all the Noun Phrase
    # subtrees, flattening them into lists, and converting those lists to tuples.
    # This way we end up with the same data structure that we get from pos_tag().
    np_subtrees = list(chunks_tree.subtrees(filter=lambda x: x.label() == 'NP'))
    flatten_np_subtrees = [tuple(nltk.flatten(t)) for t in np_subtrees]
    result = []
    for item in flatten_np_subtrees:
        noun_phrase = ''
        for n in range(len(item)):
            if n % 2 == 0:  # even positions hold the words, odd positions the tags
                noun_phrase += item[n]
                noun_phrase += ' '
        result.append((noun_phrase.rstrip(), 'NP'))
    return result
def top_ten(self):
    from operator import itemgetter
    d = self._group(key='software_name')
    d = flatten(d, self._group('os_name'))
    proper = []
    extras = []
    s = set()
    for map in d:
        proper.append(dict(
            name=map['os_name'] if 'os_name' in map else map['software_name'],
            count=map['count']))
    for map in proper[:]:
        if isinstance(map['name'], list):
            extras.append(map)
            proper.remove(map)
        else:
            s.add(map['name'])
    for map in extras:
        self._aggregate(key='name', map=map, set=s, list=proper)
    return sorted(proper, key=itemgetter('count'), reverse=True)[:10]