import pandas as pd


# Method of a Wikipedia-parsing class: self.page / self.revision hold MediaWiki API
# responses; self.section_text, self.section_name and self.citation hold the parsed content.
def generate_table(self, txt):
    """Flatten the parsed page into one row per sentence and write it as a TSV file."""
    pageid = str(self.page["parse"]["pageid"])
    timestamp = self.revision["query"]["pages"][pageid]["revisions"][0]["timestamp"]
    table = []
    # self.section_text is nested as sections -> paragraphs -> sentences;
    # self.citation mirrors that structure with the citations per sentence
    for i, section in enumerate(self.section_text):
        for j, paragraph in enumerate(section):
            for k, statement in enumerate(paragraph):
                sentence = {}
                sentence["entity_id"] = self.page["parse"]["pageid"]
                sentence["revision_id"] = self.page["parse"]["revid"]
                sentence["timestamp"] = timestamp
                sentence["entity_title"] = self.page["parse"]["title"]
                sentence["section_id"] = i
                sentence["section"] = self.section_name[i]
                sentence["prg_idx"] = j
                sentence["sentence_idx"] = k
                sentence["statement"] = statement
                sentence["citations"] = self.citation[i][j][k]
                table.append(sentence)
    data = pd.DataFrame(table, columns=[
        "entity_id", "revision_id", "timestamp", "entity_title",
        "section_id", "section", "prg_idx", "sentence_idx",
        "statement", "citations"
    ])
    data.to_csv(txt, sep="\t", index=False, encoding="utf-8")
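# Hypothetical usage sketch (instance name and sample values invented for illustration,
# not taken from the original project). Assuming `parser` is an instance of the enclosing
# class with the attributes above already filled, the call writes one row per sentence:
#
#   parser.section_name = ["Introduction"]
#   parser.section_text = [[["First sentence.", "Second sentence."]]]   # sections -> paragraphs -> sentences
#   parser.citation     = [[[["[1]"], []]]]                             # citations aligned per sentence
#   parser.generate_table("sentences.tsv")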
import pandas
from nltk import corpus
from nltk.tag import PerceptronTagger
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


def main():
    global prd_entry, stopwords, lemmatizer, tagger
    tagger = PerceptronTagger()
    lemmatizer = WordNetLemmatizer()
    # read the raw data from data.csv
    data = pandas.read_csv('data.csv')
    # stopword list used to filter every text column
    stopwords = set(corpus.stopwords.words('english'))
    # clean each text column, then tokenize and drop stopwords
    # (the set difference also deduplicates tokens and discards word order)
    data['product_title'] = data['product_title'].map(lambda x: clean_entry(x))
    data['product_title'] = data['product_title'].map(
        lambda x: set(word_tokenize(x)) - stopwords).map(lambda l: ' '.join(l))
    data['product_description'] = data['product_description'].map(
        lambda x: clean_entry(x))
    data['product_description'] = data['product_description'].map(
        lambda x: set(word_tokenize(x)) - stopwords).map(lambda l: ' '.join(l))
    data['search_term'] = data['search_term'].map(lambda x: clean_entry(x))
    data['search_term'] = data['search_term'].map(
        lambda x: set(word_tokenize(x)) - stopwords).map(lambda l: ' '.join(l))
    data = data.dropna()
    # write the cleaned data to a new csv file
    data.to_csv('clean_data.csv')
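# `clean_entry` is referenced above but not defined in this snippet. A minimal sketch of
# what such a helper could look like (an assumption, not the original implementation):
# lower-case the text and strip everything except letters, digits and whitespace.
import re


def clean_entry(text):
    # guard against NaN / non-string cells coming out of read_csv
    if not isinstance(text, str):
        return ''
    text = text.lower()
    return re.sub(r'[^a-z0-9\s]', ' ', text)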
import re
import pandas as pd


def convert_item_name(file):
    """Zero-pad single-digit item codes in place, e.g. 'A1' -> 'A01' or 'C3b' -> 'C03b'."""
    data = pd.read_csv(file)
    # one uppercase letter, one digit, optionally one lowercase letter, end of string
    item_name_pattern = re.compile('[A-Z][0-9][a-z]?$')
    for i, row in data.iterrows():
        if item_name_pattern.match(row['item_name']):
            print(row['item_name'])
            # insert a leading zero after the letter prefix
            new_item_name = row['item_name'][0] + '0' + row['item_name'][1:]
            data.at[i, 'item_name'] = new_item_name
    data.to_csv(file, encoding='utf-8', index=False)
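# Hypothetical usage example (file name and item codes invented for illustration):
# writes a small CSV, pads the matching codes in place, and prints the result.
if __name__ == '__main__':
    pd.DataFrame({'item_name': ['A1', 'B12', 'C3b']}).to_csv('items.csv', index=False)
    convert_item_name('items.csv')   # 'A1' -> 'A01', 'C3b' -> 'C03b'; 'B12' does not match and is left unchanged
    print(pd.read_csv('items.csv'))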
import datetime
import logging
import os
import re

import pandas as pd
from flair.models import SequenceTagger


# Method of the tagger class; relies on the helper methods convert_txtfile_to_dateframe,
# create_sentlist_from_file_batchmax and calculate_scores defined elsewhere in the class.
def predict(self, input_dir, output_dir, rw_type, input_format, chunk_len=100,
            test_scores=False, output_confidence=False, special_model_path=None):
    """
    Tags each file in the input directory (txt or tsv files) and writes the results to
    output_dir. Also adds a folder "result_stats" with runtime information to the output_dir.
    tsv files must have at least the columns 'tok' and 'sentstart'.

    :param input_dir: string value: path to input directory
    :param output_dir: string value: path to output directory
    :param rw_type: string value: direct, indirect, freeIndirect or reported
    :param input_format: string value: txt or tsv
    :param chunk_len: maximum number of tokens per chunk passed to the model
    :param test_scores: if True, calculate f1/precision/recall per file (needs a gold column named rw_type)
    :param output_confidence: if True, also write a confidence column (<rw_type>_conf)
    :param special_model_path: optional subdirectory under "models" to load the model from instead of rw_type
    :return:
    """
    # time the prediction
    start_time = datetime.datetime.now().replace(microsecond=0)
    # create a subdir for testing and overview information in the output dir
    result_subdir = "result_stats"
    if not os.path.exists(os.path.join(output_dir, result_subdir)):
        os.makedirs(os.path.join(output_dir, result_subdir))
    # load the model
    # determine the current script path
    curr_path = os.path.dirname(os.path.abspath(__file__))
    if special_model_path is None:
        model_path = os.path.join(curr_path, "models", rw_type, "final-model.pt")
    else:
        model_path = os.path.join(curr_path, "models", special_model_path, "final-model.pt")
    if not os.path.exists(model_path):
        logging.warning(
            "Predicting {} aborted. Model not found at path '{}'. Please download a model and put it into "
            "the appropriate directory. The model file must be named final-model.pt."
            .format(rw_type, model_path))
    else:
        self.logger.info("loading model {}".format(model_path))
        model = SequenceTagger.load(model_path)
        self.logger.info("model loaded")

        # if test mode, collect score data (initialize in any case)
        score_dict = {"file": [], "f1": [], "precision": [], "recall": []}
        all_predictions_df = pd.DataFrame()

        input_files = [x for x in os.listdir(input_dir)]
        for file in input_files:
            resfile_name = re.sub(r"\..+$", ".tsv", file)
            self.logger.info("predicting {}".format(file))
            # read the file and convert to dataframe
            if input_format == "txt":
                data = self.convert_txtfile_to_dateframe(os.path.join(input_dir, file))
            else:
                data = pd.read_csv(os.path.join(input_dir, file), sep="\t",
                                   quoting=3, encoding="utf-8", na_values=[])
            # check for tok column:
            if "tok" not in data.columns:
                self.logger.warning(
                    "Column 'tok' is missing in file {}. File will be skipped.".format(file))
            else:
                if "sentstart" not in data.columns:
                    self.logger.warning(
                        "Column 'sentstart' is missing in file {}. Will be added with default values (all 'no')."
                        .format(file))
                    data["sentstart"] = ["no"] * len(data)

                self.logger.debug("TEST: data head:\n {}".format(data.head(10)))

                # create sentlist (based on max chunk length)
                sent_list = self.create_sentlist_from_file_batchmax(
                    data, maxlen=chunk_len, compare_column="NaN")

                # predict
                res_dict = {"tok": [], rw_type + "_pred": [], rw_type + "_conf": []}
                for sent in sent_list:
                    model.predict(sent)
                    pred_conf_list = [x["labels"] for x in sent.to_dict(tag_type="cat")["entities"]]
                    pred_list = [x[0].to_dict()["value"] for x in pred_conf_list]
                    conf_list = [x[0].to_dict()["confidence"] for x in pred_conf_list]
                    res_dict["tok"].extend([x["text"] for x in sent.to_dict(tag_type="cat")["entities"]])
                    res_dict[rw_type + "_conf"].extend(conf_list)
                    res_dict[rw_type + "_pred"].extend(pred_list)
                pred_df = pd.DataFrame(res_dict)

                # create output
                # if there is a mismatch in file length after prediction, still save the results
                if len(data) != len(pred_df):
                    self.logger.warning(
                        "File length changed when predicting for file {} (before: {}, after: {})\n"
                        "Result file will be saved with prefix 'warn_'; additional columns are lost."
                        .format(file, len(data), len(pred_df)))
                    pred_df.to_csv(os.path.join(output_dir, "warn_" + resfile_name),
                                   index=False, sep="\t")
                # if everything is okay, add the new column(s) to the original data and save
                else:
                    if output_confidence:
                        data[rw_type + "_conf"] = pred_df[rw_type + "_conf"]
                    data[rw_type + "_pred"] = pred_df[rw_type + "_pred"]
                    data.to_csv(os.path.join(output_dir, resfile_name),
                                index=False, sep="\t", encoding="utf-8")
                    # calculate the test scores:
                    if test_scores:
                        self.logger.info("Calculate scores for {}".format(file))
                        if rw_type in data.columns and rw_type + "_pred" in data.columns:
                            data, f1, prec, rec = self.calculate_scores(data, rw_type)
                            score_dict["file"].append(file)
                            score_dict["f1"].append(f1)
                            score_dict["precision"].append(prec)
                            score_dict["recall"].append(rec)
                            all_predictions_df = all_predictions_df.append(data)
                        else:
                            self.logger.warning(
                                "Skipping test scores for file {}: Missing column {} and/or {}"
                                .format(file, rw_type, rw_type + "_pred"))

        end_time = datetime.datetime.now().replace(microsecond=0)
        # write an overview file when the process is finished
        res_text = "RW Tagger (predict): Model {}\n" \
                   "Predict time:\nstart: {}\nend: {}\ntotal: {}" \
            .format(model_path, start_time, end_time, end_time - start_time)

        # if in test mode, calculate the final scores (for all the data) and save the test score df
        if test_scores:
            self.logger.info("Calculate total scores")
            if len(all_predictions_df) > 0:
                self.logger.debug("all_predictions_len: {}".format(len(all_predictions_df)))
                all_predictions_df, f1, prec, rec = self.calculate_scores(all_predictions_df, rw_type)
                score_dict["file"].append("total")
                score_dict["f1"].append(f1)
                score_dict["precision"].append(prec)
                score_dict["recall"].append(rec)
                score_df = pd.DataFrame(score_dict)
                score_df.to_csv(os.path.join(output_dir, result_subdir, rw_type + "_test_scores.tsv"),
                                index=False, sep="\t", encoding="utf-8")
                res_text += "\nTotal test scores (for detailed scores see {}_test_scores.tsv):\n" \
                            "f1: {}, precision: {}, recall: {}".format(rw_type, f1, prec, rec)
                self.logger.info(
                    "Total scores for {}: f1: {}, precision: {}, recall: {}"
                    .format(rw_type, f1, prec, rec))

        with open(os.path.join(output_dir, result_subdir, rw_type + "_overview.txt"),
                  "w", encoding="utf-8") as f:
            f.write(res_text)
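# Hypothetical usage sketch (the class name, directories and label type below are
# assumptions for illustration, not taken from the source):
#
#   tagger = RWTagger()                      # instance of the class this method belongs to
#   tagger.predict(input_dir="input_tsv",    # directory with *.tsv files containing 'tok'/'sentstart'
#                  output_dir="output",
#                  rw_type="direct",         # one of: direct, indirect, freeIndirect, reported
#                  input_format="tsv",
#                  output_confidence=True)
#
# The tagged files would be written to output/, plus output/result_stats/direct_overview.txt.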
import re
from glob import glob

import pandas as pd
from nltk.util import ngrams
from nltk.corpus import PlaintextCorpusReader, stopwords

stops = stopwords.words('english') + ['thou', 'thy']

# get the list of directories
dirs = glob('*/')
punctuation = re.compile(r'[\W]')
punct = [',', '.', '&', '{', '}', '?', "'", '-', ';', ':', '|', '(', ')', '[', ']']

# for every directory in the list, build a temporary corpus and compute the relative
# frequency of selected bigrams and spelling variants
# (note: `cd` is a directory-changing context manager defined elsewhere; see the sketch below)
names = dict()
for d in dirs:
    name = d[:-1]
    with cd(d):
        corpus = PlaintextCorpusReader('./', '.*')
        tokens = [token for token in corpus.words() if not punctuation.match(token)]
        # relative frequency of a few reflexive-pronoun bigrams, for common phrasings
        grams = list(ngrams(tokens, 2))
        n = len(grams)
        gramscount = [(j, grams.count(j) / n)
                      for j in [('your', 'self'), ('our', 'selfe'), ('him', 'self')]]
        # make a pseudo-dictionary for the single words (spelling variants)
        stuff = corpus.words()
        morphemes = [l for l in stuff if l not in stops and l not in punct]
        n = len(morphemes)
        series = [(l, morphemes.count(l) / n)
                  for l in ['have', 'haue', 'good', 'goode', 'never', 'neuer', 'come', 'cum',
                            'unto', 'vnto', 'which', 'whych', 'kyng', 'king', 'kynge', 'verbe',
                            'verb', 'whiche', 'hath', 'has', 'lorde', 'lord', 'yourself',
                            'ourself', 'himself']]
        series = dict(series + gramscount)
    names[name] = series

data = pd.DataFrame(names)
data.to_csv(path_or_buf='spellingData.csv')
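# `cd` is assumed to be a small directory-changing context manager defined elsewhere in
# the original project. A minimal sketch of such a helper (an assumption, not the original code):
import os
from contextlib import contextmanager


@contextmanager
def cd(path):
    # switch into `path`, and always return to the previous directory afterwards
    prev = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(prev)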
import nltk
import pandas as pd

# expects `data` to already hold a DataFrame with the columns
# 'sentence', 'term1', 'term2' and 'relation' (read elsewhere in the script)
l = []
tags = []
# the punkt sentence tokenizer only needs to be loaded once
custom_sentence_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")

# build one row per token: sentence id, word, POS tag and the label returned by tag_set
# (tag_set is a project-specific helper that appears to return one label per word token)
for i in range(len(data['sentence'])):
    text = data['sentence'][i]
    term1 = data['term1'][i]
    term2 = data['term2'][i]
    relation = data['relation'][i]
    tags = tag_set(text, term1, term2, relation)
    print(i)
    tokenized = custom_sentence_tokenizer.tokenize(text)
    k = 0
    word = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(word)
    for j in tagged:
        w, pos = j
        s = 'Sentence: ' + str(i)
        l.append([s, w, pos, tags[k]])
        k += 1

columns = ['Sentence #', 'Word', 'POS', 'Tag']
data = pd.DataFrame(l, columns=columns)
data.to_csv('crf_dataset.csv', encoding='UTF-8', index=False)
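# `tag_set` is not defined in this snippet. A hypothetical sketch of a compatible helper
# (an assumption, not the original): it returns one label per nltk.word_tokenize token,
# marking tokens that belong to term1/term2 with the relation and everything else with 'O'.
def tag_set(text, term1, term2, relation):
    term_tokens = set(nltk.word_tokenize(str(term1))) | set(nltk.word_tokenize(str(term2)))
    return [relation if tok in term_tokens else 'O'
            for tok in nltk.word_tokenize(text)]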
    # tail of a text-preprocessing helper: keep long, non-stopword tokens and lemmatize them
    # (the function header, en_stop and get_lemma are defined outside this excerpt)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

# ..................................................................................................................
# Starts from here
import datetime as dt
import os

import numpy as np
import pandas as pd

data = pd.read_csv("path to dataset")
datas = data.values
aa = np.array(datas)
data = pd.DataFrame(aa)
# column 1 holds the dates; parse them and sort the data w.r.t. date
data[1] = pd.to_datetime(data[1])
data = data.sort_values(1)
data.to_csv('new_file.csv', index=False)

f = open('new_file.csv')
path = os.path.realpath(f.name)
quar = pd.read_csv(path)
quar1 = quar.values
qq = np.array(quar1)
quar = pd.DataFrame(qq)

# find the quarter of each date in the date field
q = []
for p in quar[1]:
    l = p.split('-')
    a = int(l[0])
    b = int(l[1])
    c = int(l[2])
    q.append(pd.Timestamp(dt.date(a, b, c)).quarter)
quar['5'] = q
quar.to_csv('new_file1.csv', index=False)
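# A more concise alternative to the quarter loop above (a sketch, not the original approach):
# pandas can parse the date column and expose the quarter directly via the .dt accessor.
quar['5'] = pd.to_datetime(quar[1]).dt.quarter
quar.to_csv('new_file1.csv', index=False)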