def make_references(data):
    """Build a list of ``Reference`` objects from an iterable of records.

    For every record, the values stored under the ``'reference'`` and
    ``'text'`` keys are fetched with ``.get`` (so a missing key yields
    ``None``), the text is passed through ``clean_text``, and the pair is
    wrapped in a ``Reference``.

    Args:
        data: Iterable of objects exposing ``.get(key)``.

    Returns:
        A new list with one ``Reference`` per record, in input order.
    """
    return [
        Reference(entry.get('reference'), clean_text(entry.get('text')))
        for entry in data
    ]
示例#2
0
def carga_textos(folder,termina):
    """Load and clean every file in *folder* whose name ends with *termina*.

    Each matching file is read with ``lector.leer_archivo`` and the result is
    passed through ``clean.clean_text``. Files are processed in the order
    returned by ``os.listdir``.

    Args:
        folder: Directory to scan.
        termina: Suffix handed to ``str.endswith`` to select file names.

    Returns:
        A list with the cleaned content of each matching file. If an
        ``IOError`` is raised while listing or reading, the error is printed
        and an empty list is returned (partial results are discarded).
    """
    try:
        lista_textos = []  # cleaned contents, one entry per file
        lista_archivos = os.listdir(folder)
        lista_txt = [archivo for archivo in lista_archivos if archivo.endswith(termina)]
        # Read and clean each selected file.
        for archivo in lista_txt:
            texto = lector.leer_archivo(os.path.join(folder, archivo))
            lista_textos.append(clean.clean_text(texto))
    except IOError as e:
        # Previously misspelled as ``pirnt``, which raised NameError here and
        # defeated the fallback to an empty list.
        print(e)
        lista_textos = []
    return lista_textos
 def get_nlp(self, text, display = False):
     """Clean *text* and run it through ``self.nlp``.

     Args:
         text: Raw input, cleaned with ``clean.clean_text`` before parsing.
         display: When truthy, ``displacy.serve(doc, style="dep")`` is called
             on the parsed document before it is returned.

     Returns:
         Whatever ``self.nlp`` returns for the cleaned text.
     """
     parsed = self.nlp(clean.clean_text(text))
     if not display:
         return parsed
     displacy.serve(parsed, style="dep")
     return parsed
示例#4
0
 def _clean_(self, text):
     """Return *text* after passing it through the module-level ``clean_text``."""
     cleaned = clean_text(text)
     return cleaned
示例#5
0

def read_datapoints(FILE_PATH: str) -> List[Dict]:
    """Read a header-less, tab-separated file into a list of row dicts.

    Every line is parsed with ``csv.DictReader`` using the fixed column names
    ``'target'`` and ``'sms'``; because ``fieldnames`` is supplied, the first
    line is treated as data, not as a header.

    Args:
        FILE_PATH: Path of the tab-separated file to read.

    Returns:
        One dict per row, keyed by ``'target'`` and ``'sms'``.
    """
    # newline="" is what the csv module asks for: it lets the reader do its
    # own line handling so "\r\n" inside a quoted field is preserved instead
    # of being rewritten by universal-newline translation.
    with open(FILE_PATH, newline="") as f:
        reader = csv.DictReader(f,
                                delimiter='\t',
                                fieldnames=['target', 'sms'])
        return list(reader)


if __name__ == "__main__":
    # Script entry point: load the raw rows, clean them, assign stratified
    # fold ids and write the result as a tab-separated file.
    args = read_args()
    data = read_datapoints(args.data_path)
    df = pd.DataFrame(data)

    # Clean every message, then record the length of the *cleaned* text.
    df["sms"] = df["sms"].apply(clean_text)
    df["sms_length"] = df["sms"].apply(len)
    # Labels become booleans: 'ham' -> True, 'spam' -> False.
    df["target"] = df["target"].replace({"ham": True, "spam": False})

    # -1 marks "no fold yet"; rows are shuffled (unseeded) and re-indexed so
    # the positional indices from the splitter line up with the row labels.
    df["kfold"] = -1
    df = df.sample(frac=1).reset_index(drop=True)

    splitter = StratifiedKFold(n_splits=5)
    fold_splits = splitter.split(X=df, y=df.target)
    for fold_number, (_train_idx, valid_idx) in enumerate(fold_splits):
        # Each row is tagged with the fold in which it is a validation row.
        df.loc[valid_idx, "kfold"] = fold_number

    output_path = os.path.join(args.output_dir, "cleaned_data.csv")
    df.to_csv(output_path, index=False, sep="\t")
示例#6
0
import clean

# Cleaned articles collected so far during this session.
articles = []

def give_prompt():
    """Print a separator, the number of stored articles, and usage instructions.

    The instructions name the two commands the input loop recognises:
    'end' to finish the current article and 'save' to write everything out
    and quit. (The message used to advertise 'stop', which the loop does not
    handle, so typing it only added the word to the article text.)
    """
    print("----------")
    print("Currently stored: %s\nPlease enter each article below, followed by 'end'. Enter 'save' when finished.\n" % len(articles))

# Lines typed since the last 'end' command; together they form one article.
recent_inputs = []

give_prompt()
# Interactive loop. Each typed line is either a command ('save' or 'end',
# case-insensitive) or a line of the article currently being entered.
while True:
    text_input = input("> ") + "\n"
    command = text_input.lower()
    if command == "save\n":
        # Write every stored article to output.txt and leave the loop.
        # Lines typed after the last 'end' are not part of any stored
        # article and are therefore not written.
        total = len(articles)
        with open("output.txt", "w") as outfile:
            # Number articles from 1 so the header reads "ARTICLE 1 of N"
            # through "ARTICLE N of N" rather than starting at 0.
            for number, stored_article in enumerate(articles, start=1):
                outfile.write("ARTICLE %s of %s -- Ordering: ___ / %s\n\n" % (number, total, total))
                outfile.write(stored_article)
                outfile.write("\n----------\n\n")
        break
    if command == "end\n":
        # Close the current article: join its lines, clean, echo and store it.
        article = "".join(recent_inputs)
        cleaned = clean.clean_text(article)
        print(cleaned)
        articles.append(cleaned)
        recent_inputs = []
        give_prompt()
    else:
        recent_inputs.append(text_input)
示例#7
0
def normalize(file_text):
    """Run *file_text* through the normalisation pipeline and return the result.

    The steps are applied in this order: ``clean_html_tags``, ``clean_text``,
    then ``neologdn.normalize``.
    """
    # NOTE: a normalize_number step used to sit between clean_text and
    # neologdn.normalize; it is currently disabled.
    without_tags = clean_html_tags(file_text)
    cleaned = clean_text(without_tags)
    return neologdn.normalize(cleaned)