Example #1
# Imports assumed for this excerpt (the aliases suggest these origins);
# TextProcessor is a project-local helper imported elsewhere.
from csv import reader as csv_reader
from json import dump as json_dump, load as json_load
from os.path import abspath, isfile

from jsonpickle import decode as jp_decode, encode as jp_encode
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm


class InvertedIndex:
    def __init__(self, fpath, dump_fpath):
        self.text_processor = TextProcessor()

        self.queries = []
        self.documents = []
        self.original_documents = []
        self.is_duplicates = []

        self.vectorizer = CountVectorizer()
        self.build_index(fpath)

        self.dump(dump_fpath)

    def build_index(self, fpath):
        with open(abspath(fpath), 'r', encoding='utf-8') as file:
            table = csv_reader(file)

            for row in tqdm(list(table)):
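                # Assumed CSV layout (inferred from how the rows are used below):
                # row[0] = id, row[1] = query, row[2] = document text, row[3] = duplicate flag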
                if row[0] == '':
                    continue
                # TODO: build the full index; for now stop after the first 10 000 rows
                if row[0] == '10000':
                    break

                self.queries.append(self.text_processor.process(row[1]))
                self.documents.append(self.text_processor.process(row[2]))

                self.original_documents.append(row[2])
                self.is_duplicates.append(row[3])

        # fit the vectorizer vocabulary on the processed queries
        # (the transformed matrix itself is not kept)
        self.vectorizer.fit_transform(self.queries)

    def dump(self, dump_fpath):
        json_encoded = jp_encode(self)

        with open(dump_fpath, 'w', encoding='utf-8') as file:
            json_dump(json_encoded, file, ensure_ascii=False, indent=4)

    @staticmethod
    def restore(dump_fpath):
        with open(dump_fpath, "r", encoding='utf-8') as file:
            idx_dump = json_load(file)
        return jp_decode(idx_dump)

    @staticmethod
    def from_dump_or_build(dump_fpath, corpora_fpath):
        if isfile(dump_fpath):
            try:
                return InvertedIndex.restore(dump_fpath)
            except Exception:
                return InvertedIndex(corpora_fpath, dump_fpath)
        else:
            return InvertedIndex(corpora_fpath, dump_fpath)
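
A minimal usage sketch for the class above (not from the original project; the file names are hypothetical placeholders):

index = InvertedIndex.from_dump_or_build('index_dump.json', 'corpus.csv')
print(len(index.documents), 'documents indexed')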
Example #2
 def process_files(self):
     files = listdir(self.path)
     files_dic = {}
     for file in files:
         # process the file based on its extension
         file_ext = file.split('.')[-1]
         if file_ext == 'txt':
             files_dic[file] = self.process_txt(self.path + file)
         elif file_ext == 'pdf':
             files_dic[file] = self.process_pdf(self.path + file)
         elif file_ext == 'html':
             files_dic[file] = self.process_html(self.path + file)
     tp = TextProcessor()
     for file, text in files_dic.items():
         # call the text_processor module
         text_proc_result = tp.process(
             JSONEncoder().encode({'action': 'process', 'data': text}))
         text_proc_result = JSONDecoder().decode(text_proc_result)['terms']
         files_dic[file] = text_proc_result
     return files_dic
Example #3
 def form_valid(self, form):
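     # Django form-view hook (assumed context): store the processed version
     # of the submitted text on the model instance before saving.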
     text_processor = TextProcessor()
     form.instance.processed_text = text_processor.process(
         form.cleaned_data['origin_text'])
     return super().form_valid(form)
Example #4
# Imports assumed for this excerpt; TextProcessor, TextDataset, TextCNN,
# Sequencer and the EMBEDDINGS_DIR / CORPUS_DIR / DATA_SPLIT / SEQUENCE_LEN
# constants are project-local and defined outside this snippet.
import glob
import os
import pickle

import torch
from torch import nn
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from tqdm import tqdm


def main():
    device = torch.device('cuda')

    embedding_vectors = torch.load(f'{EMBEDDINGS_DIR}/vectors.pkl')

    text_processor = TextProcessor(
        wti=pickle.load(open(f'{EMBEDDINGS_DIR}/wti.pkl', 'rb')),
        tokenizer=get_tokenizer('basic_english'),
        standardize=True,
        min_len=3,
    )

    dataset = TextDataset(CORPUS_DIR, text_processor)

    # split into training and test set; give the remainder to the test set
    # so the two lengths always sum to len(dataset)
    n_train = int(len(dataset) * DATA_SPLIT)
    train_set, test_set = torch.utils.data.random_split(
        dataset, [n_train, len(dataset) - n_train])

    # count number of samples in each class
    class_count = [0, 0]
    for data, label in dataset:
        class_count[int(label.item())] += 1

    # convert the counts to relative frequencies
    _sum = sum(class_count)
    class_count[0] /= _sum
    class_count[1] /= _sum

    # swap the two frequencies so the rarer class gets the larger sampling weight
    class_count = list(reversed(class_count))

    # set weight for every sample
    weights = [class_count[int(x[1].item())] for x in train_set]

    # weighted sampler
    sampler = torch.utils.data.WeightedRandomSampler(
        weights=weights, num_samples=len(train_set), replacement=True)

    train_loader = DataLoader(dataset=train_set,
                              batch_size=32,
                              collate_fn=Sequencer(SEQUENCE_LEN),
                              sampler=sampler)

    test_loader = DataLoader(dataset=test_set,
                             batch_size=32,
                             collate_fn=Sequencer(SEQUENCE_LEN))

    # number of filters in each convolutional layer
    N_FILTERS = 64

    # kernel sizes of the convolutional layers (one layer per size)
    FILTER_SIZES = [2, 3]

    # dropout applied between the convolutional and dense layers
    DROPOUT = 0.5

    model = TextCNN(
        embeddings=embedding_vectors,
        n_filters=N_FILTERS,
        filter_sizes=FILTER_SIZES,
        dropout=DROPOUT,
    ).to(device)

    print(model)
    print('Trainable params:',
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    EPOCHS = 12

    best_acc = 0.0

    # training loop
    for epoch in range(EPOCHS):
        print('Epoch', epoch + 1)

        for i, data in tqdm(enumerate(train_loader), total=len(train_loader)):
            # get word indices vector and corresponding labels
            x, labels = data

            # send to device
            x = x.to(device)
            labels = labels.to(device)

            # make predictions
            predictions = model(x).squeeze()

            # calculate loss
            loss = criterion(predictions, labels)

            # backpropagate and update the weights
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # evaluate
        with torch.no_grad():
            model.eval()

            correct = 0
            wrong = 0
            m = [[0, 0], [0, 0]]

            for data in test_loader:
                x, label = data
                x = x.to(device)

                predictions = model(x).squeeze()

                for truth, prediction in zip(label, predictions):
                    y = int(truth.item())
                    y_pred = 1 if prediction.item() > 0.5 else 0

                    m[y][y_pred] += 1

                    if y == y_pred:
                        correct += 1
                    else:
                        wrong += 1

            model.train()

            acc = correct / (correct + wrong)
            if acc > best_acc:
                best_acc = acc
                # keep only the best checkpoint; the glob pattern matches the
                # file name used by torch.save below
                for file in glob.glob('models/state_*.pth'):
                    os.remove(file)
                torch.save(model.state_dict(), f'models/state_{epoch}.pth')

            print()
            print('Correct:', f'{correct}/{correct + wrong}', 'Accuracy:', acc)
            print('[[TN, FP], [FN, TP]]')
            print(m)
            print()

    # put into evaluation mode
    model.eval()

    text_processor.do_standardize = True

    with torch.no_grad():
        while True:
            text = input('Prompt: ')
            x = text_processor.process(text)
            x = torch.tensor(x).unsqueeze(dim=0)
            print(model(x.to(device)).squeeze())
Example #5
def process_file(file_id):
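    # Pipeline (as implemented below): load the File record, extract its text
    # according to the input type, run it through TextProcessor, store the
    # result, and bump `progress` as each stage completes.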
    file = File.objects.get(pk=file_id)
    print(file)
    try:
        origin_path = file.origin_file.path
    except ValueError:
        origin_path = None
    file.input_type = get_input_type(file)
    print(file.input_type)
    file.progress += 10
    file.save()

    sleep(.5)
    file.progress += 10
    file.save()

    document = None
    if file.input_type == File.InputTypes.IMAGE:
        document = Document()
        text = image_to_text(origin_path)
    elif file.input_type == File.InputTypes.TEXTBOX:
        text = file.origin_text
    else:
        document = Document(origin_path, text_params)
        text = document.parse()
    # file.progress += 50
    file.progress += 20
    file.save()

    sleep(.5)
    file.progress += 10
    file.save()
    sleep(.5)
    file.progress += 10
    file.save()

    text_processor = TextProcessor()
    processed_text = text_processor.process(text)
    # file.progress += 30
    file.progress += 20
    file.save()

    sleep(.5)
    file.progress += 10
    file.save()

    if file.input_type == File.InputTypes.TEXTBOX:
        file.processed_text = processed_text

    else:
        if document is None:
            raise ValueError('Error with document')
        output_name = get_output_field(file)
        document.change_text(processed_text)
        document.save(file.processed_file.storage.path(output_name))
        file.processed_file = output_name
    sleep(.5)
    file.progress = 100
    file.save()
    print(file)
Example #6
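# Flask view (assumed context): `request` and `jsonify` would be imported from
# flask, and the function registered with a route decorator not shown here.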
def process_words():
    text_processor = TextProcessor()
    processed_text = text_processor.process(request.args.get('text'))
    return jsonify(response=processed_text)